From 3798bbae98cb82e13df18ddf095488b98afe0ddd Mon Sep 17 00:00:00 2001
From: Jonathan Carter
Date: Tue, 21 Nov 2023 17:32:00 +0200
Subject: [PATCH] New upstream release

---
 INSTALL.md | 2 +-
 Makefile | 28 +-
 Makefile.compiler | 8 +-
 bcachefs.c | 33 +-
 build.nix | 34 +-
 cmd_data.c | 2 +-
 cmd_device.c | 33 +-
 cmd_dump.c | 33 +-
 cmd_format.c | 31 +-
 cmd_fusemount.c | 199 +-
 cmd_key.c | 5 +-
 cmd_kill_btree_node.c | 13 +-
 cmd_list_journal.c | 2 +-
 cmd_migrate.c | 25 +-
 cmd_version.c | 2 +-
 cmds.h | 10 +-
 crypto.c | 4 +-
 debian/changelog | 6 +
 default.nix | 16 +-
 flake.lock | 18 +-
 flake.nix | 25 +-
 fsck.bcachefs | 4 -
 include/linux/atomic.h | 12 +
 include/linux/bit_spinlock.h | 62 +-
 include/linux/blkdev.h | 49 +-
 include/linux/closure.h | 12 +-
 include/linux/compiler.h | 5 +
 include/linux/generic-radix-tree.h | 4 +-
 include/linux/kernel.h | 3 +
 include/linux/list.h | 11 +
 include/linux/page.h | 3 +
 include/linux/rcupdate.h | 2 +-
 include/linux/sched.h | 15 +-
 include/linux/six.h | 388 --
 include/linux/slab.h | 3 +-
 libbcachefs.c | 101 +-
 libbcachefs.h | 6 +-
 libbcachefs/acl.c | 103 +-
 libbcachefs/acl.h | 6 +-
 libbcachefs/alloc_background.c | 445 ++-
 libbcachefs/alloc_background.h | 12 +-
 libbcachefs/alloc_foreground.c | 114 +-
 libbcachefs/alloc_foreground.h | 2 +-
 libbcachefs/alloc_types.h | 4 +-
 libbcachefs/backpointers.c | 271 +-
 libbcachefs/backpointers.h | 11 +-
 libbcachefs/bbpos.h | 17 +-
 libbcachefs/bcachefs.h | 72 +-
 libbcachefs/bcachefs_format.h | 285 +-
 libbcachefs/bkey.c | 187 +-
 libbcachefs/bkey.h | 38 +-
 libbcachefs/bkey_methods.c | 186 +-
 libbcachefs/bkey_methods.h | 23 +-
 libbcachefs/bkey_sort.c | 6 +-
 libbcachefs/bkey_sort.h | 16 +-
 libbcachefs/bset.c | 27 +-
 libbcachefs/btree_cache.c | 75 +-
 libbcachefs/btree_cache.h | 5 +-
 libbcachefs/btree_gc.c | 304 +-
 libbcachefs/btree_gc.h | 1 +
 libbcachefs/btree_io.c | 371 +-
 libbcachefs/btree_io.h | 6 +-
 libbcachefs/btree_iter.c | 329 +-
 libbcachefs/btree_iter.h | 117 +-
 libbcachefs/btree_key_cache.c | 79 +-
 libbcachefs/btree_locking.c | 46 +-
 libbcachefs/btree_locking.h | 23 +-
 libbcachefs/btree_types.h | 120 +-
 libbcachefs/btree_update.h | 86 +-
 libbcachefs/btree_update_interior.c | 215 +-
 libbcachefs/btree_update_interior.h | 7 +-
 libbcachefs/btree_update_leaf.c | 2107 -----------
 libbcachefs/btree_write_buffer.c | 63 +-
 libbcachefs/buckets.c | 202 +-
 libbcachefs/buckets.h | 100 +-
 libbcachefs/buckets_waiting_for_journal.c | 2 +-
 libbcachefs/chardev.c | 59 +-
 libbcachefs/chardev.h | 2 +-
 libbcachefs/checksum.c | 151 +-
 libbcachefs/checksum.h | 10 +-
 libbcachefs/compress.c | 37 +-
 libbcachefs/compress.h | 36 +-
 libbcachefs/counters.c | 8 +-
 libbcachefs/darray.h | 34 +-
 libbcachefs/data_update.c | 75 +-
 libbcachefs/data_update.h | 3 +-
 libbcachefs/debug.c | 65 +-
 libbcachefs/dirent.c | 161 +-
 libbcachefs/dirent.h | 7 +-
 libbcachefs/disk_groups.c | 195 +-
 libbcachefs/disk_groups.h | 7 +-
 libbcachefs/ec.c | 129 +-
 libbcachefs/ec.h | 6 +-
 libbcachefs/errcode.c | 9 +-
 libbcachefs/errcode.h | 24 +-
 libbcachefs/error.c | 33 +-
 libbcachefs/error.h | 90 +-
 libbcachefs/extents.c | 438 ++-
 libbcachefs/extents.h | 66 +-
 libbcachefs/fs-common.c | 2 +-
 libbcachefs/fs-io.c | 3221 +----------------
 libbcachefs/fs-io.h | 170 +-
 libbcachefs/fs-ioctl.c | 64 +-
 libbcachefs/fs-ioctl.h | 34 +-
 libbcachefs/fs.c | 266 +-
 libbcachefs/fs.h | 5 +-
 libbcachefs/fsck.c | 596 +--
 libbcachefs/fsck.h | 1 +
 libbcachefs/inode.c | 377 +-
 libbcachefs/inode.h | 31 +-
 libbcachefs/io.c | 3059 ----------------
 libbcachefs/io.h | 202 --
 libbcachefs/io_types.h | 165 -
 libbcachefs/journal.c | 131 +-
 libbcachefs/journal.h | 133 +-
 libbcachefs/journal_io.c | 532 +--
 libbcachefs/journal_io.h | 3 +-
 libbcachefs/journal_reclaim.c | 139 +-
 libbcachefs/journal_reclaim.h | 18 +-
 libbcachefs/journal_sb.c | 22 +-
 libbcachefs/journal_seq_blacklist.c | 28 +-
 libbcachefs/journal_types.h | 33 +-
 libbcachefs/lru.c | 24 +-
 libbcachefs/lru.h | 2 +-
 libbcachefs/migrate.c | 31 +-
 libbcachefs/move.c | 456 +--
 libbcachefs/move.h | 65 +-
 libbcachefs/move_types.h | 8 +-
 libbcachefs/movinggc.c | 98 +-
 libbcachefs/nocow_locking.c | 37 +-
 libbcachefs/nocow_locking.h | 1 +
 libbcachefs/opts.c | 13 +-
 libbcachefs/opts.h | 5 +-
 libbcachefs/printbuf.c | 70 +-
 libbcachefs/quota.c | 42 +-
 libbcachefs/quota.h | 2 +-
 libbcachefs/rebalance.c | 558 +--
 libbcachefs/rebalance.h | 9 +-
 libbcachefs/rebalance_types.h | 31 +-
 libbcachefs/recovery.c | 976 +----
 libbcachefs/recovery.h | 65 +-
 libbcachefs/recovery_types.h | 7 +-
 libbcachefs/reflink.c | 105 +-
 libbcachefs/reflink.h | 6 +-
 libbcachefs/replicas.c | 39 +-
 libbcachefs/str_hash.h | 25 +-
 libbcachefs/subvolume.c | 1395 +------
 libbcachefs/subvolume.h | 227 +-
 libbcachefs/subvolume_types.h | 2 +-
 libbcachefs/super-io.c | 569 +--
 libbcachefs/super-io.h | 78 +-
 libbcachefs/super.c | 313 +-
 libbcachefs/super.h | 214 --
 libbcachefs/super_types.h | 15 +-
 libbcachefs/sysfs.c | 96 +-
 libbcachefs/tests.c | 361 +-
 libbcachefs/trace.c | 3 +-
 libbcachefs/trace.h | 161 +-
 libbcachefs/util.c | 163 +-
 libbcachefs/util.h | 93 +-
 libbcachefs/varint.c | 25 +-
 libbcachefs/vstructs.h | 6 +-
 libbcachefs/xattr.c | 110 +-
 libbcachefs/xattr.h | 2 +-
 linux/blkdev.c | 25 +-
 linux/closure.c | 9 +-
 linux/shrinker.c | 16 +-
 linux/six.c | 893 -----
 mkfs.bcachefs | 4 -
 mount.bcachefs | 4 -
 packaging/bcachefs-tools.spec | 5 +-
 qcow2.c | 2 +-
 rust-src/Cargo.lock | 611 ++--
 rust-src/Cargo.toml | 4 +-
 rust-src/bch_bindgen/Cargo.lock | 505 +--
 rust-src/bch_bindgen/Cargo.toml | 4 +-
 rust-src/bch_bindgen/build.rs | 1 +
 rust-src/bch_bindgen/src/bcachefs.rs | 2 +-
 rust-src/bch_bindgen/src/bkey.rs | 4 +
 rust-src/bch_bindgen/src/btree.rs | 15 +-
 rust-src/bch_bindgen/src/lib.rs | 4 +-
 .../bch_bindgen/src/libbcachefs_wrapper.h | 8 +-
 rust-src/src/cmd_list.rs | 22 +-
 rust-src/src/cmd_mount.rs | 59 +-
 rust-src/src/key.rs | 55 +-
 rust-src/src/lib.rs | 29 +
 tools-util.c | 33 +-
 tools-util.h | 7 +-
 188 files changed, 7851 insertions(+), 19147 deletions(-)
 delete mode 100755 fsck.bcachefs
 delete mode 100644 include/linux/six.h
 delete mode 100644 libbcachefs/btree_update_leaf.c
 delete mode 100644 libbcachefs/io.c
 delete mode 100644 libbcachefs/io.h
 delete mode 100644 libbcachefs/io_types.h
 delete mode 100644 linux/six.c
 delete mode 100755 mkfs.bcachefs
 delete mode 100755 mount.bcachefs

diff --git a/INSTALL.md b/INSTALL.md
index 94b2877..370fb8d 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -16,7 +16,7 @@ Build dependencies:
  * zlib1g
 
 In addition a recent Rust toolchain is required (rustc, cargo), either by using
-[rustup](https://rustup.rs/) or make sure to use a distribution where rustc (>=1.64)
+[rustup](https://rustup.rs/) or make sure to use a distribution where rustc (>=1.65)
 is available.
Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with diff --git a/Makefile b/Makefile index c77c0c5..61a6245 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ +VERSION=1.3.3 + PREFIX?=/usr/local PKG_CONFIG?=pkg-config INSTALL=install +LN=ln ifeq ("$(origin V)", "command line") BUILD_VERBOSE = $(V) @@ -34,14 +37,17 @@ CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \ $(EXTRA_CFLAGS) LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS) -CARGO_ARGS= +ifdef CARGO_TOOLCHAIN_VERSION + CARGO_TOOLCHAIN = +$(CARGO_TOOLCHAIN_VERSION) +endif + +CARGO_ARGS=${CARGO_TOOLCHAIN} CARGO=cargo $(CARGO_ARGS) CARGO_PROFILE=release # CARGO_PROFILE=debug CARGO_BUILD_ARGS=--$(CARGO_PROFILE) CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS) -VERSION?=$(shell git describe --dirty=+ 2>/dev/null || echo v0.1-nogit) include Makefile.compiler @@ -148,12 +154,15 @@ install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs install: bcachefs $(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR) - $(INSTALL) -m0755 fsck.bcachefs $(DESTDIR)$(ROOT_SBINDIR) - $(INSTALL) -m0755 mkfs.bcachefs $(DESTDIR)$(ROOT_SBINDIR) - $(INSTALL) -m0755 mount.bcachefs $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/ $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT) $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK) + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK) echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK) @@ -178,6 +187,11 @@ bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex doc: bcachefs-principles-of-operation.pdf +.PHONY: cargo-update-msrv +cargo-update-msrv: + cargo +nightly generate-lockfile --manifest-path rust-src/Cargo.toml -Zmsrv-policy + cargo +nightly generate-lockfile --manifest-path rust-src/bch_bindgen/Cargo.toml -Zmsrv-policy + .PHONY: update-bcachefs-sources update-bcachefs-sources: git rm -rf --ignore-unmatch libbcachefs @@ -192,10 +206,6 @@ update-bcachefs-sources: git add include/linux/xxhash.h cp $(LINUX_DIR)/lib/xxhash.c linux/ git add linux/xxhash.c - cp $(LINUX_DIR)/kernel/locking/six.c linux/ - git add linux/six.c - cp $(LINUX_DIR)/include/linux/six.h include/linux/ - git add include/linux/six.h cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/ git add include/linux/list_nulls.h cp $(LINUX_DIR)/include/linux/poison.h include/linux/ diff --git a/Makefile.compiler b/Makefile.compiler index 7aa1fbc..8fcb427 100644 --- a/Makefile.compiler +++ b/Makefile.compiler @@ -32,13 +32,13 @@ try-run = $(shell set -e; \ # Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,) as-option = $(call try-run,\ - $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2)) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2)) # as-instr # Usage: 
aflags-y += $(call as-instr,instr,option1,option2) as-instr = $(call try-run,\ - printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) + printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) # __cc-option # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) @@ -72,7 +72,3 @@ clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1) # ld-option # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y) ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3)) - -# ld-ifversion -# Usage: $(call ld-ifversion, -ge, 22252, y) -ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4)) diff --git a/bcachefs.c b/bcachefs.c index a3fe6d8..4efe29e 100644 --- a/bcachefs.c +++ b/bcachefs.c @@ -84,6 +84,7 @@ static void usage(void) "\n" "Commands for operating on files in a bcachefs filesystem:\n" " setattr Set various per file attributes\n" + "\n" "Debug:\n" "These commands work on offline, unmounted filesystems\n" " dump Dump filesystem metadata to a qcow2 image\n" @@ -92,7 +93,13 @@ static void usage(void) #endif " list_journal List contents of journal\n" "\n" + "FUSE:\n" + " fusemount Mount a filesystem via FUSE\n" + "\n" "Miscellaneous:\n" +#ifndef BCACHEFS_NO_RUST + " completions Generate shell completions\n" +#endif " version Display the version of the invoked bcachefs tool\n"); } @@ -184,6 +191,24 @@ int main(int argc, char *argv[]) full_cmd = argv[0]; + /* Are we being called via a symlink? */ + + if (strstr(full_cmd, "mkfs")) + return cmd_format(argc, argv); + + if (strstr(full_cmd, "fsck")) + return cmd_fsck(argc, argv); + +#ifdef BCACHEFS_FUSE + if (strstr(full_cmd, "mount.fuse")) + return cmd_fusemount(argc, argv); +#endif + +#ifndef BCACHEFS_NO_RUST + if (strstr(full_cmd, "mount")) + return cmd_mount(argc, argv); +#endif + setvbuf(stdout, NULL, _IOLBF, 0); char *cmd = pop_cmd(&argc, argv); @@ -249,10 +274,10 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "setattr")) return cmd_setattr(argc, argv); #ifndef BCACHEFS_NO_RUST - if (!strcmp(cmd, "mount")) { - cmd_mount(argc, argv); - return 0; - } + if (!strcmp(cmd, "mount")) + return cmd_mount(argc, argv); + if (strstr(cmd, "completions")) + return cmd_completions(argc, argv); #endif #ifdef BCACHEFS_FUSE diff --git a/build.nix b/build.nix index 5cf07de..e05dad4 100644 --- a/build.nix +++ b/build.nix @@ -1,28 +1,10 @@ -{ lib -, stdenv -, pkg-config -, attr -, libuuid -, libsodium -, keyutils -, liburcu -, zlib -, libaio -, udev -, zstd -, lz4 -, nix-gitignore -, rustPlatform -, rustc -, cargo - }: - +{ lib, stdenv, pkg-config, attr, libuuid, libsodium, keyutils, liburcu, zlib +, libaio, udev, zstd, lz4, nix-gitignore, rustPlatform, rustc, cargo, }: let - src = nix-gitignore.gitignoreSource [] ./. 
; + src = nix-gitignore.gitignoreSource [ ] ./.; commit = lib.strings.substring 0 7 (builtins.readFile ./.bcachefs_revision); version = "git-${commit}"; - in stdenv.mkDerivation { inherit src version; @@ -61,12 +43,14 @@ in stdenv.mkDerivation { }; }; - makeFlags = [ - "PREFIX=${placeholder "out"}" - "VERSION=${commit}" - ]; + makeFlags = [ "DESTDIR=${placeholder "out"}" "PREFIX=" "VERSION=${commit}" ]; dontStrip = true; checkPhase = "./bcachefs version"; doCheck = true; + + meta = { + mainProgram = "bcachefs"; + license = lib.licenses.gpl2Only; + }; } diff --git a/cmd_data.c b/cmd_data.c index 160eb91..6d70988 100644 --- a/cmd_data.c +++ b/cmd_data.c @@ -103,7 +103,7 @@ int cmd_data_job(int argc, char *argv[]) switch (opt) { case 'b': op.start_btree = read_string_list_or_die(optarg, - bch2_btree_ids, "btree id"); + __bch2_btree_ids, "btree id"); op.end_btree = op.start_btree; break; case 's': diff --git a/cmd_device.c b/cmd_device.c index c59d370..1cb31ab 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -16,6 +16,7 @@ #include "libbcachefs/bcachefs_ioctl.h" #include "libbcachefs/errcode.h" #include "libbcachefs/journal.h" +#include "libbcachefs/sb-members.h" #include "libbcachefs/super-io.h" #include "cmds.h" #include "libbcachefs.h" @@ -112,7 +113,9 @@ int cmd_device_add(int argc, char *argv[]) struct bchfs_handle fs = bcache_fs_open(fs_path); - dev_opts.fd = open_for_format(dev_opts.path, force); + int ret = open_for_format(&dev_opts, force); + if (ret) + die("Error opening %s: %s", dev_opts.path, strerror(-ret)); struct bch_opt_strs fs_opt_strs; memset(&fs_opt_strs, 0, sizeof(fs_opt_strs)); @@ -129,8 +132,8 @@ int cmd_device_add(int argc, char *argv[]) format_opts, &dev_opts, 1); free(sb); - fsync(dev_opts.fd); - close(dev_opts.fd); + fsync(dev_opts.bdev->bd_buffered_fd); + close(dev_opts.bdev->bd_buffered_fd); bchu_disk_add(fs, dev_opts.path); return 0; @@ -413,7 +416,7 @@ int cmd_device_set_state(int argc, char *argv[]) if (ret) die("error opening %s: %s", dev_str, bch2_err_str(ret)); - struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx; + struct bch_member *m = bch2_members_v2_get_mut(sb.sb, sb.sb->dev_idx); SET_BCH_MEMBER_STATE(m, new_state); @@ -483,7 +486,7 @@ int cmd_device_resize(int argc, char *argv[]) char *size_arg = arg_pop(); if (!size_arg) - size = get_size(dev, dev_fd); + size = get_size(dev_fd); else if (bch2_strtoull_h(size_arg, &size)) die("invalid size"); @@ -509,16 +512,11 @@ int cmd_device_resize(int argc, char *argv[]) if (idx >= sb->nr_devices) die("error reading superblock: dev idx >= sb->nr_devices"); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - if (!mi) - die("error reading superblock: no member info"); + struct bch_member m = bch2_sb_member_get(sb, idx); - /* could also just read this out of sysfs... meh */ - struct bch_member *m = mi->members + idx; + u64 nbuckets = size / le16_to_cpu(m.bucket_size); - u64 nbuckets = size / le16_to_cpu(m->bucket_size); - - if (nbuckets < le64_to_cpu(m->nbuckets)) + if (nbuckets < le64_to_cpu(m.nbuckets)) die("Shrinking not supported yet"); printf("resizing %s to %llu buckets\n", dev, nbuckets); @@ -615,14 +613,9 @@ int cmd_device_resize_journal(int argc, char *argv[]) if (idx >= sb->nr_devices) die("error reading superblock: dev idx >= sb->nr_devices"); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - if (!mi) - die("error reading superblock: no member info"); - - /* could also just read this out of sysfs... 
meh */ - struct bch_member *m = mi->members + idx; + struct bch_member m = bch2_sb_member_get(sb, idx); - u64 nbuckets = size / le16_to_cpu(m->bucket_size); + u64 nbuckets = size / le16_to_cpu(m.bucket_size); printf("resizing journal on %s to %llu buckets\n", dev, nbuckets); bchu_disk_resize_journal(fs, idx, nbuckets); diff --git a/cmd_dump.c b/cmd_dump.c index cc25a6a..0d34923 100644 --- a/cmd_dump.c +++ b/cmd_dump.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -12,6 +13,7 @@ #include "libbcachefs/btree_iter.h" #include "libbcachefs/error.h" #include "libbcachefs/extents.h" +#include "libbcachefs/sb-members.h" #include "libbcachefs/super.h" static void dump_usage(void) @@ -21,9 +23,9 @@ static void dump_usage(void) "\n" "Options:\n" " -o output Output qcow2 image(s)\n" - " -f Force; overwrite when needed\n" - " -j Dump entire journal, not just dirty entries\n" - " -h Display this help and exit\n" + " -f, --force Force; overwrite when needed\n" + " --nojournal Don't dump entire journal, just dirty entries\n" + " -h, --help Display this help and exit\n" "Report bugs to "); } @@ -59,13 +61,11 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, for (i = 0; i < BTREE_ID_NR; i++) { const struct bch_extent_ptr *ptr; struct bkey_ptrs_c ptrs; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct btree *b; - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) { + __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ret) { struct btree_node_iter iter; struct bkey u; struct bkey_s_c k; @@ -95,8 +95,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, btree_bytes(c)); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); } qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data, @@ -106,20 +106,29 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, int cmd_dump(int argc, char *argv[]) { + static const struct option longopts[] = { + { "force", no_argument, NULL, 'f' }, + { "nojournal", no_argument, NULL, 'j' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, 'h' }, + { NULL } + }; struct bch_opts opts = bch2_opts_empty(); struct bch_dev *ca; char *out = NULL; unsigned i, nr_devices = 0; - bool force = false, entire_journal = false; + bool force = false, entire_journal = true; int fd, opt; + opt_set(opts, read_only, true); opt_set(opts, nochanges, true); opt_set(opts, norecovery, true); opt_set(opts, degraded, true); opt_set(opts, errors, BCH_ON_ERROR_continue); opt_set(opts, fix_errors, FSCK_FIX_no); - while ((opt = getopt(argc, argv, "o:fjvh")) != -1) + while ((opt = getopt_long(argc, argv, "o:fvh", + longopts, NULL)) != -1) switch (opt) { case 'o': out = optarg; @@ -128,7 +137,7 @@ int cmd_dump(int argc, char *argv[]) force = true; break; case 'j': - entire_journal = true; + entire_journal = false; break; case 'v': opt_set(opts, verbose, true); diff --git a/cmd_format.c b/cmd_format.c index 26a1cd9..f0a4b6a 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -119,6 +119,7 @@ int cmd_format(int argc, char *argv[]) struct format_opts opts = format_opts_default(); struct dev_opts dev_opts = dev_opts_default(), *dev; bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false; + bool unconsumed_dev_option = false; unsigned v; int opt; @@ -162,6 +163,7 @@ int 
cmd_format(int argc, char *argv[]) case O_fs_size: if (bch2_strtoull_h(optarg, &dev_opts.size)) die("invalid filesystem size"); + unconsumed_dev_option = true; break; case O_superblock_size: if (bch2_strtouint_h(optarg, &opts.superblock_size)) @@ -172,23 +174,28 @@ int cmd_format(int argc, char *argv[]) case O_bucket_size: if (bch2_strtoull_h(optarg, &dev_opts.bucket_size)) die("bad bucket_size %s", optarg); + unconsumed_dev_option = true; break; case O_label: case 'l': dev_opts.label = optarg; + unconsumed_dev_option = true; break; case O_discard: dev_opts.discard = true; + unconsumed_dev_option = true; break; case O_data_allowed: dev_opts.data_allowed = read_flag_list_or_die(optarg, bch2_data_types, "data type"); + unconsumed_dev_option = true; break; case O_durability: if (kstrtouint(optarg, 10, &dev_opts.durability) || dev_opts.durability > BCH_REPLICAS_MAX) die("invalid durability"); + unconsumed_dev_option = true; break; case O_version: if (kstrtouint(optarg, 10, &opts.version)) @@ -202,6 +209,7 @@ int cmd_format(int argc, char *argv[]) dev_opts.path = optarg; darray_push(&devices, dev_opts); dev_opts.size = 0; + unconsumed_dev_option = false; break; case O_quiet: case 'q': @@ -219,6 +227,9 @@ int cmd_format(int argc, char *argv[]) break; } + if (unconsumed_dev_option) + die("Options for devices apply to subsequent devices; got a device option with no device"); + if (opts.version != bcachefs_metadata_version_current) initialize = false; @@ -230,8 +241,11 @@ int cmd_format(int argc, char *argv[]) initialize = false; } - darray_for_each(devices, dev) - dev->fd = open_for_format(dev->path, force); + darray_for_each(devices, dev) { + int ret = open_for_format(dev, force); + if (ret) + die("Error opening %s: %s", dev->path, strerror(-ret)); + } struct bch_sb *sb = bch2_format(fs_opt_strs, @@ -245,7 +259,7 @@ int cmd_format(int argc, char *argv[]) buf.human_readable_units = true; - bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members); + bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members_v2); printf("%s", buf.buf); printbuf_exit(&buf); @@ -305,8 +319,9 @@ int cmd_show_super(int argc, char *argv[]) { "help", 0, NULL, 'h' }, { NULL } }; - unsigned fields = 1 << BCH_SB_FIELD_members; + unsigned fields = 0; bool print_layout = false; + bool print_default_fields = true; int opt; while ((opt = getopt_long(argc, argv, "f:lh", longopts, NULL)) != -1) switch (opt) { case 'f': fields = !strcmp(optarg, "all") ? ~0 : read_flag_list_or_die(optarg, bch2_sb_fields, "superblock field"); + print_default_fields = false; break; case 'l': print_layout = true; @@ -342,6 +358,13 @@ int cmd_show_super(int argc, char *argv[]) if (ret) die("Error opening %s: %s", dev, bch2_err_str(ret)); + if (print_default_fields) { + fields |= bch2_sb_field_get(sb.sb, members_v2) + ?
1 << BCH_SB_FIELD_members_v2 + : 1 << BCH_SB_FIELD_members_v1; + fields |= BCH_SB_FIELD_errors; + } + struct printbuf buf = PRINTBUF; buf.human_readable_units = true; diff --git a/cmd_fusemount.c b/cmd_fusemount.c index 4470f83..46dbb9d 100644 --- a/cmd_fusemount.c +++ b/cmd_fusemount.c @@ -21,7 +21,8 @@ #include "libbcachefs/error.h" #include "libbcachefs/fs-common.h" #include "libbcachefs/inode.h" -#include "libbcachefs/io.h" +#include "libbcachefs/io_read.h" +#include "libbcachefs/io_write.h" #include "libbcachefs/opts.h" #include "libbcachefs/super.h" @@ -33,9 +34,9 @@ /* XXX cut and pasted from fsck.c */ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static inline u64 map_root_ino(u64 ino) +static inline subvol_inum map_root_ino(u64 ino) { - return ino == 1 ? 4096 : ino; + return (subvol_inum) { 1, ino == 1 ? 4096 : ino }; } static inline u64 unmap_root_ino(u64 ino) @@ -92,19 +93,18 @@ static void bcachefs_fuse_destroy(void *arg) bch2_fs_stop(c); } -static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir, +static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir_ino, const char *name) { + subvol_inum dir = map_root_ino(dir_ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked bi; struct qstr qstr = QSTR(name); - u64 inum; + subvol_inum inum; int ret; fuse_log(FUSE_LOG_DEBUG, "fuse_lookup(dir=%llu name=%s)\n", - dir, name); - - dir = map_root_ino(dir); + dir.inum, name); ret = bch2_inode_find_by_inum(c, dir, &bi); if (ret) { @@ -114,8 +114,8 @@ static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir, struct bch_hash_info hash_info = bch2_hash_info_init(c, &bi); - inum = bch2_dirent_lookup(c, dir, &hash_info, &qstr); - if (!inum) { + ret = bch2_dirent_lookup(c, dir, &hash_info, &qstr, &inum); + if (ret) { struct fuse_entry_param e = { .attr_timeout = DBL_MAX, .entry_timeout = DBL_MAX, @@ -139,20 +139,17 @@ err: fuse_reply_err(req, -ret); } -static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t inum, +static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { + subvol_inum inum = map_root_ino(ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked bi; struct stat attr; - int ret; - fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n", - inum); + fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n", inum.inum); - inum = map_root_ino(inum); - - ret = bch2_inode_find_by_inum(c, inum, &bi); + int ret = bch2_inode_find_by_inum(c, inum, &bi); if (ret) { fuse_log(FUSE_LOG_DEBUG, "fuse_getattr error %i\n", ret); fuse_reply_err(req, -ret); @@ -165,28 +162,27 @@ static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t inum, fuse_reply_attr(req, &attr, DBL_MAX); } -static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t inum, +static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, int to_set, struct fuse_file_info *fi) { struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked inode_u; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; u64 now; int ret; - fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n", - inum, to_set); + subvol_inum inum = map_root_ino(ino); - inum = map_root_ino(inum); + fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n", inum.inum, to_set); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); now = bch2_current_time(c); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, 
BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -208,15 +204,15 @@ retry: inode_u.bi_mtime = now; /* TODO: CTIME? */ - ret = bch2_inode_write(&trans, &iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_inode_write(trans, &iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret == -EINTR) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) { *attr = inode_to_stat(c, &inode_u); @@ -226,34 +222,37 @@ err: } } -static int do_create(struct bch_fs *c, u64 dir, +static int do_create(struct bch_fs *c, subvol_inum dir, const char *name, mode_t mode, dev_t rdev, struct bch_inode_unpacked *new_inode) { struct qstr qstr = QSTR(name); struct bch_inode_unpacked dir_u; - - dir = map_root_ino(dir); + uid_t uid = 0; + gid_t gid = 0; bch2_inode_init_early(c, new_inode); return bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, + bch2_create_trans(trans, dir, &dir_u, new_inode, &qstr, - 0, 0, mode, rdev, NULL, NULL)); + uid, gid, mode, rdev, NULL, NULL, + (subvol_inum) { 0 }, 0)); } -static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir, +static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir_ino, const char *name, mode_t mode, dev_t rdev) { + subvol_inum dir = map_root_ino(dir_ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked new_inode; int ret; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_mknod(%llu, %s, %x, %x)\n", - dir, name, mode, rdev); + dir.inum, name, mode, rdev); + ret = do_create(c, dir, name, mode, rdev, &new_inode); if (ret) goto err; @@ -277,21 +276,19 @@ static void bcachefs_fuse_mkdir(fuse_req_t req, fuse_ino_t dir, bcachefs_fuse_mknod(req, dir, name, mode, 0); } -static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir, +static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir_ino, const char *name) { struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked dir_u, inode_u; struct qstr qstr = QSTR(name); - int ret; + subvol_inum dir = map_root_ino(dir_ino); - fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir, name); + fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir.inum, name); - dir = map_root_ino(dir); - - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, dir, &dir_u, - &inode_u, &qstr)); + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_unlink_trans(trans, dir, &dir_u, + &inode_u, &qstr, false)); fuse_reply_err(req, -ret); } @@ -301,14 +298,12 @@ static void bcachefs_fuse_rmdir(fuse_req_t req, fuse_ino_t dir, { fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_rmdir(%llu, %s)\n", dir, name); - dir = map_root_ino(dir); - bcachefs_fuse_unlink(req, dir, name); } static void bcachefs_fuse_rename(fuse_req_t req, - fuse_ino_t src_dir, const char *srcname, - fuse_ino_t dst_dir, const char *dstname, + fuse_ino_t src_dir_ino, const char *srcname, + fuse_ino_t dst_dir_ino, const char *dstname, unsigned flags) { struct bch_fs *c = fuse_req_userdata(req); @@ -316,18 +311,17 @@ static void bcachefs_fuse_rename(fuse_req_t req, struct bch_inode_unpacked src_inode_u, dst_inode_u; struct qstr dst_name = QSTR(srcname); struct qstr src_name = QSTR(dstname); + subvol_inum src_dir = map_root_ino(src_dir_ino); + subvol_inum dst_dir = map_root_ino(dst_dir_ino); int ret; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_rename(%llu, %s, %llu, %s, %x)\n", - 
src_dir, srcname, dst_dir, dstname, flags); - - src_dir = map_root_ino(src_dir); - dst_dir = map_root_ino(dst_dir); + src_dir.inum, srcname, dst_dir.inum, dstname, flags); /* XXX handle overwrites */ ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_rename_trans(&trans, + bch2_rename_trans(trans, src_dir, &src_dir_u, dst_dir, &dst_dir_u, &src_inode_u, &dst_inode_u, @@ -337,22 +331,22 @@ static void bcachefs_fuse_rename(fuse_req_t req, fuse_reply_err(req, -ret); } -static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t inum, - fuse_ino_t newparent, const char *newname) +static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t ino, + fuse_ino_t newparent_ino, const char *newname) { struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked dir_u, inode_u; struct qstr qstr = QSTR(newname); + subvol_inum newparent = map_root_ino(newparent_ino); + subvol_inum inum = map_root_ino(ino); int ret; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_link(%llu, %llu, %s)\n", - inum, newparent, newname); - - newparent = map_root_ino(newparent); + inum, newparent.inum, newname); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_link_trans(&trans, newparent, - inum, &dir_u, &inode_u, &qstr)); + bch2_link_trans(trans, newparent, &dir_u, + inum, &inode_u, &qstr)); if (!ret) { struct fuse_entry_param e = inode_to_entry(c, &inode_u); @@ -375,22 +369,20 @@ static void bcachefs_fuse_open(fuse_req_t req, fuse_ino_t inum, static void userbio_init(struct bio *bio, struct bio_vec *bv, void *buf, size_t size) { - bio_init(bio, bv, 1); + bio_init(bio, NULL, bv, 1, 0); bio->bi_iter.bi_size = size; bv->bv_page = buf; bv->bv_len = size; bv->bv_offset = 0; } -static int get_inode_io_opts(struct bch_fs *c, u64 inum, - struct bch_io_opts *opts) +static int get_inode_io_opts(struct bch_fs *c, subvol_inum inum, struct bch_io_opts *opts) { struct bch_inode_unpacked inode; if (bch2_inode_find_by_inum(c, inum, &inode)) return -EINVAL; - *opts = bch2_opts_to_inode_opts(c->opts); - bch2_io_opts_apply(opts, bch2_inode_opts_get(&inode)); + bch2_inode_opts_get(opts, c, &inode); return 0; } @@ -448,7 +440,7 @@ static size_t align_fix_up_bytes(const struct fuse_align_io *align, /* * Read aligned data. 
*/ -static int read_aligned(struct bch_fs *c, fuse_ino_t inum, size_t aligned_size, +static int read_aligned(struct bch_fs *c, subvol_inum inum, size_t aligned_size, off_t aligned_offset, void *buf) { BUG_ON(aligned_size & (block_bytes(c) - 1)); @@ -478,10 +470,11 @@ static int read_aligned(struct bch_fs *c, fuse_ino_t inum, size_t aligned_size, return -blk_status_to_errno(rbio.bio.bi_status); } -static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum, +static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi) { + subvol_inum inum = map_root_ino(ino); struct bch_fs *c = fuse_req_userdata(req); fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_read(%llu, %zd, %lld)\n", @@ -520,43 +513,43 @@ static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum, free(buf); } -static int inode_update_times(struct bch_fs *c, fuse_ino_t inum) +static int inode_update_times(struct bch_fs *c, subvol_inum inum) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bch_inode_unpacked inode_u; int ret = 0; u64 now; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); now = bch2_current_time(c); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; inode_u.bi_mtime = now; inode_u.bi_ctime = now; - ret = bch2_inode_write(&trans, &iter, &inode_u); + ret = bch2_inode_write(trans, &iter, &inode_u); if (ret) goto err; - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret == -EINTR) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } -static int write_aligned(struct bch_fs *c, fuse_ino_t inum, +static int write_aligned(struct bch_fs *c, subvol_inum inum, struct bch_io_opts io_opts, void *buf, size_t aligned_size, off_t aligned_offset, off_t new_i_size, size_t *written_out) @@ -576,7 +569,8 @@ static int write_aligned(struct bch_fs *c, fuse_ino_t inum, op.write_point = writepoint_hashed(0); op.nr_replicas = io_opts.data_replicas; op.target = io_opts.foreground_target; - op.pos = POS(inum, aligned_offset >> 9); + op.subvol = inum.subvol; + op.pos = POS(inum.inum, aligned_offset >> 9); op.new_i_size = new_i_size; userbio_init(&op.wbio.bio, &bv, buf, aligned_size); @@ -597,11 +591,12 @@ static int write_aligned(struct bch_fs *c, fuse_ino_t inum, return op.error; } -static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t inum, +static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { + subvol_inum inum = map_root_ino(ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_io_opts io_opts; size_t aligned_written; @@ -686,24 +681,23 @@ err: } static void bcachefs_fuse_symlink(fuse_req_t req, const char *link, - fuse_ino_t dir, const char *name) + fuse_ino_t dir_ino, const char *name) { + subvol_inum dir = map_root_ino(dir_ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked new_inode; size_t link_len = strlen(link); int ret; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_symlink(%s, %llu, %s)\n", - link, dir, name); - - dir = map_root_ino(dir); + link, dir.inum, name); ret = do_create(c, dir, name, S_IFLNK|S_IRWXUGO, 0, &new_inode); if (ret) goto err; struct bch_io_opts 
io_opts; - ret = get_inode_io_opts(c, new_inode.bi_inum, &io_opts); + ret = get_inode_io_opts(c, dir, &io_opts); if (ret) goto err; @@ -715,8 +709,10 @@ static void bcachefs_fuse_symlink(fuse_req_t req, const char *link, memset(aligned_buf, 0, align.size); memcpy(aligned_buf, link, link_len); /* already terminated */ + subvol_inum inum = (subvol_inum) { dir.subvol, new_inode.bi_inum }; + size_t aligned_written; - ret = write_aligned(c, new_inode.bi_inum, io_opts, aligned_buf, + ret = write_aligned(c, inum, io_opts, aligned_buf, align.size, align.start, link_len + 1, &aligned_written); free(aligned_buf); @@ -727,7 +723,7 @@ static void bcachefs_fuse_symlink(fuse_req_t req, const char *link, size_t written = align_fix_up_bytes(&align, aligned_written); BUG_ON(written != link_len + 1); // TODO: handle short - ret = inode_update_times(c, new_inode.bi_inum); + ret = inode_update_times(c, inum); if (ret) goto err; @@ -741,12 +737,13 @@ err: fuse_reply_err(req, -ret); } -static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t inum) +static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t ino) { + subvol_inum inum = map_root_ino(ino); struct bch_fs *c = fuse_req_userdata(req); char *buf = NULL; - fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum); + fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum.inum); struct bch_inode_unpacked bi; int ret = bch2_inode_find_by_inum(c, inum, &bi); @@ -898,10 +895,11 @@ static bool handle_dots(struct fuse_dir_context *ctx, fuse_ino_t dir) return true; } -static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir, +static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir_ino, size_t size, off_t off, struct fuse_file_info *fi) { + subvol_inum dir = map_root_ino(dir_ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked bi; char *buf = calloc(size, 1); @@ -915,9 +913,7 @@ static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir, int ret = 0; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readdir(dir=%llu, size=%zu, " - "off=%lld)\n", dir, size, off); - - dir = map_root_ino(dir); + "off=%lld)\n", dir.inum, size, off); ret = bch2_inode_find_by_inum(c, dir, &bi); if (ret) @@ -928,7 +924,7 @@ static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir, goto reply; } - if (!handle_dots(&ctx, dir)) + if (!handle_dots(&ctx, dir.inum)) goto reply; ret = bch2_readdir(c, dir, &ctx.ctx); @@ -1012,16 +1008,17 @@ static void bcachefs_fuse_removexattr(fuse_req_t req, fuse_ino_t inum, } #endif -static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir, +static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir_ino, const char *name, mode_t mode, struct fuse_file_info *fi) { + subvol_inum dir = map_root_ino(dir_ino); struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked new_inode; int ret; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_create(%llu, %s, %x)\n", - dir, name, mode); + dir.inum, name, mode); ret = do_create(c, dir, name, mode, 0, &new_inode); if (ret) @@ -1032,7 +1029,6 @@ static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir, return; err: fuse_reply_err(req, -ret); - } #if 0 @@ -1222,6 +1218,17 @@ int cmd_fusemount(int argc, char *argv[]) } tokenize_devices(&ctx); + struct printbuf fsname = PRINTBUF; + prt_printf(&fsname, "fsname="); + for (i = 0; i < ctx.nr_devices; ++i) { + if (i) + prt_str(&fsname, ":"); + prt_str(&fsname, ctx.devices[i]); + } + + fuse_opt_add_arg(&args, "-o"); + fuse_opt_add_arg(&args, fsname.buf); + /* Open bch */ printf("Opening bcachefs 
filesystem on:\n"); for (i = 0; i < ctx.nr_devices; ++i) diff --git a/cmd_key.c b/cmd_key.c index e8c3eea..96206c4 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -92,7 +92,7 @@ int cmd_set_passphrase(int argc, char *argv[]) if (IS_ERR(c)) die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c))); - struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb); + struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) die("Filesystem does not have encryption enabled"); @@ -111,6 +111,7 @@ int cmd_set_passphrase(int argc, char *argv[]) die("error encrypting key"); crypt->key = new_key; + bch2_revoke_key(c->disk_sb.sb); bch2_write_super(c); bch2_fs_stop(c); return 0; @@ -129,7 +130,7 @@ int cmd_remove_passphrase(int argc, char *argv[]) if (IS_ERR(c)) die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c))); - struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb); + struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) die("Filesystem does not have encryption enabled"); diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c index a8915a1..21b78f1 100644 --- a/cmd_kill_btree_node.c +++ b/cmd_kill_btree_node.c @@ -11,6 +11,7 @@ #include "libbcachefs/btree_iter.h" #include "libbcachefs/errcode.h" #include "libbcachefs/error.h" +#include "libbcachefs/sb-members.h" #include "libbcachefs/super.h" static void kill_btree_node_usage(void) @@ -40,7 +41,7 @@ int cmd_kill_btree_node(int argc, char *argv[]) switch (opt) { case 'b': btree_id = read_string_list_or_die(optarg, - bch2_btree_ids, "btree id"); + __bch2_btree_ids, "btree id"); break; case 'l': if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH) @@ -63,7 +64,7 @@ int cmd_kill_btree_node(int argc, char *argv[]) if (IS_ERR(c)) die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c))); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct btree *b; int ret; @@ -73,9 +74,7 @@ int cmd_kill_btree_node(int argc, char *argv[]) if (ret) die("error %s from posix_memalign", bch2_err_str(ret)); - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) { + __for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) { if (b->c.level != level) continue; @@ -112,8 +111,8 @@ int cmd_kill_btree_node(int argc, char *argv[]) bch_err(c, "node at specified index not found"); ret = EXIT_FAILURE; done: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); bch2_fs_stop(c); return ret; diff --git a/cmd_list_journal.c b/cmd_list_journal.c index 655bfe2..70f7e66 100644 --- a/cmd_list_journal.c +++ b/cmd_list_journal.c @@ -252,7 +252,7 @@ int cmd_list_journal(int argc, char *argv[]) darray_push(&transaction_filter, bbpos_parse(optarg)); break; case 'k': - darray_push(&key_filter, read_string_list_or_die(optarg, bch2_btree_ids, "btree id")); + darray_push(&key_filter, read_string_list_or_die(optarg, __bch2_btree_ids, "btree id")); break; case 'v': opt_set(opts, verbose, true); diff --git a/cmd_migrate.c b/cmd_migrate.c index 3958ba6..cde1fce 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -33,7 +33,7 @@ #include "libbcachefs/errcode.h" #include "libbcachefs/fs-common.h" #include "libbcachefs/inode.h" -#include "libbcachefs/io.h" +#include "libbcachefs/io_write.h" #include "libbcachefs/replicas.h" #include "libbcachefs/str_hash.h" #include "libbcachefs/super.h" @@ 
-126,7 +126,7 @@ static void update_inode(struct bch_fs *c, bch2_inode_pack(&packed, inode); packed.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, - NULL, NULL, 0); + NULL, 0); if (ret) die("error updating inode: %s", bch2_err_str(ret)); } @@ -140,7 +140,7 @@ static void create_link(struct bch_fs *c, struct bch_inode_unpacked inode; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_link_trans(&trans, + bch2_link_trans(trans, (subvol_inum) { 1, parent->bi_inum }, &parent_u, (subvol_inum) { 1, inum }, &inode, &qstr)); if (ret) @@ -159,7 +159,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c, bch2_inode_init_early(c, &new_inode); int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, + bch2_create_trans(trans, (subvol_inum) { 1, parent->bi_inum }, parent, &new_inode, &qstr, uid, gid, mode, rdev, NULL, NULL, @@ -232,7 +232,7 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, struct bch_inode_unpacked inode_u; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_set(&trans, + bch2_xattr_set(trans, (subvol_inum) { 1, dst->bi_inum }, &inode_u, &hash_info, attr, val, val_size, h->flags, 0)); @@ -339,8 +339,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", bch2_err_str(ret)); - ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, - &res, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0); if (ret) die("btree insert error %s", bch2_err_str(ret)); @@ -670,20 +669,24 @@ static int migrate_fs(const char *fs_path, struct dev_opts dev = dev_opts_default(); dev.path = dev_t_to_path(stat.st_dev); - dev.fd = xopen(dev.path, O_RDWR); + dev.bdev = blkdev_get_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL); - opt_set(fs_opts, block_size, get_blocksize(dev.path, dev.fd)); + opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_buffered_fd)); char *file_path = mprintf("%s/bcachefs", fs_path); printf("Creating new filesystem on %s in space reserved at %s\n", dev.path, file_path); - bch2_pick_bucket_size(fs_opts, &dev); + dev.size = get_size(dev.bdev->bd_buffered_fd); + dev.bucket_size = bch2_pick_bucket_size(fs_opts, &dev); + dev.nbuckets = dev.size / dev.bucket_size; + + bch2_check_bucket_size(fs_opts, &dev); u64 bcachefs_inum; ranges extents = reserve_new_fs_space(file_path, fs_opts.block_size >> 9, - get_size(dev.path, dev.fd) / 5, + get_size(dev.bdev->bd_buffered_fd) / 5, &bcachefs_inum, stat.st_dev, force); find_superblock_space(extents, format_opts, &dev); diff --git a/cmd_version.c b/cmd_version.c index 3fb4b6e..5fe30e5 100644 --- a/cmd_version.c +++ b/cmd_version.c @@ -4,6 +4,6 @@ int cmd_version(int argc, char *argv[]) { - printf("bcachefs tool version %s\n", VERSION_STRING); + printf("%s\n", VERSION_STRING); return 0; } diff --git a/cmds.h b/cmds.h index 96216b2..5b3f5f5 100644 --- a/cmds.h +++ b/cmds.h @@ -13,13 +13,6 @@ int cmd_format(int argc, char *argv[]); int cmd_show_super(int argc, char *argv[]); int cmd_set_option(int argc, char *argv[]); -#if 0 -int cmd_assemble(int argc, char *argv[]); -int cmd_incremental(int argc, char *argv[]); -int cmd_run(int argc, char *argv[]); -int cmd_stop(int argc, char *argv[]); -#endif - int cmd_fs_usage(int argc, char *argv[]); int device_usage(void); @@ -60,6 +53,7 @@ int cmd_subvolume_delete(int argc, char *argv[]); int cmd_subvolume_snapshot(int argc, char *argv[]); int cmd_fusemount(int argc, char *argv[]); -void cmd_mount(int agc, char 
*argv[]); +int cmd_mount(int argc, char *argv[]); +int cmd_completions(int argc, char *argv[]); #endif /* _CMDS_H */ diff --git a/crypto.c b/crypto.c index 4e4d15a..32671bd 100644 --- a/crypto.c +++ b/crypto.c @@ -105,7 +105,7 @@ bool bch2_sb_is_encrypted(struct bch_sb *sb) { struct bch_sb_field_crypt *crypt; - return (crypt = bch2_sb_get_crypt(sb)) && + return (crypt = bch2_sb_field_get(sb, crypt)) && bch2_key_is_encrypted(&crypt->key); } @@ -113,7 +113,7 @@ void bch2_passphrase_check(struct bch_sb *sb, const char *passphrase, struct bch_key *passphrase_key, struct bch_encrypted_key *sb_key) { - struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(sb); + struct bch_sb_field_crypt *crypt = bch2_sb_field_get(sb, crypt); if (!crypt) die("filesystem is not encrypted"); diff --git a/debian/changelog b/debian/changelog index a1bfec3..f1e4e5e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +bcachefs-tools (24+really1.3.4-1) unstable; urgency=medium + + * New upstream release + + -- Jonathan Carter Tue, 21 Nov 2023 17:26:13 +0200 + bcachefs-tools (24+really1.2-1) unstable; urgency=medium * New upstream release (Closes: #1054613) diff --git a/default.nix b/default.nix index 2cccff2..80aeb43 100644 --- a/default.nix +++ b/default.nix @@ -1,10 +1,6 @@ -(import - ( - let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in - fetchTarball { - url = "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz"; - sha256 = lock.nodes.flake-compat.locked.narHash; - } - ) - { src = ./.; } -).defaultNix +(import (let lock = builtins.fromJSON (builtins.readFile ./flake.lock); +in fetchTarball { + url = + "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz"; + sha256 = lock.nodes.flake-compat.locked.narHash; +}) { src = ./.; }).defaultNix diff --git a/flake.lock b/flake.lock index 899f297..413959e 100644 --- a/flake.lock +++ b/flake.lock @@ -3,11 +3,11 @@ "flake-compat": { "flake": false, "locked": { - "lastModified": 1673956053, - "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=", + "lastModified": 1696426674, + "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", "owner": "edolstra", "repo": "flake-compat", - "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9", + "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", "type": "github" }, "original": { @@ -18,11 +18,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1686592866, - "narHash": "sha256-riGg89eWhXJcPNrQGcSwTEEm7CGxWC06oSX44hajeMw=", + "lastModified": 1698924604, + "narHash": "sha256-GCFbkl2tj8fEZBZCw3Tc0AkGo0v+YrQlohhEGJ/X4s0=", "owner": "nixos", "repo": "nixpkgs", - "rev": "0eeebd64de89e4163f4d3cf34ffe925a5cf67a05", + "rev": "fa804edfb7869c9fb230e174182a8a1a7e512c40", "type": "github" }, "original": { @@ -59,11 +59,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1685518550, - "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=", + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", "owner": "numtide", "repo": "flake-utils", - "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 0f8e90d..c4185e7 100644 --- a/flake.nix +++ b/flake.nix @@ -10,13 +10,30 @@ }; outputs = { self, nixpkgs, utils, ... 
}: - utils.lib.eachDefaultSystem (system: + { + overlays.default = final: prev: { + bcachefs = final.callPackage ./build.nix { }; + }; + } // utils.lib.eachDefaultSystem (system: let - pkgs = nixpkgs.legacyPackages.${system}; - bcachefs = pkgs.callPackage ./build.nix {}; + pkgs = import nixpkgs { + inherit system; + overlays = [ self.overlays.default ]; + }; in { packages = { - default = bcachefs; + inherit (pkgs) bcachefs; + default = pkgs.bcachefs; }; + + formatter = pkgs.nixfmt; + + devShells.default = pkgs.callPackage ({ mkShell, rustc, cargo, gnumake + , gcc, clang, pkg-config, libuuid, libsodium, keyutils, liburcu, zlib + , libaio, zstd, lz4, udev, bcachefs }: + mkShell { + LIBCLANG_PATH = "${clang.cc.lib}/lib"; + inherit (bcachefs) nativeBuildInputs buildInputs; + }) { }; }); } diff --git a/fsck.bcachefs b/fsck.bcachefs deleted file mode 100755 index f8de4a8..0000000 --- a/fsck.bcachefs +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -SDIR="$(readlink -f "$0")" -exec "${SDIR%/*}/bcachefs" fsck "$@" diff --git a/include/linux/atomic.h b/include/linux/atomic.h index f4d047c..2c983cd 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -47,6 +47,7 @@ typedef struct { #define smp_rmb() cmm_smp_rmb() #define smp_mb() cmm_smp_mb() #define smp_read_barrier_depends() cmm_smp_read_barrier_depends() +#define smp_acquire__after_ctrl_dep() cmm_smp_mb() #else /* C11_ATOMICS */ @@ -205,6 +206,11 @@ static inline i_type a_type##_dec_return(a_type##_t *v) \ return __ATOMIC_DEC_RETURN(&v->counter); \ } \ \ +static inline i_type a_type##_dec_return_release(a_type##_t *v) \ +{ \ + return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter); \ +} \ + \ static inline void a_type##_inc(a_type##_t *v) \ { \ __ATOMIC_INC(&v->counter); \ @@ -318,6 +324,12 @@ static inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) return atomic64_cmpxchg(v, old, new); } +static inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) +{ + smp_mb__before_atomic(); + return atomic64_sub_return(i, v); +} + #endif #endif /* __TOOLS_LINUX_ATOMIC_H */ diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index 62b91af..873f08c 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -6,38 +6,78 @@ #include #include +/* + * The futex wait op wants an explicit 32-bit address and value. If the bitmap + * used for the spinlock is 64-bit, cast down and pass the right 32-bit region + * for the in-kernel checks. The value is the copy that has already been read + * from the atomic op. + * + * The futex wake op interprets the value as the number of waiters to wake (up + * to INT_MAX), so pass that along directly. + */ +static inline void do_futex(int nr, unsigned long *addr, unsigned long v, int futex_flags) +{ + u32 *addr32 = (u32 *) addr; + u32 *v32 = (u32 *) &v; + int shift = 0; + + futex_flags |= FUTEX_PRIVATE_FLAG; + +#if BITS_PER_LONG == 64 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + shift = (nr >= 32) ? 1 : 0; +#else + shift = (nr < 32) ? 1 : 0; +#endif +#endif + if (shift) { + addr32 += shift; + v32 += shift; + } + /* + * The shift to determine the futex address may have cast away a + * literal wake count value. The value is capped to INT_MAX and thus + * always in the low bytes of v regardless of bit nr. Copy in the wake + * count to whatever 32-bit range was selected. 
+ */ + if (futex_flags == FUTEX_WAKE_PRIVATE) + *v32 = (u32) v; + futex(addr32, futex_flags, *v32, NULL, NULL, 0); +} + static inline void bit_spin_lock(int nr, unsigned long *_addr) { - u32 mask, *addr = ((u32 *) _addr) + (nr / 32), v; + unsigned long mask; + unsigned long *addr = _addr + (nr / BITS_PER_LONG); + unsigned long v; - nr &= 31; - mask = 1U << nr; + nr &= BITS_PER_LONG - 1; + mask = 1UL << nr; while (1) { v = __atomic_fetch_or(addr, mask, __ATOMIC_ACQUIRE); if (!(v & mask)) break; - futex(addr, FUTEX_WAIT|FUTEX_PRIVATE_FLAG, v, NULL, NULL, 0); + do_futex(nr, addr, v, FUTEX_WAIT); } } static inline void bit_spin_wake(int nr, unsigned long *_addr) { - u32 *addr = ((u32 *) _addr) + (nr / 32); - - futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0); + do_futex(nr, _addr, INT_MAX, FUTEX_WAKE); } static inline void bit_spin_unlock(int nr, unsigned long *_addr) { - u32 mask, *addr = ((u32 *) _addr) + (nr / 32); + unsigned long mask; + unsigned long *addr = _addr + (nr / BITS_PER_LONG); - nr &= 31; - mask = 1U << nr; + nr &= BITS_PER_LONG - 1; + mask = 1UL << nr; __atomic_and_fetch(addr, ~mask, __ATOMIC_RELEASE); - futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0); + do_futex(nr, addr, INT_MAX, FUTEX_WAKE); } #endif /* __LINUX_BIT_SPINLOCK_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d378ab..3914311 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -6,6 +6,8 @@ #include #include +#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) + #define BIO_MAX_VECS 256U typedef unsigned fmode_t; @@ -21,30 +23,20 @@ struct user_namespace; #define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) -/* file is open for reading */ -#define FMODE_READ ((__force fmode_t)0x1) -/* file is open for writing */ -#define FMODE_WRITE ((__force fmode_t)0x2) -/* file is seekable */ -#define FMODE_LSEEK ((__force fmode_t)0x4) -/* file can be accessed using pread */ -#define FMODE_PREAD ((__force fmode_t)0x8) -/* file can be accessed using pwrite */ -#define FMODE_PWRITE ((__force fmode_t)0x10) -/* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)0x20) -/* File is opened with O_NDELAY (only set for block devices) */ -#define FMODE_NDELAY ((__force fmode_t)0x40) -/* File is opened with O_EXCL (only set for block devices) */ -#define FMODE_EXCL ((__force fmode_t)0x80) -/* File is opened using open(.., 3, ..) 
and is writeable only for ioctls - (specialy hack for floppy.c) */ -#define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) -/* 32bit hashes as llseek() offset (for directories) */ -#define FMODE_32BITHASH ((__force fmode_t)0x200) -/* 64bit hashes as llseek() offset (for directories) */ -#define FMODE_64BITHASH ((__force fmode_t)0x400) -#define FMODE_BUFFERED ((__force fmode_t)0x800) +typedef unsigned int __bitwise blk_mode_t; + +/* open for reading */ +#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) +/* open for writing */ +#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) +/* open exclusively (vs other exclusive openers */ +#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) +/* opened with O_NDELAY */ +#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) +/* open for "writes" only for ioctls (specialy hack for floppy.c) */ +#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) + +#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5)) struct inode { unsigned long i_ino; @@ -93,9 +85,14 @@ int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsig unsigned bdev_logical_block_size(struct block_device *bdev); sector_t get_capacity(struct gendisk *disk); -void blkdev_put(struct block_device *bdev, fmode_t mode); +struct blk_holder_ops { + void (*mark_dead)(struct block_device *bdev); +}; + +void blkdev_put(struct block_device *bdev, void *holder); void bdput(struct block_device *bdev); -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hop); int lookup_bdev(const char *path, dev_t *); struct super_block { diff --git a/include/linux/closure.h b/include/linux/closure.h index 722a586..de7bb47 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -154,6 +154,7 @@ struct closure { struct closure *parent; atomic_t remaining; + bool closure_get_happened; #ifdef CONFIG_DEBUG_CLOSURES #define CLOSURE_MAGIC_DEAD 0xc054dead @@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl) */ static inline void closure_sync(struct closure *cl) { - if (closure_nr_remaining(cl) != 1) +#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened); +#endif + + if (cl->closure_get_happened) __closure_sync(cl); } @@ -233,8 +238,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, closure_set_ip(cl); cl->fn = fn; cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic(); } static inline void closure_queue(struct closure *cl) @@ -259,6 +262,8 @@ static inline void closure_queue(struct closure *cl) */ static inline void closure_get(struct closure *cl) { + cl->closure_get_happened = true; + #ifdef CONFIG_DEBUG_CLOSURES BUG_ON((atomic_inc_return(&cl->remaining) & CLOSURE_REMAINING_MASK) <= 1); @@ -281,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent) closure_get(parent); atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + cl->closure_get_happened = false; closure_debug_create(cl); closure_set_ip(cl); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 39df1f1..5778690 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -65,6 +65,11 @@ #define unreachable() __builtin_unreachable() #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #define fallthrough __attribute__((__fallthrough__)) +#define 
__noreturn __attribute__((__noreturn__)) + +#ifndef __counted_by +#define __counted_by(nr) +#endif #define ___PASTE(a,b) a##b #define __PASTE(a,b) ___PASTE(a,b) diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index c74b737..8474131 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -191,8 +191,8 @@ void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, size_t, size_t); /** - * genradix_iter_peek - get first entry at or below iterator's current - * position + * genradix_iter_peek_prev - get first entry at or below iterator's current + * position * @_iter: a genradix_iter * @_radix: genradix being iterated over * diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 35a7207..f9a5712 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -278,4 +278,7 @@ static inline void dump_stack(void) {} #define unsafe_memcpy(dst, src, bytes, justification) \ memcpy(dst, src, bytes) +#define DECLARE_FLEX_ARRAY(TYPE, NAME) \ + __DECLARE_FLEX_ARRAY(TYPE, NAME) + #endif diff --git a/include/linux/list.h b/include/linux/list.h index bdd09ef..d176d0d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -98,4 +98,15 @@ static inline void hlist_del_init(struct hlist_node *n) pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) +static inline size_t list_count_nodes(struct list_head *head) +{ + struct list_head *pos; + size_t count = 0; + + list_for_each(pos, head) + count++; + + return count; +} + #endif /* _LIST_LIST_H */ diff --git a/include/linux/page.h b/include/linux/page.h index e2dda66..111e5e6 100644 --- a/include/linux/page.h +++ b/include/linux/page.h @@ -26,6 +26,9 @@ struct page; #define kmap_atomic(page) page_address(page) #define kunmap_atomic(addr) do {} while (0) +#define kmap_local_page(page) page_address(page) +#define kunmap_local(addr) do {} while (0) + #define PageHighMem(page) false static const char zero_page[PAGE_SIZE]; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ef03253..ec5f478 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -12,7 +12,7 @@ #define rcu_access_pointer(p) READ_ONCE(p) #define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */ -#define kvfree_rcu(ptr) kfree(ptr) /* XXX */ +#define kvfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) diff --git a/include/linux/sched.h b/include/linux/sched.h index c5c8e3a..7afb6d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -107,7 +107,12 @@ extern __thread struct task_struct *current; #define set_current_state(state_value) \ smp_store_mb(current->state, (state_value)) -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +static inline struct task_struct *get_task_struct(struct task_struct *task) +{ + atomic_inc(&task->usage); + return task; + +} extern void __put_task_struct(struct task_struct *t); @@ -146,6 +151,14 @@ static inline u64 ktime_get_seconds(void) return ts.tv_sec; } +static inline u64 ktime_get_real_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); + return timespec_to_ns(&ts); +} + static inline u64 ktime_get_real_seconds(void) { struct timespec ts; diff --git a/include/linux/six.h b/include/linux/six.h deleted file mode 100644 index 394da42..0000000 --- a/include/linux/six.h +++ /dev/null @@ -1,388 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _LINUX_SIX_H -#define _LINUX_SIX_H - -/** - * DOC: SIX locks 
overview - * - * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores - * but with an additional state: read/shared, intent, exclusive/write - * - * The purpose of the intent state is to allow for greater concurrency on tree - * structures without deadlocking. In general, a read can't be upgraded to a - * write lock without deadlocking, so an operation that updates multiple nodes - * will have to take write locks for the full duration of the operation. - * - * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at thte start of the operation, - * and then take write locks only for the actual update to each individual - * nodes, without deadlocking. - * - * Example usage: - * six_lock_read(&foo->lock); - * six_unlock_read(&foo->lock); - * - * An intent lock must be held before taking a write lock: - * six_lock_intent(&foo->lock); - * six_lock_write(&foo->lock); - * six_unlock_write(&foo->lock); - * six_unlock_intent(&foo->lock); - * - * Other operations: - * six_trylock_read() - * six_trylock_intent() - * six_trylock_write() - * - * six_lock_downgrade() convert from intent to read - * six_lock_tryupgrade() attempt to convert from read to intent, may fail - * - * There are also interfaces that take the lock type as an enum: - * - * six_lock_type(&foo->lock, SIX_LOCK_read); - * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) - * six_lock_type(&foo->lock, SIX_LOCK_write); - * six_unlock_type(&foo->lock, SIX_LOCK_write); - * six_unlock_type(&foo->lock, SIX_LOCK_intent); - * - * Lock sequence numbers - unlock(), relock(): - * - * Locks embed sequences numbers, which are incremented on write lock/unlock. - * This allows locks to be dropped and the retaken iff the state they protect - * hasn't changed; this makes it much easier to avoid holding locks while e.g. - * doing IO or allocating memory. - * - * Example usage: - * six_lock_read(&foo->lock); - * u32 seq = six_lock_seq(&foo->lock); - * six_unlock_read(&foo->lock); - * - * some_operation_that_may_block(); - * - * if (six_relock_read(&foo->lock, seq)) { ... } - * - * If the relock operation succeeds, it is as if the lock was never unlocked. - * - * Reentrancy: - * - * Six locks are not by themselves reentrent, but have counters for both the - * read and intent states that can be used to provide reentrency by an upper - * layer that tracks held locks. If a lock is known to already be held in the - * read or intent state, six_lock_increment() can be used to bump the "lock - * held in this state" counter, increasing the number of unlock calls that - * will be required to fully unlock it. - * - * Example usage: - * six_lock_read(&foo->lock); - * six_lock_increment(&foo->lock, SIX_LOCK_read); - * six_unlock_read(&foo->lock); - * six_unlock_read(&foo->lock); - * foo->lock is now fully unlocked. - * - * Since the intent state supercedes read, it's legal to increment the read - * counter when holding an intent lock, but not the reverse. - * - * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) - * is not legal. - * - * should_sleep_fn: - * - * There is a six_lock() variant that takes a function pointer that is called - * immediately prior to schedule() when blocking, and may return an error to - * abort. 
- * - * One possible use for this feature is when objects being locked are part of - * a cache and may reused, and lock ordering is based on a property of the - * object that will change when the object is reused - i.e. logical key order. - * - * If looking up an object in the cache may race with object reuse, and lock - * ordering is required to prevent deadlock, object reuse may change the - * correct lock order for that object and cause a deadlock. should_sleep_fn - * can be used to check if the object is still the object we want and avoid - * this deadlock. - * - * Wait list entry interface: - * - * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a - * wait list entry. By embedding six_lock_waiter into another object, and by - * traversing lock waitlists, it is then possible for an upper layer to - * implement full cycle detection for deadlock avoidance. - * - * should_sleep_fn should be used for invoking the cycle detector, walking the - * graph of held locks to check for a deadlock. The upper layer must track - * held locks for each thread, and each thread's held locks must be reachable - * from its six_lock_waiter object. - * - * six_lock_waiter() will add the wait object to the waitlist re-trying taking - * the lock, and before calling should_sleep_fn, and the wait object will not - * be removed from the waitlist until either the lock has been successfully - * acquired, or we aborted because should_sleep_fn returned an error. - * - * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will - * have timestamps in strictly ascending order - this is so the timestamp can - * be used as a cursor for lock graph traverse. - */ - -#include -#include -#include -#include - -enum six_lock_type { - SIX_LOCK_read, - SIX_LOCK_intent, - SIX_LOCK_write, -}; - -struct six_lock { - atomic_t state; - u32 seq; - unsigned intent_lock_recurse; - struct task_struct *owner; - unsigned __percpu *readers; - struct optimistic_spin_queue osq; - raw_spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; - enum six_lock_type lock_want; - bool lock_acquired; - u64 start_time; -}; - -typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); - -void six_lock_exit(struct six_lock *lock); - -enum six_lock_init_flags { - SIX_LOCK_INIT_PCPU = 1U << 0, -}; - -void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags); - -/** - * six_lock_init - initialize a six lock - * @lock: lock to initialize - * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU - */ -#define six_lock_init(lock, flags) \ -do { \ - static struct lock_class_key __key; \ - \ - __six_lock_init((lock), #lock, &__key, flags); \ -} while (0) - -/** - * six_lock_seq - obtain current lock sequence number - * @lock: six_lock to obtain sequence number for - * - * @lock should be held for read or intent, and not write - * - * By saving the lock sequence number, we can unlock @lock and then (typically - * after some blocking operation) attempt to relock it: the relock will succeed - * if the sequence number hasn't changed, meaning no write locks have been taken - * and state corresponding to what @lock protects is still valid. 
- */ -static inline u32 six_lock_seq(const struct six_lock *lock) -{ - return lock->seq; -} - -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); - -/** - * six_trylock_type - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * - * Return: true on success, false on failure. - */ -static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -{ - return six_trylock_ip(lock, type, _THIS_IP_); -} - -int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip); - -/** - * six_lock_waiter - take a lock, with full waitlist interface - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @wait: pointer to wait object, which will be added to lock's waitlist - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * - * This is a convenience wrapper around six_lock_ip_waiter(), see that function - * for full documentation. - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); -} - -/** - * six_lock_ip - take a six lock lock - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct six_lock_waiter wait; - - return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); -} - -/** - * six_lock_type - take a six lock lock - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - struct six_lock_waiter wait; - - return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); -} - -bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip); - -/** - * six_relock_type - attempt to re-take a lock that was held previously - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @seq: lock sequence number obtained from six_lock_seq() while lock was - * held previously - * - * Return: true on success, false on failure. 
- */ -static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) -{ - return six_relock_ip(lock, type, seq, _THIS_IP_); -} - -void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); - -/** - * six_unlock_type - drop a six lock - * @lock: lock to unlock - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * - * When a lock is held multiple times (because six_lock_incement()) was used), - * this decrements the 'lock held' counter by one. - * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 - */ -static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - six_unlock_ip(lock, type, _THIS_IP_); -} - -#define __SIX_LOCK(type) \ -static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ -{ \ - return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ -} \ - \ -static inline bool six_trylock_##type(struct six_lock *lock) \ -{ \ - return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -} \ - \ -static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ - struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p,\ - unsigned long ip) \ -{ \ - return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -} \ - \ -static inline int six_lock_ip_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p, \ - unsigned long ip) \ -{ \ - return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -} \ - \ -static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -{ \ - return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ -} \ - \ -static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ -{ \ - return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ -} \ - \ -static inline int six_lock_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn fn, void *p)\ -{ \ - return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ -} \ - \ -static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -{ \ - six_unlock_ip(lock, SIX_LOCK_##type, ip); \ -} \ - \ -static inline void six_unlock_##type(struct six_lock *lock) \ -{ \ - six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -} - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) -#undef __SIX_LOCK - -void six_lock_downgrade(struct six_lock *); -bool six_lock_tryupgrade(struct six_lock *); -bool six_trylock_convert(struct six_lock *, enum six_lock_type, - enum six_lock_type); - -void six_lock_increment(struct six_lock *, enum six_lock_type); - -void six_lock_wakeup_all(struct six_lock *); - -struct six_lock_count { - unsigned n[3]; -}; - -struct six_lock_count six_lock_counts(struct six_lock *); -void six_lock_readers_add(struct six_lock *, int); - -#endif /* _LINUX_SIX_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 25ccf1a..ca0c793 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -27,7 +27,8 @@ static inline void *kmalloc_noprof(size_t size, gfp_t flags) for (i = 0; i < 10; i++) { if (size) { - size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE); + size_t alignment = min_t(size_t, PAGE_SIZE, + rounddown_pow_of_two(size)); alignment = max(sizeof(void *), alignment); if (posix_memalign(&p, alignment, size)) p = NULL; diff 
--git a/libbcachefs.c b/libbcachefs.c index bac772b..123109f 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -64,44 +64,44 @@ static u64 min_size(unsigned bucket_size) return BCH_MIN_NR_NBUCKETS * bucket_size; } -void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev) +u64 bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev) { - if (!dev->size) - dev->size = get_size(dev->path, dev->fd); + u64 bucket_size; - if (!dev->bucket_size) { - if (dev->size < min_size(opts.block_size)) - die("cannot format %s, too small (%llu bytes, min %llu)", - dev->path, dev->size, min_size(opts.block_size)); + if (dev->size < min_size(opts.block_size)) + die("cannot format %s, too small (%llu bytes, min %llu)", + dev->path, dev->size, min_size(opts.block_size)); - /* Bucket size must be >= block size: */ - dev->bucket_size = opts.block_size; + /* Bucket size must be >= block size: */ + bucket_size = opts.block_size; - /* Bucket size must be >= btree node size: */ - if (opt_defined(opts, btree_node_size)) - dev->bucket_size = max_t(unsigned, dev->bucket_size, - opts.btree_node_size); + /* Bucket size must be >= btree node size: */ + if (opt_defined(opts, btree_node_size)) + bucket_size = max_t(unsigned, bucket_size, + opts.btree_node_size); - /* Want a bucket size of at least 128k, if possible: */ - dev->bucket_size = max(dev->bucket_size, 128ULL << 10); + /* Want a bucket size of at least 128k, if possible: */ + bucket_size = max(bucket_size, 128ULL << 10); - if (dev->size >= min_size(dev->bucket_size)) { - unsigned scale = max(1, - ilog2(dev->size / min_size(dev->bucket_size)) / 4); + if (dev->size >= min_size(bucket_size)) { + unsigned scale = max(1, + ilog2(dev->size / min_size(bucket_size)) / 4); - scale = rounddown_pow_of_two(scale); + scale = rounddown_pow_of_two(scale); - /* max bucket size 1 mb */ - dev->bucket_size = min(dev->bucket_size * scale, 1ULL << 20); - } else { - do { - dev->bucket_size /= 2; - } while (dev->size < min_size(dev->bucket_size)); - } + /* max bucket size 1 mb */ + bucket_size = min(bucket_size * scale, 1ULL << 20); + } else { + do { + bucket_size /= 2; + } while (dev->size < min_size(bucket_size)); } - dev->nbuckets = dev->size / dev->bucket_size; + return bucket_size; +} +void bch2_check_bucket_size(struct bch_opts opts, struct dev_opts *dev) +{ if (dev->bucket_size < opts.block_size) die("Bucket size (%llu) cannot be smaller than block size (%u)", dev->bucket_size, opts.block_size); @@ -150,13 +150,12 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, { struct bch_sb_handle sb = { NULL }; struct dev_opts *i; - struct bch_sb_field_members *mi; unsigned max_dev_block_size = 0; unsigned opt_id; + u64 min_bucket_size = U64_MAX; for (i = devs; i < devs + nr_devs; i++) - max_dev_block_size = max(max_dev_block_size, - get_blocksize(i->path, i->fd)); + max_dev_block_size = max(max_dev_block_size, get_blocksize(i->bdev->bd_buffered_fd)); /* calculate block size: */ if (!opt_defined(fs_opts, block_size)) { @@ -165,9 +164,24 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, die("blocksize too small: %u, must be greater than device blocksize %u", fs_opts.block_size, max_dev_block_size); + /* get device size, if it wasn't specified: */ + for (i = devs; i < devs + nr_devs; i++) + if (!i->size) + i->size = get_size(i->bdev->bd_buffered_fd); + /* calculate bucket sizes: */ for (i = devs; i < devs + nr_devs; i++) - bch2_pick_bucket_size(fs_opts, i); + min_bucket_size = min(min_bucket_size, + i->bucket_size ?: bch2_pick_bucket_size(fs_opts, 
i)); + + for (i = devs; i < devs + nr_devs; i++) + if (!i->bucket_size) + i->bucket_size = min_bucket_size; + + for (i = devs; i < devs + nr_devs; i++) { + i->nbuckets = i->size / i->bucket_size; + bch2_check_bucket_size(fs_opts, i); + } /* calculate btree node size: */ if (!opt_defined(fs_opts, btree_node_size)) { @@ -193,7 +207,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, sb.sb->nr_devices = nr_devs; if (opts.version == bcachefs_metadata_version_current) - sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); uuid_generate(sb.sb->uuid.b); @@ -222,12 +236,13 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, sb.sb->time_precision = cpu_to_le32(1); /* Member info: */ - mi = bch2_sb_resize_members(&sb, + struct bch_sb_field_members_v2 *mi = + bch2_sb_field_resize(&sb, members_v2, (sizeof(*mi) + sizeof(struct bch_member) * nr_devs) / sizeof(u64)); - + mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); for (i = devs; i < devs + nr_devs; i++) { - struct bch_member *m = mi->members + (i - devs); + struct bch_member *m = bch2_members_v2_get_mut(sb.sb, (i - devs)); uuid_generate(m->uuid.b); m->nbuckets = cpu_to_le64(i->nbuckets); @@ -255,9 +270,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, * Recompute mi and m after each sb modification: its location * in memory may have changed due to reallocation. */ - mi = bch2_sb_get_members(sb.sb); - m = mi->members + (i - devs); - + m = bch2_members_v2_get_mut(sb.sb, (i - devs)); SET_BCH_MEMBER_GROUP(m, idx + 1); } @@ -273,12 +286,14 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, /* Crypt: */ if (opts.encrypted) { struct bch_sb_field_crypt *crypt = - bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64)); + bch2_sb_field_resize(&sb, crypt, sizeof(*crypt) / sizeof(u64)); bch_sb_crypt_init(sb.sb, crypt, opts.passphrase); SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1); } + bch2_sb_members_cpy_v2_v1(&sb); + for (i = devs; i < devs + nr_devs; i++) { u64 size_sectors = i->size >> 9; @@ -312,12 +327,12 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, /* Zero start of disk */ static const char zeroes[BCH_SB_SECTOR << 9]; - xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0, + xpwrite(i->bdev->bd_buffered_fd, zeroes, BCH_SB_SECTOR << 9, 0, "zeroing start of disk"); } - bch2_super_write(i->fd, sb.sb); - close(i->fd); + bch2_super_write(i->bdev->bd_buffered_fd, sb.sb); + close(i->bdev->bd_buffered_fd); } return sb.sb; @@ -510,7 +525,7 @@ int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd) case BCH_DATA_btree: case BCH_DATA_user: printf(" %s:%llu:%llu", - bch2_btree_ids[e.p.btree_id], + bch2_btree_id_str(e.p.btree_id), e.p.pos.inode, e.p.pos.offset); } diff --git a/libbcachefs.h b/libbcachefs.h index ba5d380..a16ae86 100644 --- a/libbcachefs.h +++ b/libbcachefs.h @@ -52,7 +52,7 @@ static inline struct format_opts format_opts_default() } struct dev_opts { - int fd; + struct block_device *bdev; char *path; u64 size; /* bytes*/ u64 bucket_size; /* bytes */ @@ -75,7 +75,9 @@ static inline struct dev_opts dev_opts_default() }; } -void bch2_pick_bucket_size(struct bch_opts, struct dev_opts *); +u64 bch2_pick_bucket_size(struct bch_opts, struct dev_opts *); +void bch2_check_bucket_size(struct bch_opts, struct dev_opts *); + struct bch_sb *bch2_format(struct bch_opt_strs, struct bch_opts, struct format_opts, struct dev_opts *, size_t); diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index b1a4888..f380989 100644 --- a/libbcachefs/acl.c +++ 
b/libbcachefs/acl.c @@ -1,18 +1,71 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_BCACHEFS_POSIX_ACL #include "bcachefs.h" -#include +#include "acl.h" +#include "xattr.h" + #include + +static const char * const acl_types[] = { + [ACL_USER_OBJ] = "user_obj", + [ACL_USER] = "user", + [ACL_GROUP_OBJ] = "group_obj", + [ACL_GROUP] = "group", + [ACL_MASK] = "mask", + [ACL_OTHER] = "other", + NULL, +}; + +void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) +{ + const void *p, *end = value + size; + + if (!value || + size < sizeof(bch_acl_header) || + ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) + return; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + unsigned tag = le16_to_cpu(in->e_tag); + + prt_str(out, acl_types[tag]); + + switch (tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + prt_printf(out, " %o", le16_to_cpu(in->e_perm)); + + if (p != end) + prt_char(out, ' '); + } +} + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#include "fs.h" + +#include #include #include #include -#include "acl.h" -#include "fs.h" -#include "xattr.h" - static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) { return sizeof(bch_acl_header) + @@ -226,18 +279,16 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); if (ret) { if (!bch2_err_matches(ret, ENOENT)) @@ -253,7 +304,7 @@ retry: } xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v), + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) @@ -262,8 +313,8 @@ out: if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return acl; } @@ -303,7 +354,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ -311,12 +362,11 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); acl = _acl; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; 
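
The conversions above and below all share one shape: on-stack bch2_trans_init()/bch2_trans_exit() pairs become bch2_trans_get()/bch2_trans_put(), and transaction restarts loop back to a retry label. A minimal sketch of that lifecycle, assuming a hypothetical do_work() helper standing in for the per-callsite lookups and updates:

static int do_work(struct btree_trans *, struct btree_iter *); /* placeholder */

static int example_update(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	int ret;
retry:
	bch2_trans_begin(trans);

	/* look up and modify keys, then commit; ?: short-circuits on error */
	ret = do_work(trans, &iter) ?:
	      bch2_trans_commit(trans, NULL, NULL, 0);

	/* tear down the iterator before deciding whether to restart */
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	return ret;
}

This mirrors bch2_set_acl() above: the iterator is exited before the restart check and re-created on each pass (there by bch2_inode_peek()), and bch2_trans_put() runs exactly once, after the final pass.
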
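The alloc_background.c hunks below follow a second recurring shape: open-coded prt_printf() plus return -BCH_ERR_invalid_bkey sequences become bkey_fsck_err_on() calls that name each check and jump to a shared fsck_err label. A minimal sketch of the converted validator form; example_invalid() and the example_val_size_bad error name are illustrative, not part of this patch:

static int example_invalid(struct bch_fs *c, struct bkey_s_c k,
			   enum bkey_invalid_flags flags,
			   struct printbuf *err)
{
	int ret = 0;

	/* on failure: logs to @err, sets ret, and jumps to fsck_err */
	bkey_fsck_err_on(bkey_val_bytes(k.k) < sizeof(struct bch_alloc), c, err,
			 example_val_size_bad,
			 "incorrect value size (%zu < %zu)",
			 bkey_val_bytes(k.k), sizeof(struct bch_alloc));
fsck_err:
	return ret;
}

The per-check names (alloc_v1_val_size_bad, alloc_v2_unpack_error, and so on in the real hunks) give each fsck check a stable identifier rather than a bare error string.
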
@@ -329,30 +379,30 @@ retry: goto btree_err; } - ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); + ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; - ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, 0); + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, 0); btree_err: - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err; - bch2_inode_update_after_write(&trans, inode, &inode_u, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); err: - bch2_trans_exit(&trans); mutex_unlock(&inode->ei_update_lock); + bch2_trans_put(trans); return ret; } @@ -367,7 +417,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; @@ -377,9 +427,10 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, return bch2_err_matches(ret, ENOENT) ? 0 : ret; k = bch2_btree_iter_peek_slot(&iter); - xattr = bkey_s_c_to_xattr(k); + ret = bkey_err(k); if (ret) goto err; + xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index bb21d8d..27e7eec 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -7,8 +7,6 @@ struct bch_hash_info; struct bch_inode_info; struct posix_acl; -#ifdef CONFIG_BCACHEFS_POSIX_ACL - #define BCH_ACL_VERSION 0x0001 typedef struct { @@ -26,6 +24,10 @@ typedef struct { __le32 a_version; } bch_acl_header; +void bch2_acl_to_text(struct printbuf *, const void *, size_t); + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7bf2a50..113273b 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -192,146 +192,109 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + int ret = 0; /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { - prt_printf(err, "incorrect value size (%zu < %u)", - bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, + alloc_v1_val_size_bad, + "incorrect value size (%zu < %u)", + bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); +fsck_err: + return ret; } -int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; + int ret = 0; - if (bch2_alloc_unpack_v2(&u, k)) { - prt_printf(err, "unpack error"); - return -BCH_ERR_invalid_bkey; - } - - 
return 0; + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, + alloc_v2_unpack_error, + "unpack error"); +fsck_err: + return ret; } -int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; + int ret = 0; - if (bch2_alloc_unpack_v3(&u, k)) { - prt_printf(err, "unpack error"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, + alloc_v2_unpack_error, + "unpack error"); +fsck_err: + return ret; } -int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - int rw = flags & WRITE; - - if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%u > %lu)", - alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); - return -BCH_ERR_invalid_bkey; - } - - if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { - prt_printf(err, "invalid backpointers_start"); - return -BCH_ERR_invalid_bkey; - } - - if (rw == WRITE && - !(flags & BKEY_INVALID_JOURNAL) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { - unsigned i, bp_len = 0; + int ret = 0; - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) - bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; + bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, + alloc_v4_val_size_bad, + "bad val size (%u > %zu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); - if (bp_len > a.v->dirty_sectors) { - prt_printf(err, "too many backpointers"); - return -BCH_ERR_invalid_bkey; - } - } + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, + alloc_v4_backpointers_start_bad, + "invalid backpointers_start"); - if (rw == WRITE) { - if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { - prt_printf(err, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, + alloc_key_data_type_bad, + "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); - switch (a.v->data_type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - if (a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe) { - prt_printf(err, "empty data type free but have data"); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_sb: - case BCH_DATA_journal: - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - if (!a.v->dirty_sectors) { - prt_printf(err, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_cached: - if (!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe) { - prt_printf(err, "data type inconsistency"); - return -BCH_ERR_invalid_bkey; - } - - if (!a.v->io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { - prt_printf(err, "cached bucket with read_time == 0"); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_stripe: - if (!a.v->stripe) { - prt_printf(err, "data_type %s but stripe==0", - 
bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } - break; - } + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + bkey_fsck_err_on(a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe, c, err, + alloc_key_empty_but_have_data, + "empty data type free but have data"); + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + bkey_fsck_err_on(!a.v->dirty_sectors, c, err, + alloc_key_dirty_sectors_0, + "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + break; + case BCH_DATA_cached: + bkey_fsck_err_on(!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe, c, err, + alloc_key_cached_inconsistency, + "data type inconsistency"); + + bkey_fsck_err_on(!a.v->io_time[READ] && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + c, err, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time == 0"); + break; + case BCH_DATA_stripe: + break; } - - return 0; -} - -static inline u64 swab40(u64 x) -{ - return (((x & 0x00000000ffULL) << 32)| - ((x & 0x000000ff00ULL) << 16)| - ((x & 0x0000ff0000ULL) >> 0)| - ((x & 0x00ff000000ULL) >> 16)| - ((x & 0xff00000000ULL) >> 32)); +fsck_err: + return ret; } void bch2_alloc_v4_swab(struct bkey_s k) @@ -347,6 +310,7 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); + a->fragmentation_lru = swab64(a->fragmentation_lru); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { @@ -544,17 +508,18 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { - prt_printf(err, "bad val size (%lu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, + bucket_gens_val_size_bad, + "bad val size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); +fsck_err: + return ret; } void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) @@ -571,7 +536,7 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_alloc_v4 a; @@ -582,9 +547,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) u8 gen; int ret; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { /* * Not a fsck error because this is checked/repaired by @@ -597,10 +560,10 @@ int bch2_bucket_gens_init(struct bch_fs *c) pos = alloc_gens_pos(iter.pos, &offset); if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + 
BCH_TRANS_COMMIT_lazy_rw, + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; have_bucket_gens_key = false; @@ -614,15 +577,15 @@ int bch2_bucket_gens_init(struct bch_fs *c) g.v.gens[offset] = gen; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (have_bucket_gens_key && !ret) - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -631,20 +594,19 @@ int bch2_bucket_gens_init(struct bch_fs *c) int bch2_alloc_read(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; int ret; down_read(&c->gc_lock); - bch2_trans_init(&trans, c, 0, 0); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { const struct bch_bucket_gens *g; u64 b; - for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; @@ -668,11 +630,11 @@ int bch2_alloc_read(struct bch_fs *c) b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } else { struct bch_alloc_v4 a; - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { /* * Not a fsck error because this is checked/repaired by @@ -685,10 +647,10 @@ int bch2_alloc_read(struct bch_fs *c) *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); up_read(&c->gc_lock); if (ret) @@ -753,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? 
"setting" : "clearing", - bch2_btree_ids[btree], + bch2_btree_id_str(btree), iter.pos.inode, iter.pos.offset, bch2_bkey_types[old.k->type], @@ -1012,6 +974,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, int ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, + alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) return bch2_btree_delete_at(trans, alloc_iter, 0); @@ -1031,7 +994,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (k.k->type != discard_key_type && (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" + fsck_err(c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[discard_key_type], @@ -1061,7 +1025,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (k.k->type != freespace_key_type && (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" + fsck_err(c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[freespace_key_type], @@ -1092,7 +1057,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (a->gen != alloc_gen(k, gens_offset) && (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" + fsck_err(c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" " %s", alloc_gen(k, gens_offset), a->gen, (printbuf_reset(&buf), @@ -1150,7 +1116,8 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, if (k.k->type != KEY_TYPE_set && (c->opts.reconstruct_alloc || - fsck_err(c, "hole in alloc btree missing in freespace btree\n" + fsck_err(c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" " device %llu buckets %llu-%llu", freespace_iter->pos.inode, freespace_iter->pos.offset, @@ -1213,6 +1180,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, for (i = gens_offset; i < gens_end_offset; i++) { if (fsck_err_on(g.v.gens[i], c, + bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, bucket_gens_pos_to_alloc(k.k->p, i).offset, @@ -1223,15 +1191,15 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, } if (need_update) { - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(k); + ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; - memcpy(k, &g, sizeof(g)); + memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); + ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); if (ret) goto err; } @@ -1270,8 +1238,9 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + need_discard_freespace_key_to_invalid_dev_bucket, "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) + bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) goto delete; a = bch2_alloc_to_v4(alloc_k, &a_convert); @@ -1279,9 +1248,10 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a)), c, + 
need_discard_freespace_key_bad, "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_ids[iter->btree_id], + bch2_btree_id_str(iter->btree_id), iter->pos.inode, iter->pos.offset, a->data_type == state, @@ -1297,7 +1267,7 @@ delete: ret = bch2_btree_delete_extent_at(trans, iter, iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw); goto out; } @@ -1308,7 +1278,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, if (!btree_id_is_extents(iter->btree_id)) { return __bch2_check_discard_freespace_key(trans, iter); } else { - int ret; + int ret = 0; while (!bkey_eq(iter->pos, end) && !(ret = btree_trans_too_many_iters(trans) ?: @@ -1346,6 +1316,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, dev_exists = bch2_dev_exists2(c, k.k->p.inode); if (!dev_exists) { if (fsck_err_on(!dev_exists, c, + bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1356,6 +1327,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, c, + bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1364,6 +1336,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, for (b = start; b < ca->mi.first_bucket; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; @@ -1371,21 +1344,21 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, for (b = ca->mi.nbuckets; b < end; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } if (need_update) { - struct bkey_i *k; + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - k = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(k); + ret = PTR_ERR_OR_ZERO(u); if (ret) goto out; - memcpy(k, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, k, 0); + memcpy(u, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, u, 0); } out: fsck_err: @@ -1395,27 +1368,25 @@ fsck_err: int bch2_check_alloc_info(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bkey hole; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &bucket_gens_iter, 
BTREE_ID_bucket_gens, POS_MIN, + bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH); while (1) { struct bpos next; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); k = bch2_get_key_or_real_bucket_hole(&iter, &hole); ret = bkey_err(k); @@ -1428,7 +1399,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (k.k->type) { next = bpos_nosnap_successor(k.k->p); - ret = bch2_check_alloc_key(&trans, + ret = bch2_check_alloc_key(trans, k, &iter, &discard_iter, &freespace_iter, @@ -1438,11 +1409,11 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(&trans, + ret = bch2_check_alloc_hole_freespace(trans, bkey_start_pos(k.k), &next, &freespace_iter) ?: - bch2_check_alloc_hole_bucket_gens(&trans, + bch2_check_alloc_hole_bucket_gens(trans, bkey_start_pos(k.k), &next, &bucket_gens_iter); @@ -1450,9 +1421,9 @@ int bch2_check_alloc_info(struct bch_fs *c) goto bkey_err; } - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw); if (ret) goto bkey_err; @@ -1463,29 +1434,29 @@ bkey_err: if (ret) break; } - bch2_trans_iter_exit(&trans, &bucket_gens_iter); - bch2_trans_iter_exit(&trans, &freespace_iter); - bch2_trans_iter_exit(&trans, &discard_iter); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &bucket_gens_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + bch2_trans_iter_exit(trans, &iter); if (ret < 0) goto err; - ret = for_each_btree_key2(&trans, iter, + ret = for_each_btree_key2(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: - for_each_btree_key2(&trans, iter, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: - for_each_btree_key_commit(&trans, iter, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_bucket_gens_key(&trans, &iter, k)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + bch2_check_bucket_gens_key(trans, &iter, k)); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return ret; @@ -1524,11 +1495,13 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, return ret; if (fsck_err_on(!a->io_time[READ], c, + alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), @@ -1571,10 +1544,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter))); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + bch2_check_alloc_to_lru_ref(trans, &iter))); if (ret) bch_err_fn(c, ret); return 
ret; @@ -1682,7 +1655,7 @@ write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1699,29 +1672,25 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; struct bpos discard_pos_done = POS_MAX; int ret; - bch2_trans_init(&trans, c, 0, 0); - /* * We're doing the commit in bch2_discard_one_bucket instead of using * for_each_btree_key_commit() so that we can increment counters after * successful commit: */ - ret = for_each_btree_key2(&trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded)); - - bch2_trans_exit(&trans); + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded))); if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); @@ -1791,7 +1760,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1827,15 +1796,13 @@ static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct bch_dev *ca; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) goto err; @@ -1843,11 +1810,11 @@ static void bch2_do_invalidates_work(struct work_struct *work) s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); + invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); @@ -1855,7 +1822,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) } } err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -1866,34 +1833,36 @@ void bch2_do_invalidates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - unsigned long *last_updated) +int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + u64 bucket_start, u64 bucket_end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; - struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); + struct bpos end = POS(ca->dev_idx, bucket_end); struct bch_member *m; + unsigned long last_updated = jiffies; int ret; - bch2_trans_init(&trans, c, 0, 0); + BUG_ON(bucket_start > bucket_end); + BUG_ON(bucket_end > ca->mi.nbuckets); - bch2_trans_iter_init(&trans, 
&iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), + BTREE_ITER_PREFETCH); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { - if (*last_updated + HZ * 10 < jiffies) { + if (last_updated + HZ * 10 < jiffies) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); - *last_updated = jiffies; + last_updated = jiffies; } - bch2_trans_begin(&trans); + bch2_trans_begin(trans); if (bkey_ge(iter.pos, end)) { ret = 0; @@ -1913,10 +1882,10 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(&trans, k, a, true) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + ret = bch2_bucket_do_index(trans, k, a, true) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_lazy_rw| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1924,7 +1893,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, } else { struct bkey_i *freespace; - freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); + freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); ret = PTR_ERR_OR_ZERO(freespace); if (ret) goto bkey_err; @@ -1934,10 +1903,10 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, freespace->k.p = k.k->p; freespace->k.size = k.k->size; - ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_lazy_rw| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1950,16 +1919,16 @@ bkey_err: break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (ret < 0) { - bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "initializing free space"); return ret; } mutex_lock(&c->sb_lock); - m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); mutex_unlock(&c->sb_lock); @@ -1972,7 +1941,6 @@ int bch2_fs_freespace_init(struct bch_fs *c) unsigned i; int ret = 0; bool doing_init = false; - unsigned long last_updated = jiffies; /* * We can crash during the device add path, so we need to check this on @@ -1988,7 +1956,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, &last_updated); + ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { percpu_ref_put(&ca->ref); bch_err_fn(c, ret); @@ -2109,6 +2077,17 @@ void bch2_recalc_capacity(struct bch_fs *c) closure_wake_up(&c->freelist_wait); } +u64 bch2_min_rw_member_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + u64 ret = U64_MAX; + + for_each_rw_member(ca, c, i) + ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + return ret; +} + static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; diff --git a/libbcachefs/alloc_background.h 
b/libbcachefs/alloc_background.h index c0914fe..73faf99 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -149,13 +149,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -193,7 +193,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -245,9 +245,11 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); } +int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); +u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 1f4c5b3..b85c776 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -25,7 +25,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "movinggc.h" #include "nocow_locking.h" @@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter iter, citer; + struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); + u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; int ret; + + /* + * Scan with an uncached iterator to avoid polluting the key cache. An + * uncached iter will return a cached key if one exists, but if not + * there is no other underlying protection for the associated key cache + * slot. To avoid racing bucket allocations, look up the cached key slot + * of any likely allocation candidate before attempting to proceed with + * the allocation. This provides proper exclusion on the associated + * bucket. 
+ */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { @@ -419,25 +430,38 @@ again: continue; a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) continue; + /* now check the cached key to serialize concurrent allocs of the bucket */ + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ret = bkey_err(ck); + if (ret) + break; + + a = bch2_alloc_to_v4(ck, &a_convert); + if (a->data_type != BCH_DATA_free) + goto next; + s->buckets_seen++; ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); +next: + citer.path->preserve = false; + bch2_trans_iter_exit(trans, &citer); if (ob) break; } bch2_trans_iter_exit(trans, &iter); + alloc_cursor = iter.pos.offset; ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); - if (!ob && alloc_cursor > alloc_start) { - alloc_cursor = alloc_start; + if (!ob && alloc_start > first_bucket) { + alloc_cursor = alloc_start = first_bucket; goto again; } @@ -502,9 +526,14 @@ again: } /** - * bch_bucket_alloc - allocate a single bucket from a specific device + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * @trans: transaction object + * @ca: device to allocate from + * @watermark: how important is this allocation? + * @cl: if not NULL, closure to be used to wait if buckets not available + * @usage: also returns the current device usage * - * Returns index of bucket on success, 0 on failure + * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, @@ -597,7 +626,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage))); return ob; } @@ -775,7 +804,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; struct open_bucket *ob; - struct bch_dev *ca; unsigned i, ec_idx; int ret = 0; @@ -805,8 +833,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->dev); - ob->ec_idx = ec_idx; ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); @@ -989,7 +1015,6 @@ retry_blocking: cl = _cl; goto retry_blocking; } - } return ret; @@ -1031,6 +1056,19 @@ static int open_bucket_add_buckets(struct btree_trans *trans, return ret < 0 ?
ret : 0; } +/** + * should_drop_bucket - check if this open_bucket should go away + * @ob: open_bucket to predicate on + * @c: filesystem handle + * @ca: if set, we're killing buckets for a particular device + * @ec: if true, we're shutting down erasure coding and killing all ec + * open_buckets + * if neither @ca nor @ec is set, every open_bucket matches + * Returns: true if we should kill this open_bucket + * + * We're killing open_buckets because we're shutting down a device, erasure + * coding, or the entire filesystem - check if this open_bucket matches: + */ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, struct bch_dev *ca, bool ec) { @@ -1516,25 +1554,47 @@ static const char * const bch2_write_point_states[] = { NULL }; +static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, + struct write_point *wp) +{ + struct open_bucket *ob; + unsigned i; + + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + + printbuf_indent_add(out, 2); + open_bucket_for_each(c, &wp->ptrs, ob, i) + bch2_open_bucket_to_text(out, c, ob); + printbuf_indent_sub(out, 2); +} + void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) { struct write_point *wp; - unsigned i; + prt_str(out, "Foreground write points\n"); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) { - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated); + wp++) + bch2_write_point_to_text(out, c, wp); - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); + prt_str(out, "Copygc write point\n"); + bch2_write_point_to_text(out, c, &c->copygc_write_point); - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } + prt_str(out, "Rebalance write point\n"); + bch2_write_point_to_text(out, c, &c->rebalance_write_point); - prt_newline(out); - } + prt_str(out, "Btree write point\n"); + bch2_write_point_to_text(out, c, &c->btree_write_point); } diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index fee195f..7aaeec4 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -5,7 +5,7 @@ #include "bcachefs.h" #include "alloc_types.h" #include "extents.h" -#include "super.h" +#include "sb-members.h" #include diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 804a843..b91b7a4 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -105,7 +105,7 @@ struct write_point { struct dev_stripe_state stripe; u64 sectors_allocated; - } __attribute__((__aligned__(SMP_CACHE_BYTES))); + } __aligned(SMP_CACHE_BYTES); struct { struct work_struct index_update_work; @@ -116,7 +116,7 @@ struct write_point { enum write_point_state state; u64 last_state_change; u64 time[WRITE_POINT_STATE_NR]; - } __attribute__((__aligned__(SMP_CACHE_BYTES))); + } __aligned(SMP_CACHE_BYTES); }; struct write_point_specifier { diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 8747c5e..4c8bcf2 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -5,6 +5,7 @@ #include "backpointers.h" #include "btree_cache.h" #include
"btree_update.h" +#include "btree_update_interior.h" #include "btree_write_buffer.h" #include "error.h" @@ -37,25 +38,26 @@ static bool extent_matches_bp(struct bch_fs *c, return false; } -int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + int ret = 0; - if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { - prt_str(err, "backpointer at wrong pos"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + c, err, + backpointer_pos_wrong, + "backpointer at wrong pos"); +fsck_err: + return ret; } void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) { prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", - bch2_btree_ids[bp->btree_id], + bch2_btree_id_str(bp->btree_id), bp->level, (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), @@ -76,7 +78,7 @@ void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - bp.v->bucket_offset = swab32(bp.v->bucket_offset); + bp.v->bucket_offset = swab40(bp.v->bucket_offset); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } @@ -219,18 +221,22 @@ out: static void backpointer_not_found(struct btree_trans *trans, struct bpos bp_pos, struct bch_backpointer bp, - struct bkey_s_c k, - const char *thing_it_points_to) + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + /* + * If we're using the btree write buffer, the backpointer we were + * looking at may have already been deleted - failure to find what it + * pointed to is not an error: + */ if (likely(!bch2_backpointers_no_use_write_buffer)) return; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", - thing_it_points_to); + bp.level ? 
"btree node" : "extent"); prt_printf(&buf, "bucket: "); bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); @@ -256,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bch_backpointer bp, unsigned iter_flags) { - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; - - bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, - 0, - min(bp.level, r->level), - iter_flags); - k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (bp.level == r->level + 1) - k = bkey_i_to_s_c(&r->key); - - if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); + if (likely(!bp.level)) { + struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, 0, + iter_flags); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } - if (unlikely(bch2_backpointers_no_use_write_buffer)) { - if (bp.level) { - struct btree *b; + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; - /* - * If a backpointer for a btree node wasn't found, it may be - * because it was overwritten by a new btree node that hasn't - * been written out yet - backpointer_get_node() checks for - * this: - */ - b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); - if (!IS_ERR_OR_NULL(b)) - return bkey_i_to_s_c(&b->key); + bch2_trans_iter_exit(trans, iter); + backpointer_not_found(trans, bp_pos, bp, k); + return bkey_s_c_null; + } else { + struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + if (IS_ERR_OR_NULL(b)) { bch2_trans_iter_exit(trans, iter); - - if (IS_ERR(b)) - return bkey_s_c_err(PTR_ERR(b)); - return bkey_s_c_null; + return IS_ERR(b) ? 
bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; } - - backpointer_not_found(trans, bp_pos, bp, k, "extent"); + return bkey_i_to_s_c(&b->key); } - - return bkey_s_c_null; } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, @@ -326,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, bp.level - 1, 0); b = bch2_btree_iter_peek_node(iter); - if (IS_ERR(b)) + if (IS_ERR_OR_NULL(b)) goto err; - if (b && extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + BUG_ON(b->c.level != bp.level - 1); + + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) return b; - if (b && btree_node_will_make_reachable(b)) { + if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bp_pos, bp, - bkey_i_to_s_c(&b->key), "btree node"); + backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); b = NULL; } err: @@ -351,20 +339,18 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bch_dev *ca; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - "backpointer for mising device:\n%s", + backpointer_to_missing_device, + "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bp_pos_to_bucket(c, k.k->p), 0); ret = bkey_err(alloc_k); @@ -372,6 +358,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ goto out; if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -393,10 +380,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - bch2_check_btree_backpointer(&trans, &iter, k))); + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + bch2_check_btree_backpointer(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -456,39 +443,32 @@ fsck_err: return ret; missing: prt_printf(&buf, "missing backpointer for btree=%s l=%u ", - bch2_btree_ids[bp.btree_id], bp.level); + bch2_btree_id_str(bp.btree_id), bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || c->opts.reconstruct_alloc || - fsck_err(c, "%s", buf.buf)) + fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; } static int check_extent_to_backpointers(struct btree_trans *trans, - struct btree_iter *iter, + enum btree_id btree, unsigned level, struct bpos bucket_start, struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bpos_level *last_flushed, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct 
bkey_s_c k; int ret; - k = bch2_btree_iter_peek_all_levels(iter); - ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket_pos; @@ -497,7 +477,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); ret = check_bp_exists(trans, bucket_pos, bp, k, @@ -514,44 +494,33 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, enum btree_id btree_id, struct bpos bucket_start, struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bpos_level *last_flushed, + int *level) { struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree_id); struct btree_iter iter; struct btree *b; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; int ret; - - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); +retry: + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, + 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; - BUG_ON(b != btree_node_root(c, b)); - - k = bkey_i_to_s_c(&b->key); - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; - struct bch_backpointer bp; - - if (p.ptr.cached) - continue; + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry; + } - bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, - k, p, &bucket_pos, &bp); + *level = b->c.level; - ret = check_bp_exists(trans, bucket_pos, bp, k, + k = bkey_i_to_s_c(&b->key); + ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1, bucket_start, bucket_end, - last_flushed); - if (ret) - goto err; - } + last_flushed, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -629,43 +598,49 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; enum btree_id btree_id; - struct bpos_level last_flushed = { UINT_MAX }; + struct bkey_s_c k; + struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; int ret = 0; for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; - - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - depth, - BTREE_ITER_ALL_LEVELS| - BTREE_ITER_PREFETCH); - - do { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent_to_backpointers(trans, &iter, - bucket_start, bucket_end, - &last_flushed)); - if (ret) - break; - } while (!bch2_btree_iter_advance(&iter)); - - bch2_trans_iter_exit(trans, &iter); - - if (ret) - break; + int level, depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_lazy_rw| + BCH_TRANS_COMMIT_no_enospc, check_btree_root_to_backpointers(trans, btree_id, bucket_start, bucket_end, - &last_flushed)); + &last_flushed, &level)); if (ret) - break; + return ret; + + while (level >= depth) { + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + level, + BTREE_ITER_PREFETCH); + for_each_btree_key_continue(trans, iter, BTREE_ITER_PREFETCH, k, ret) { + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_lazy_rw| + BCH_TRANS_COMMIT_no_enospc, + check_extent_to_backpointers(trans, btree_id, level, + bucket_start, bucket_end, + &last_flushed, k)); + if (ret) + break; + + if (bpos_eq(iter.pos, SPOS_MAX)) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + --level; + } } - return ret; + + return 0; } static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, @@ -706,7 +681,7 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, --btree_nodes; if (!btree_nodes) { - *end = alloc_k.k->p; + *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; break; } @@ -726,13 +701,12 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bpos start = POS_MIN, end; int ret; - bch2_trans_init(&trans, c, 0, 0); while (1) { - ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, start, &end); if (ret) break; @@ -752,13 +726,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); + ret = bch2_check_extents_to_backpointers_pass(trans, start, end); if (ret || bpos_eq(end, SPOS_MAX)) break; start = bpos_successor(end); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -797,7 +771,9 @@ static int check_one_backpointer(struct btree_trans *trans, } if (fsck_err_on(!k.k, c, - "backpointer for missing extent\n %s", + backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? 
"btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); goto out; @@ -819,7 +795,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), &last_flushed_pos)); @@ -827,13 +803,12 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, int bch2_check_backpointers_to_extents(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; int ret; - bch2_trans_init(&trans, c, 0, 0); while (1) { - ret = bch2_get_btree_in_memory_pos(&trans, + ret = bch2_get_btree_in_memory_pos(trans, (1U << BTREE_ID_extents)| (1U << BTREE_ID_reflink), ~0, @@ -859,13 +834,13 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) printbuf_exit(&buf); } - ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); + ret = bch2_check_backpointers_to_extents_pass(trans, start, end); if (ret || !bbpos_cmp(end, BBPOS_MAX)) break; start = bbpos_successor(end); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index 547e061..ab866fe 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -7,7 +7,16 @@ #include "buckets.h" #include "super.h" -int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, +static inline u64 swab40(u64 x) +{ + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + +int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, enum bkey_invalid_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/bbpos.h b/libbcachefs/bbpos.h index 1fbed1f..be2edce 100644 --- a/libbcachefs/bbpos.h +++ b/libbcachefs/bbpos.h @@ -2,20 +2,9 @@ #ifndef _BCACHEFS_BBPOS_H #define _BCACHEFS_BBPOS_H +#include "bbpos_types.h" #include "bkey_methods.h" - -struct bbpos { - enum btree_id btree; - struct bpos pos; -}; - -static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -{ - return (struct bbpos) { btree, pos }; -} - -#define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#include "btree_cache.h" static inline int bbpos_cmp(struct bbpos l, struct bbpos r) { @@ -40,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) { - prt_str(out, bch2_btree_ids[pos.btree]); + prt_str(out, bch2_btree_id_str(pos.btree)); prt_char(out, ':'); bch2_bpos_to_text(out, pos.pos); } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index e1f1e8e..3117ab4 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -209,6 +209,7 @@ #include "nocow_locking_types.h" #include "opts.h" #include "recovery_types.h" +#include "sb-errors_types.h" #include "seqmutex.h" #include "util.h" @@ -293,9 +294,17 @@ do { \ 
printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_fn(_c, _ret) \ - bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) -#define bch_err_msg(_c, _ret, _msg) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) +do { \ + if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ +} while (0) + +#define bch_err_msg(_c, _ret, _msg, ...) \ +do { \ + if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + bch_err(_c, "%s(): error " _msg " %s", __func__, \ + ##__VA_ARGS__, bch2_err_str(_ret)); \ +} while (0) #define bch_verbose(c, fmt, ...) \ do { \ @@ -371,7 +380,7 @@ BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM #ifndef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; BCH_DEBUG_PARAMS_DEBUG() #undef BCH_DEBUG_PARAM #endif @@ -392,7 +401,9 @@ BCH_DEBUG_PARAMS_DEBUG() x(journal_flush_write) \ x(journal_noflush_write) \ x(journal_flush_seq) \ - x(blocked_journal) \ + x(blocked_journal_low_on_space) \ + x(blocked_journal_low_on_pin) \ + x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(nocow_lock_contended) @@ -410,6 +421,7 @@ enum bch_time_stats { #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" #include "clock_types.h" +#include "disk_groups_types.h" #include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" @@ -454,6 +466,8 @@ enum gc_phase { GC_PHASE_BTREE_bucket_gens, GC_PHASE_BTREE_snapshot_trees, GC_PHASE_BTREE_deleted_inodes, + GC_PHASE_BTREE_logged_ops, + GC_PHASE_BTREE_rebalance_work, GC_PHASE_PENDING_DELETE, }; @@ -491,6 +505,8 @@ struct bch_dev { * Committed by bch2_write_super() -> bch_fs_mi_update() */ struct bch_member_cpu mi; + atomic64_t errors[BCH_MEMBER_ERROR_NR]; + __uuid_t uuid; char name[BDEVNAME_SIZE]; @@ -569,7 +585,7 @@ enum { BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, - BCH_FS_HAVE_DELETED_SNAPSHOTS, + BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, /* errors: */ BCH_FS_ERROR, @@ -603,7 +619,7 @@ struct journal_seq_blacklist_table { u64 start; u64 end; bool dirty; - } entries[0]; + } entries[]; }; struct journal_keys { @@ -626,8 +642,8 @@ struct journal_keys { size_t size; }; -struct btree_path_buf { - struct btree_path *path; +struct btree_trans_buf { + struct btree_trans *trans; }; #define REPLICAS_DELTA_LIST_MAX (1U << 16) @@ -737,6 +753,7 @@ struct bch_fs { struct snapshot_table __rcu *snapshots; size_t snapshot_table_size; struct mutex snapshot_table_lock; + struct rw_semaphore snapshot_create_lock; struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; @@ -786,9 +803,9 @@ struct bch_fs { /* btree_iter.c: */ struct seqmutex btree_trans_lock; struct list_head btree_trans_list; - mempool_t btree_paths_pool; + mempool_t btree_trans_pool; mempool_t btree_trans_mem_pool; - struct btree_path_buf __percpu *btree_paths_bufs; + struct btree_trans_buf __percpu *btree_trans_bufs; struct srcu_struct btree_trans_barrier; bool btree_trans_barrier_initialized; @@ -928,9 +945,6 @@ struct bch_fs { struct list_head moving_context_list; struct mutex moving_context_lock; - struct list_head data_progress_list; - struct mutex data_progress_lock; - /* REBALANCE */ struct bch_fs_rebalance rebalance; @@ -981,11 +995,6 @@ struct 
bch_fs { struct bio_set dio_read_bioset; struct bio_set nocow_flush_bioset; - /* ERRORS */ - struct list_head fsck_errors; - struct mutex fsck_error_lock; - bool fsck_alloc_err; - /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; @@ -995,6 +1004,7 @@ struct bch_fs { enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + u64 recovery_passes_complete; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1033,6 +1043,14 @@ struct bch_fs { struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; + + /* ERRORS */ + struct list_head fsck_error_msgs; + struct mutex fsck_error_msgs_lock; + bool fsck_alloc_msgs_err; + + bch_sb_errors_cpu fsck_error_counts; + struct mutex fsck_error_counts_lock; }; extern struct wait_queue_head bch2_read_only_wait; @@ -1139,22 +1157,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 5ec218e..0a75095 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -83,8 +83,8 @@ typedef uuid_t __uuid_t; #endif #define BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ \ static inline __u64 name(const type *k) \ { \ @@ -98,9 +98,9 @@ static inline void SET_##name(type *k, __u64 v) \ } #define LE_BITMASK(_bits, name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ +static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\ \ static inline __u64 name(const type *k) \ { \ @@ -370,7 +370,9 @@ static inline void bkey_init(struct bkey *k) x(backpointer, 28) \ x(inode_v3, 29) \ x(bucket_gens, 30) \ - x(snapshot_tree, 31) + x(snapshot_tree, 31) \ + x(logged_op_truncate, 32) \ + x(logged_op_finsert, 33) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -611,31 +613,17 @@ struct bch_extent_stripe_ptr { #endif }; -struct bch_extent_reservation { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:22, - replicas:4, - generation:32; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 generation:32, - replicas:4, - unused:22, - type:6; -#endif -}; - struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:7, - unused:33, - compression:8, + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ target:16; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 target:16, compression:8, - unused:33, - type:7; + unused:34, + 
type:6; #endif }; @@ -723,7 +711,7 @@ struct bch_inode { __le64 bi_hash_seed; __le32 bi_flags; __le16 bi_mode; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); struct bch_inode_v2 { @@ -733,7 +721,7 @@ struct bch_inode_v2 { __le64 bi_hash_seed; __le64 bi_flags; __le16 bi_mode; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); struct bch_inode_v3 { @@ -745,7 +733,7 @@ struct bch_inode_v3 { __le64 bi_sectors; __le64 bi_size; __le64 bi_version; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); #define INODEv3_FIELDS_START_INITIAL 6 @@ -836,34 +824,30 @@ enum inode_opt_id { Inode_opt_nr, }; -enum { - /* - * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL - * flags) - */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, - - __BCH_INODE_I_SIZE_DIRTY = 5, - __BCH_INODE_I_SECTORS_DIRTY = 6, - __BCH_INODE_UNLINKED = 7, - __BCH_INODE_BACKPTR_UNTRUSTED = 8, - - /* bits 20+ reserved for packed fields below: */ -}; - -#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -916,9 +900,7 @@ struct bch_dirent { #define DT_SUBVOL 16 #define BCH_DT_MAX 17 -#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ - sizeof(struct bkey) - \ - offsetof(struct bch_dirent, d_name))) +#define BCH_NAME_MAX 512 /* Xattrs */ @@ -1099,20 +1081,20 @@ struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; - __u64 _data[0]; + __u64 _data[]; } __packed __aligned(8); struct bch_indirect_inline_data { struct bch_val v; __le64 refcount; - u8 data[0]; + u8 data[]; }; /* Inline data */ struct bch_inline_data { struct bch_val v; - u8 data[0]; + u8 data[]; }; /* Subvolumes: */ @@ -1126,6 +1108,11 @@ struct bch_subvolume { __le32 flags; __le32 snapshot; __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + */ __le32 parent; __le32 pad; bch_le128 otime; @@ -1147,6 +1134,7 @@ struct bch_snapshot { __le32 parent; __le32 children[2]; __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ __le32 tree; __le32 depth; __le32 skip[3]; @@ -1179,6 +1167,33 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +/* Logged operations btree: */ + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 
new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1189,7 +1204,7 @@ struct bch_sb_field { #define BCH_SB_FIELDS() \ x(journal, 0) \ - x(members, 1) \ + x(members_v1, 1) \ x(crypt, 2) \ x(replicas_v0, 3) \ x(quota, 4) \ @@ -1198,7 +1213,9 @@ struct bch_sb_field { x(replicas, 7) \ x(journal_seq_blacklist, 8) \ x(journal_v2, 9) \ - x(counters, 10) + x(counters, 10) \ + x(members_v2, 11) \ + x(errors, 12) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1219,7 +1236,7 @@ enum bch_sb_field_type { struct bch_sb_field_journal { struct bch_sb_field field; - __le64 buckets[0]; + __le64 buckets[]; }; struct bch_sb_field_journal_v2 { @@ -1228,13 +1245,38 @@ struct bch_sb_field_journal_v2 { struct bch_sb_field_journal_v2_entry { __le64 start; __le64 nr; - } d[0]; + } d[]; }; -/* BCH_SB_FIELD_members: */ +/* BCH_SB_FIELD_members_v1: */ #define BCH_MIN_NR_NBUCKETS (1 << 6) +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + struct bch_member { __uuid_t uuid; __le64 nbuckets; /* device size */ @@ -1243,17 +1285,23 @@ struct bch_member { __le32 pad; __le64 last_mount; /* time_t */ - __le64 flags[2]; + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; }; -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags[0], 30, 31) + struct bch_member, flags, 30, 31) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -1273,9 +1321,16 @@ enum bch_member_state { BCH_MEMBER_STATE_NR }; -struct bch_sb_field_members { +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { struct bch_sb_field field; - struct bch_member members[0]; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; }; /* BCH_SB_FIELD_crypt: */ @@ -1373,19 +1428,19 @@ static inline bool data_type_is_hidden(enum bch_data_type 
type) struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[0]; + __u8 devs[]; } __packed; struct bch_sb_field_replicas_v0 { struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[0]; + struct bch_replicas_entry_v0 entries[]; } __packed __aligned(8); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[0]; + __u8 devs[]; } __packed; #define replicas_entry_bytes(_i) \ @@ -1393,7 +1448,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[0]; + struct bch_replicas_entry entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_quota: */ @@ -1428,7 +1483,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; - struct bch_disk_group entries[0]; + struct bch_disk_group entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_counters */ @@ -1521,7 +1576,7 @@ enum bch_persistent_counters { struct bch_sb_field_counters { struct bch_sb_field field; - __le64 d[0]; + __le64 d[]; }; /* @@ -1535,10 +1590,8 @@ struct jset_entry { __u8 type; /* designates what this jset holds */ __u8 pad[3]; - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; + struct bkey_i start[0]; + __u64 _data[]; }; struct bch_sb_field_clean { @@ -1549,10 +1602,8 @@ struct bch_sb_field_clean { __le16 _write_clock; __le64 journal_seq; - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; + struct jset_entry start[0]; + __u64 _data[]; }; struct journal_seq_blacklist_entry { @@ -1562,13 +1613,20 @@ struct journal_seq_blacklist_entry { struct bch_sb_field_journal_seq_blacklist { struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; - union { - struct journal_seq_blacklist_entry start[0]; - __u64 _data[0]; - }; +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; }; +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + /* Superblock: */ /* @@ -1631,7 +1689,9 @@ struct bch_sb_field_journal_seq_blacklist { x(snapshot_skiplists, BCH_VERSION(1, 1), \ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ x(deleted_inodes, BCH_VERSION(1, 2), \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) + BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ + x(rebalance_work, BCH_VERSION(1, 3), \ + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1641,7 +1701,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_max }; -static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; +static const __maybe_unused +unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1702,10 +1763,8 @@ struct bch_sb { struct bch_sb_layout layout; - union { - struct bch_sb_field start[0]; - __le64 _data[0]; - }; + struct bch_sb_field start[0]; + __le64 _data[]; } __packed __aligned(8); /* @@ -1950,7 +2009,7 @@ enum bch_csum_type { BCH_CSUM_NR }; -static const unsigned bch_crc_bytes[] = { +static const __maybe_unused unsigned bch_crc_bytes[] = { [BCH_CSUM_none] = 0, [BCH_CSUM_crc32c_nonzero] = 4, [BCH_CSUM_crc32c] = 4, @@ -2182,10 +2241,8 @@ struct jset { __le64 last_seq; - union { - struct jset_entry start[0]; 
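The [0]-to-[] conversions running through these on-disk structs follow the kernel-wide move from GNU zero-length arrays to C99 flexible array members, which lets the compiler and FORTIFY_SOURCE reason about array bounds. Since only the last member of a struct may be a flexible array, the overlaid start/_data pairs drop their union and keep start as a zero-length alias while only _data becomes truly flexible. A minimal standalone sketch of the resulting layout and sizing, modelled on the jset/bset pattern here but not copied from the tree (sketch_entry and vstruct_sketch are illustrative names):

	#include <stdint.h>
	#include <stddef.h>

	struct sketch_entry { uint64_t d; };

	/* mirrors the start[0]/_data[] pattern used by jset and bset */
	struct vstruct_sketch {
		uint16_t		u64s;		/* payload length, in u64s */
		struct sketch_entry	start[0];	/* zero-size alias of payload (GNU ext) */
		uint64_t		_data[];	/* C99 flexible array member; must be last */
	};

	static inline size_t vstruct_sketch_bytes(const struct vstruct_sketch *v)
	{
		/* sizeof() counts neither array; the payload is sized via u64s */
		return sizeof(*v) + v->u64s * sizeof(uint64_t);
	}

Both array members name the same bytes, so callers iterate typed entries through start while size arithmetic stays in u64 units through _data.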
- __u64 _data[0]; - }; + struct jset_entry start[0]; + __u64 _data[]; } __packed __aligned(8); LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); @@ -2199,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); enum btree_id_flags { BTREE_ID_EXTENTS = BIT(0), BTREE_ID_SNAPSHOTS = BIT(1), - BTREE_ID_DATA = BIT(2), + BTREE_ID_SNAPSHOT_FIELD = BIT(2), + BTREE_ID_DATA = BIT(3), }; #define BCH_BTREE_IDS() \ @@ -2254,8 +2312,13 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_bucket_gens)) \ x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ - BIT_ULL(KEY_TYPE_set)) + x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_set)) \ + x(logged_ops, 17, 0, \ + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ + BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, @@ -2290,10 +2353,8 @@ struct bset { __le16 version; __le16 u64s; /* count of d[] in u64s */ - union { - struct bkey_packed start[0]; - __u64 _data[0]; - }; + struct bkey_packed start[0]; + __u64 _data[]; } __packed __aligned(8); LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index ee7ba70..abdb055 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -7,14 +7,6 @@ #include "bset.h" #include "util.h" -#undef EBUG_ON - -#ifdef DEBUG_BKEYS -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) -#endif - const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; void bch2_bkey_packed_to_binary_text(struct printbuf *out, @@ -135,7 +127,7 @@ static void pack_state_finish(struct pack_state *state, struct bkey_packed *k) { EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= k->_data + state->format->key_u64s); + EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); *state->p = state->w; } @@ -184,6 +176,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field) return v + offset; } +__always_inline +static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + + if (bits) { + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + } +} + __always_inline static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) { @@ -198,20 +212,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) if (fls64(v) > bits) return false; - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - + __set_inc_field(state, field, v); return true; } @@ -307,9 +308,14 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format, /** * bch2_bkey_pack_key -- pack just the key, not the value + * @out: packed result + * @in: key to pack + * @format: format of packed result + * + * Returns: true on success, false on failure */ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format 
*format) + const struct bkey_format *format) { struct pack_state state = pack_state_init(format, out); u64 *w = out->_data; @@ -335,9 +341,12 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, /** * bch2_bkey_unpack -- unpack the key and the value + * @b: btree node of @src key (for packed format) + * @dst: unpacked result + * @src: packed input */ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) + const struct bkey_packed *src) { __bkey_unpack_key(b, &dst->k, src); @@ -348,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, /** * bch2_bkey_pack -- pack the key and the value + * @dst: packed result + * @src: unpacked input + * @format: format of packed result + * + * Returns: true on success, false on failure */ -bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, - const struct bkey_format *format) +bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, + const struct bkey_format *format) { struct bkey_packed tmp; - if (!bch2_bkey_pack_key(&tmp, &in->k, format)) + if (!bch2_bkey_pack_key(&tmp, &src->k, format)) return false; - memmove_u64s((u64 *) out + format->key_u64s, - &in->v, - bkey_val_u64s(&in->k)); - memcpy_u64s_small(out, &tmp, format->key_u64s); + memmove_u64s((u64 *) dst + format->key_u64s, + &src->v, + bkey_val_u64s(&src->k)); + memcpy_u64s_small(dst, &tmp, format->key_u64s); return true; } @@ -380,19 +394,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ret = false; } - if (bits > state->bits) { - bits -= state->bits; - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - + __set_inc_field(state, field, v); return ret; } @@ -435,6 +437,24 @@ static bool bkey_packed_successor(struct bkey_packed *out, return false; } + +static bool bkey_format_has_too_big_fields(const struct bkey_format *f) +{ + for (unsigned i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) + return true; + } + + return false; +} #endif /* @@ -515,7 +535,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0); + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); } #endif @@ -583,8 +604,10 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) /* allow for extent merging: */ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { - ret.bits_per_field[BKEY_FIELD_SIZE] += 4; - bits += 4; + unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); + + ret.bits_per_field[BKEY_FIELD_SIZE] += b; + bits += b; } ret.key_u64s = DIV_ROUND_UP(bits, 64); @@ -604,40 +627,74 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } - EBUG_ON(bch2_bkey_format_validate(&ret)); +#ifdef CONFIG_BCACHEFS_DEBUG + { + struct printbuf buf = PRINTBUF; + + BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); + printbuf_exit(&buf); + } +#endif return ret; } -const char *bch2_bkey_format_validate(struct bkey_format *f) +int bch2_bkey_format_invalid(struct bch_fs *c, + struct bkey_format *f, + enum bkey_invalid_flags flags, + struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; - if (f->nr_fields != BKEY_NR_FIELDS) - return "incorrect number of fields"; + if (f->nr_fields != BKEY_NR_FIELDS) { + prt_printf(err, "incorrect number of fields: got %u, should be %u", + f->nr_fields, BKEY_NR_FIELDS); + return -BCH_ERR_invalid; + } /* * Verify that the packed format can't represent fields larger than the * unpacked format: */ for (i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) - : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) - return "field too large"; + if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) { + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, field_offset, unpacked_max); + return -BCH_ERR_invalid; + } + } bits += f->bits_per_field[i]; } - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) - return "incorrect key_u64s"; + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { + prt_printf(err, "incorrect key_u64s: got %u, should be %u", + f->key_u64s, DIV_ROUND_UP(bits, 64)); + return -BCH_ERR_invalid; + } - return NULL; + return 0; +} + +void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) +{ + prt_printf(out, "u64s %u fields ", f->key_u64s); + + for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%u:%llu", + f->bits_per_field[i], + le64_to_cpu(f->field_offset[i])); + } } /* diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index e81fb3e..831be01 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -9,6 +9,12 @@ #include "util.h" #include "vstructs.h" +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; + #if 0 /* @@ -46,7 +52,7 @@ struct bkey_s { static inline struct bkey_i *bkey_next(struct bkey_i *k) { - return (struct bkey_i *) (k->_data + k->k.u64s); + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); } #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) @@ -86,19 +92,15 @@ enum bkey_lr_packed { #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) -#define bkey_copy(_dst, _src) \ -do { \ - BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ - !type_is(_dst, struct bkey_packed *)); \ - BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ - !type_is(_src, struct bkey_packed *)); \ - EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ - (u64 *) (_dst) < (u64 *) (_src) + \ - ((struct bkey *) (_src))->u64s); \ - \ - memcpy_u64s_small((_dst), (_src), \ - ((struct bkey *) (_src))->u64s); \ -} while (0) +static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) +{ + memcpy_u64s_small(dst, src, src->u64s); +} + +static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) +{ + memcpy_u64s_small(dst, src, src->k.u64s); +} struct btree; @@ -391,7 +393,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format, } #define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) + ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) extern const struct bkey_format bch2_bkey_format_current; @@ -726,7 +728,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f) #error edit for your odd byteorder. 
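bch2_bkey_format_invalid() above computes the largest value a field can hold as ~((~0ULL << 1) << (bits - 1)) rather than the more obvious (1ULL << bits) - 1: a shift count equal to the operand width is undefined behaviour in C, and bits can be 64 here (the zero-width case is screened out by the surrounding ternary). Splitting the shift keeps every shift count in 0..63; __set_inc_field() in the same file uses the matching right-shift form, (v >> 1) >> (bits - 1), for the same reason. A self-contained check of the identity, assuming 1 <= bits <= 64:

	#include <assert.h>
	#include <stdint.h>

	/* all-ones value of the given width, with no undefined shift by 64 */
	static inline uint64_t max_for_bits(unsigned bits)
	{
		return ~((~0ULL << 1) << (bits - 1));
	}

	int main(void)
	{
		assert(max_for_bits(1)  == 1);
		assert(max_for_bits(8)  == 0xff);
		assert(max_for_bits(63) == 0x7fffffffffffffffULL);
		assert(max_for_bits(64) == ~0ULL);	/* (1ULL << 64) - 1 would be UB */
		return 0;
	}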
#endif -#define high_word(f, k) ((k)->_data + high_word_offset(f)) +#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) #define next_word(p) nth_word(p, 1) #define prev_word(p) nth_word(p, -1) @@ -769,6 +771,8 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -const char *bch2_bkey_format_validate(struct bkey_format *); +int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 90557f4..761f5e3 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "backpointers.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" @@ -10,9 +11,11 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "io_misc.h" #include "lru.h" #include "quota.h" #include "reflink.h" +#include "snapshot.h" #include "subvolume.h" #include "xattr.h" @@ -23,8 +26,8 @@ const char * const bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) +static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -37,24 +40,25 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, .key_invalid = deleted_key_invalid, \ }) -static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) +static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_bytes(k.k)) { - prt_printf(err, "incorrect value size (%zu != 0)", - bkey_val_bytes(k.k)); - return -BCH_ERR_invalid_bkey; - } - - return 0; + int ret = 0; + + bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, + bkey_val_size_nonzero, + "incorrect value size (%zu != 0)", + bkey_val_bytes(k.k)); +fsck_err: + return ret; } #define bch2_bkey_ops_error ((struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ }) -static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) +static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -68,8 +72,8 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, .key_invalid = empty_val_key_invalid, \ }) -static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) +static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -89,18 +93,6 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ }) -static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) -{ - if (bkey_val_bytes(k.k)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -BCH_ERR_invalid_bkey; - } 
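The validator rewrites in this file all funnel through bkey_fsck_err_on(), whose definition is not part of this hunk. Judging from the call sites — (condition, c, err, error-table ID, format, args...) with a shared 'int ret = 0' and 'fsck_err:' label in each caller — it plausibly behaves like the sketch below; the real macro presumably also accounts the error ID against the new superblock error counters (struct bch_sb_field_errors, added elsewhere in this patch), which this sketch omits:

	/*
	 * Hypothetical reconstruction of bkey_fsck_err_on() from its call
	 * sites; not the actual definition. _c and _err_type are unused in
	 * this sketch, but the real macro uses them to account the error.
	 */
	#define bkey_fsck_err_on(_cond, _c, _err, _err_type, ...)		\
	do {									\
		if (unlikely(_cond)) {						\
			prt_printf(_err, __VA_ARGS__);				\
			ret = -BCH_ERR_invalid_bkey;				\
			goto fsck_err;						\
		}								\
	} while (0)

Compared with the removed open-coded form (prt_printf() followed by an explicit return -BCH_ERR_invalid_bkey, as key_type_set_invalid did), this routes every check through one exit path and attaches a stable error ID to each message.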
- - return 0; -} - static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { bch2_key_resize(l.k, l.k->size + r.k->size); @@ -108,7 +100,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = key_type_set_invalid, \ + .key_invalid = empty_val_key_invalid, \ .key_merge = key_type_set_merge, \ }) @@ -119,7 +111,6 @@ const struct bkey_ops bch2_bkey_ops[] = { }; const struct bkey_ops bch2_bkey_null_ops = { - .min_val_size = U8_MAX, }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -127,84 +118,95 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + int ret = 0; - if (bkey_val_bytes(k.k) < ops->min_val_size) { - prt_printf(err, "bad val size (%zu < %u)", - bkey_val_bytes(k.k), ops->min_val_size); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, + bkey_val_size_too_small, + "bad val size (%zu < %u)", + bkey_val_bytes(k.k), ops->min_val_size); if (!ops->key_invalid) return 0; - return ops->key_invalid(c, k, flags, err); + ret = ops->key_invalid(c, k, flags, err); +fsck_err: + return ret; } static u64 bch2_key_types_allowed[] = { -#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, - BCH_BTREE_IDS() -#undef x [BKEY_TYPE_btree] = BIT_ULL(KEY_TYPE_deleted)| BIT_ULL(KEY_TYPE_btree_ptr)| BIT_ULL(KEY_TYPE_btree_ptr_v2), +#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, + BCH_BTREE_IDS() +#undef x }; +const char *bch2_btree_node_type_str(enum btree_node_type type) +{ + return type == BKEY_TYPE_btree ? 
"internal btree node" : bch2_btree_id_str(type - 1); +} + int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, enum bkey_invalid_flags flags, struct printbuf *err) { - if (k.k->u64s < BKEY_U64s) { - prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - if (flags & BKEY_INVALID_COMMIT && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { - prt_printf(err, "invalid key type for btree %s (%s)", - bch2_btree_ids[type], bch2_bkey_types[k.k->type]); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, + bkey_u64s_too_small, + "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - if (k.k->size == 0) { - prt_printf(err, "size == 0"); - return -BCH_ERR_invalid_bkey; - } + if (type >= BKEY_TYPE_NR) + return 0; - if (k.k->size > k.k->p.offset) { - prt_printf(err, "size greater than offset (%u > %llu)", - k.k->size, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, + bkey_invalid_type_for_btree, + "invalid key type for btree %s (%s)", + bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + bkey_fsck_err_on(k.k->size == 0, c, err, + bkey_extent_size_zero, + "size == 0"); + + bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, + bkey_extent_size_greater_than_offset, + "size greater than offset (%u > %llu)", + k.k->size, k.k->p.offset); } else { - if (k.k->size) { - prt_printf(err, "size != 0"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->size, c, err, + bkey_size_nonzero, + "size != 0"); } if (type != BKEY_TYPE_btree) { - if (!btree_type_has_snapshots((enum btree_id) type) && - k.k->p.snapshot) { - prt_printf(err, "nonzero snapshot"); - return -BCH_ERR_invalid_bkey; - } - - if (btree_type_has_snapshots((enum btree_id) type) && - !k.k->p.snapshot) { - prt_printf(err, "snapshot == 0"); - return -BCH_ERR_invalid_bkey; + enum btree_id btree = type - 1; + + if (btree_type_has_snapshots(btree)) { + bkey_fsck_err_on(!k.k->p.snapshot, c, err, + bkey_snapshot_zero, + "snapshot == 0"); + } else if (!btree_type_has_snapshot_field(btree)) { + bkey_fsck_err_on(k.k->p.snapshot, c, err, + bkey_snapshot_nonzero, + "nonzero snapshot"); + } else { + /* + * btree uses snapshot field but it's not required to be + * nonzero + */ } - if (bkey_eq(k.k->p, POS_MAX)) { - prt_printf(err, "key at POS_MAX"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, + bkey_at_pos_max, + "key at POS_MAX"); } - - return 0; +fsck_err: + return ret; } int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -216,20 +218,20 @@ int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, bch2_bkey_val_invalid(c, k, flags, err); } -int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, - struct printbuf *err) +int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, struct printbuf *err) { - if (bpos_lt(k.k->p, b->data->min_key)) { - prt_printf(err, "key before start of btree node"); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - if (bpos_gt(k.k->p, b->data->max_key)) { - prt_printf(err, "key past end of btree node"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, + bkey_before_start_of_btree_node, + "key 
before start of btree node"); - return 0; + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, + bkey_after_end_of_btree_node, + "key past end of btree node"); +fsck_err: + return ret; } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) @@ -367,7 +369,6 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, { const struct bkey_ops *ops; struct bkey uk; - struct bkey_s u; unsigned nr_compat = 5; int i; @@ -432,7 +433,9 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; - case 4: + case 4: { + struct bkey_s u; + if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { @@ -449,6 +452,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, if (ops->compat) ops->compat(btree_id, version, big_endian, write, u); break; + } default: BUG(); } diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index d7b6376..912adad 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -13,12 +13,6 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), -}; - /* * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. @@ -27,7 +21,7 @@ enum bkey_invalid_flags { * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -61,7 +55,8 @@ int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, enum bkey_invalid_flags, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, enum bkey_invalid_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, + struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); @@ -98,7 +93,6 @@ static inline int bch2_mark_key(struct btree_trans *trans, enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_PREJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -113,7 +107,6 @@ enum btree_update_flags { #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) @@ -125,16 +118,6 @@ enum btree_update_flags { #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_alloc)| \ - (1U << KEY_TYPE_alloc_v2)| \ - (1U << KEY_TYPE_alloc_v3)| \ - (1U << KEY_TYPE_alloc_v4)| \ - (1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)| \ - (1U << KEY_TYPE_inode_v2)| \ - (1U << KEY_TYPE_snapshot)) - static 
inline int bch2_trans_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index b9aa027..bcca9e7 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -106,7 +106,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, while ((k = sort_iter_peek(iter))) { if (!bkey_deleted(k) && !should_drop_next_key(iter)) { - bkey_copy(out, k); + bkey_p_copy(out, k); btree_keys_account_key_add(&nr, 0, out); out = bkey_p_next(out); } @@ -137,7 +137,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, continue; if (!transform) - bkey_copy(out, in); + bkey_p_copy(out, in); else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ? in_f : &bch2_bkey_format_current, in)) out->format = KEY_FORMAT_LOCAL_BTREE; @@ -191,7 +191,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); set_bkeyp_val_u64s(f, out, 0); } else { - bkey_copy(out, in); + bkey_p_copy(out, in); } out->needs_whiteout |= needs_whiteout; out = bkey_p_next(out); diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h index 79cf11d..7c0f0b1 100644 --- a/libbcachefs/bkey_sort.h +++ b/libbcachefs/bkey_sort.h @@ -9,14 +9,24 @@ struct sort_iter { struct sort_iter_set { struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; + } data[]; }; -static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) { iter->b = b; iter->used = 0; - iter->size = ARRAY_SIZE(iter->data); + iter->size = size; +} + +struct sort_iter_stack { + struct sort_iter iter; + struct sort_iter_set sets[MAX_BSETS + 1]; +}; + +static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) +{ + sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); } static inline void sort_iter_add(struct sort_iter *iter, diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index bcdf28f..bb73ba9 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -172,10 +172,10 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, printk(KERN_ERR "iter was:"); btree_node_iter_for_each(_iter, set) { - struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k2); printk(" [%zi %zi]", t - b->set, - k->_data - bset(b, t)->_data); + k2->_data - bset(b, t)->_data); } panic("\n"); } @@ -232,7 +232,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); - struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; #if 0 @@ -300,7 +300,8 @@ static unsigned bkey_float_byte_offset(unsigned idx) } struct ro_aux_tree { - struct bkey_float f[0]; + u8 nothing[0]; + struct bkey_float f[]; }; struct rw_aux_tree { @@ -476,7 +477,7 @@ static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, { unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); + return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); } static struct rw_aux_tree 
*rw_aux_tree(const struct btree *b, @@ -1010,8 +1011,8 @@ void bch2_bset_insert(struct btree *b, btree_keys_account_key_add(&b->nr, t - b->set, src); if (src->u64s != clobber_u64s) { - u64 *src_p = where->_data + clobber_u64s; - u64 *dst_p = where->_data + src->u64s; + u64 *src_p = (u64 *) where->_data + clobber_u64s; + u64 *dst_p = (u64 *) where->_data + src->u64s; EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < (int) clobber_u64s - src->u64s); @@ -1037,7 +1038,7 @@ void bch2_bset_delete(struct btree *b, unsigned clobber_u64s) { struct bset_tree *t = bset_tree_last(b); - u64 *src_p = where->_data + clobber_u64s; + u64 *src_p = (u64 *) where->_data + clobber_u64s; u64 *dst_p = where->_data; bch2_bset_verify_rw_aux_tree(b, t); @@ -1188,7 +1189,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_RO_AUX_TREE: return bset_search_tree(b, t, search, lossy_packed_search); default: - unreachable(); + BUG(); } } @@ -1268,9 +1269,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, } /** - * bch_btree_node_iter_init - initialize a btree node iterator, starting from a + * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a * given position * + * @iter: iterator to initialize + * @b: btree node to search + * @search: search key + * * Main entry point to the lookup code for individual btree nodes: * * NOTE: diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 13c88d9..84636ad 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -525,7 +525,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; bc->shrink.seeks = 4; - ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); + ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name); if (ret) goto err; @@ -795,7 +795,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ - if (trans && sync) + if (path && sync) bch2_trans_unlock_noassert(trans); bch2_btree_node_read(c, b, sync); @@ -832,12 +832,12 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) "btree node header doesn't match ptr\n" "btree %s level %u\n" "ptr: ", - bch2_btree_ids[b->c.btree_id], b->c.level); + bch2_btree_id_str(b->c.btree_id), b->c.level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_printf(&buf, "\nheader: btree %s level %llu\n" "min ", - bch2_btree_ids[BTREE_NODE_ID(b->data)], + bch2_btree_id_str(BTREE_NODE_ID(b->data)), BTREE_NODE_LEVEL(b->data)); bch2_bpos_to_text(&buf, b->data->min_key); @@ -934,7 +934,7 @@ retry: } if (unlikely(need_relock)) { - int ret = bch2_trans_relock(trans) ?: + ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { six_unlock_type(&b->c.lock, lock_type); @@ -965,11 +965,20 @@ retry: } /** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * + * @trans: btree transaction object + * @path: btree_path being traversed + * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) + * @level: level of btree node being looked up (0 == leaf node) + * @lock_type: SIX_LOCK_read or SIX_LOCK_intent + * @trace_ip: ip of caller of btree iterator code (i.e. 
caller of bch2_btree_iter_peek()) + * * The btree node will have either a read or a write lock held, depending on * the @write parameter. + * + * Returns: btree node or ERR_PTR() */ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, @@ -1016,28 +1025,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = six_lock_seq(&b->c.lock); - six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(trans); - - bch2_btree_node_wait_on_read(b); - - /* - * should_be_locked is not set on this path yet, so we need to - * relock it specifically: - */ - if (trans) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } - - if (!six_relock_type(&b->c.lock, lock_type, seq)) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); } prefetch(b->aux_data); @@ -1211,10 +1200,22 @@ wait_on_io: six_unlock_intent(&b->c.lock); } -void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - const struct btree *b) +const char *bch2_btree_id_str(enum btree_id btree) +{ + return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; +} + +void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + prt_printf(out, "%s level %u/%u\n ", + bch2_btree_id_str(b->c.btree_id), + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - const struct bkey_format *f = &b->format; struct bset_stats stats; memset(&stats, 0, sizeof(stats)); @@ -1228,9 +1229,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); + + prt_printf(out, + " format: "); + bch2_bkey_format_to_text(out, &b->format); - prt_printf(out, "\n" - " format: u64s %u fields %u %u %u %u %u\n" + prt_printf(out, " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" " sib u64s: %u, %u (merge threshold %u)\n" @@ -1238,12 +1243,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr unpacked keys %u\n" " floats %zu\n" " failed unpacked %zu\n", - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), btree_bytes(c) - sizeof(struct btree_node), diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 00c9b92..bfe1d74 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -123,8 +123,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) return bch2_btree_id_root(c, b->c.btree_id)->b; } -void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, - const struct btree *); +const char *bch2_btree_id_str(enum btree_id); +void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c 
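
Two mechanical conversions recur throughout the btree_gc.c diff that follows. First, fsck_err()/mustfix_fsck_err_on() calls gain a named error identifier (btree_node_topology_bad_min_key and friends), tying each fsck message to the new superblock error counters. Second, the on-stack transaction pattern is replaced by the pointer-returning API. A before/after sketch of the latter, with the loop bodies elided and hypothetical function names:

    /* old: the transaction object lived on the caller's stack */
    static void sketch_gc_pass_old(struct bch_fs *c)
    {
            struct btree_trans trans;

            bch2_trans_init(&trans, c, 0, 0);
            /* ... iteration and commits take &trans ... */
            bch2_trans_exit(&trans);
    }

    /* new: transactions are obtained from, and returned to, the core */
    static void sketch_gc_pass_new(struct bch_fs *c)
    {
            struct btree_trans *trans = bch2_trans_get(c);

            /* ... iteration and commits take trans directly ... */
            bch2_trans_put(trans);
    }
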
index 49e9822..c4922bd 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" @@ -43,7 +44,7 @@ static bool should_restart_for_topology_repair(struct bch_fs *c) { return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); + !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); } static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) @@ -94,15 +95,15 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + btree_node_topology_bad_min_key, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; @@ -121,14 +122,12 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); bch2_bpos_to_text(&buf2, node_end); - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, + if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, + btree_node_topology_bad_max_key, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -286,10 +285,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, cur->data->min_key), c, + btree_node_topology_overwritten_by_next_node, "btree node overwritten by next node at btree %s level %u:\n" " node %s\n" " next %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) { ret = DROP_PREV_NODE; goto out; @@ -297,10 +297,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, + btree_node_topology_bad_max_key, "btree node with incorrect max_key at btree %s level %u:\n" " node %s\n" " next %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); @@ -309,20 +310,22 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, if (mustfix_fsck_err_on(bpos_ge(expected_start, cur->data->max_key), c, + btree_node_topology_overwritten_by_prev_node, "btree node overwritten by prev node at btree %s level %u:\n" " prev %s\n" " node %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) { ret = 
DROP_THIS_NODE; goto out; } if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, + btree_node_topology_bad_min_key, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) ret = set_node_min(c, cur, expected_start); } @@ -343,10 +346,11 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b, bch2_bpos_to_text(&buf2, b->key.k.p); if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, + btree_node_topology_bad_max_key, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) { ret = set_node_max(c, child, b->key.k.p); if (ret) @@ -395,9 +399,10 @@ again: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(ret == -EIO, c, + btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); @@ -503,9 +508,10 @@ again: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (mustfix_fsck_err_on(!have_child, c, + btree_node_topology_interior_node_empty, "empty interior btree node at btree %s level %u\n" " %s", - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf)) ret = DROP_THIS_NODE; err: @@ -528,14 +534,12 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree *b; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) { + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) @@ -545,8 +549,8 @@ int bch2_check_topology(struct bch_fs *c) if (btree_node_fake(b)) continue; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(&trans, b); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(trans, b); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { @@ -555,7 +559,7 @@ int bch2_check_topology(struct bch_fs *c) } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -565,8 +569,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bkey_s_c *k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); - const union bch_extent_entry *entry; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry_c; struct extent_ptr_decoded p = { 0 }; bool do_update = false; struct printbuf buf = PRINTBUF; @@ -576,14 +580,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * XXX * use check_bucket_ref here */ - bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); if (!g->gen_valid && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + 
fsck_err(c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -600,7 +605,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (gen_cmp(p.ptr.gen, g->gen) > 0 && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + fsck_err(c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -621,7 +627,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + fsck_err(c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -632,7 +639,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + fsck_err(c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -646,6 +654,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (fsck_err_on(bucket_data_type(g->data_type) && bucket_data_type(g->data_type) != data_type, c, + ptr_bucket_data_type_mismatch, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -665,6 +674,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, c, + ptr_to_missing_stripe, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -673,6 +683,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id do_update = true; if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + ptr_to_incorrect_stripe, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -812,6 +823,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, goto err; if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + bkey_version_in_future, "key version number higher than recorded: %llu > %llu", k->k->version.lo, atomic64_read(&c->key_version))) @@ -969,9 +981,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b FSCK_CAN_FIX| FSCK_CAN_IGNORE| FSCK_NO_RATELIMIT, + btree_node_read_error, "Unreadable btree node at btree %s level %u:\n" " %s", - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level - 1, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && @@ -1026,6 +1039,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, + btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf)) { 
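
The pointer checks above order bucket generation numbers with gen_cmp() rather than plain comparison operators: generations are small wrapping counters, so "newer" is decided by signed distance, which is valid while the two values are within half the counter range. A sketch of that idea, assuming the helper's u8 width (the in-tree definition may differ in detail):

    /*
     * Wraparound-safe ordering of two 8-bit generation counters:
     * > 0 if a is newer than b, < 0 if older, 0 if equal.
     */
    static inline int sketch_gen_cmp(u8 a, u8 b)
    {
            return (s8) (a - b);
    }

This is the same trick used for TCP sequence numbers and jiffies comparisons: subtract in the counter's native width, then reinterpret the result as signed.
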
bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1035,6 +1049,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, + btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1067,15 +1082,13 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - if (initial) - trans.is_initial_gc = true; + trans->is_initial_gc = true; for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; @@ -1083,22 +1096,22 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) for (i = 0; i < BTREE_ID_NR && !ret; i++) ret = initial - ? bch2_gc_btree_init(&trans, ids[i], metadata_only) - : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + ? bch2_gc_btree_init(trans, ids[i], metadata_only) + : bch2_gc_btree(trans, ids[i], initial, metadata_only); for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { if (!bch2_btree_id_root(c, i)->alive) continue; ret = initial - ? bch2_gc_btree_init(&trans, i, metadata_only) - : bch2_gc_btree(&trans, i, initial, metadata_only); + ? bch2_gc_btree_init(trans, i, metadata_only) + : bch2_gc_btree(trans, i, initial, metadata_only); } if (ret < 0) bch_err_fn(c, ret); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1213,24 +1226,16 @@ static int bch2_gc_done(struct bch_fs *c, percpu_down_write(&c->mark_lock); -#define copy_field(_f, _msg, ...) \ +#define copy_field(_err, _f, _msg, ...) \ if (dst->_f != src->_f && \ (!verify || \ - fsck_err(c, _msg ": got %llu, should be %llu" \ + fsck_err(c, _err, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f -#define copy_stripe_field(_f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u", \ - iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f))) \ - dst->_f = src->_f -#define copy_dev_field(_f, _msg, ...) \ - copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) -#define copy_fs_field(_f, _msg, ...) \ - copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) +#define copy_dev_field(_err, _f, _msg, ...) \ + copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_err, _f, _msg, ...) 
\ + copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); @@ -1241,14 +1246,18 @@ static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, dev_usage_u64s()); - copy_dev_field(buckets_ec, "buckets_ec"); - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + copy_dev_field(dev_usage_buckets_wrong, + d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(dev_usage_sectors_wrong, + d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(dev_usage_fragmented_wrong, + d[i].fragmented, "%s fragmented", bch2_data_types[i]); } - }; + + copy_dev_field(dev_usage_buckets_ec_wrong, + buckets_ec, "buckets_ec"); + } { unsigned nr = fs_usage_u64s(c); @@ -1256,17 +1265,24 @@ static int bch2_gc_done(struct bch_fs *c, struct bch_fs_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); - copy_fs_field(hidden, "hidden"); - copy_fs_field(btree, "btree"); + copy_fs_field(fs_usage_hidden_wrong, + hidden, "hidden"); + copy_fs_field(fs_usage_btree_wrong, + btree, "btree"); if (!metadata_only) { - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); + copy_fs_field(fs_usage_data_wrong, + data, "data"); + copy_fs_field(fs_usage_cached_wrong, + cached, "cached"); + copy_fs_field(fs_usage_reserved_wrong, + reserved, "reserved"); + copy_fs_field(fs_usage_nr_inodes_wrong, + nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], + copy_fs_field(fs_usage_persistent_reserved_wrong, + persistent_reserved[i], "persistent_reserved[%i]", i); } @@ -1282,7 +1298,8 @@ static int bch2_gc_done(struct bch_fs *c, printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, e); - copy_fs_field(replicas[i], "%s", buf.buf); + copy_fs_field(fs_usage_replicas_wrong, + replicas[i], "%s", buf.buf); } } @@ -1418,6 +1435,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (c->opts.reconstruct_alloc || fsck_err_on(new.data_type != gc.data_type, c, + alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", iter->pos.inode, iter->pos.offset, @@ -1426,9 +1444,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, bch2_data_types[gc.data_type])) new.data_type = gc.data_type; -#define copy_bucket_field(_f) \ +#define copy_bucket_field(_errtype, _f) \ if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, \ + fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ @@ -1437,11 +1455,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new._f, gc._f)) \ new._f = gc._f; \ - copy_bucket_field(gen); - copy_bucket_field(dirty_sectors); - copy_bucket_field(cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); + copy_bucket_field(alloc_key_gen_wrong, + gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, + dirty_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, + cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, + stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, + stripe_redundancy); #undef copy_bucket_field if 
(!bch2_alloc_v4_cmp(*old, new)) @@ -1468,37 +1491,35 @@ fsck_err: static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for_each_member_device(ca, c, i) { - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + bch2_alloc_write_key(trans, &iter, k, metadata_only)); if (ret < 0) { - bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); percpu_ref_put(&ca->ref); break; } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret < 0 ? ret : 0; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bucket *g; @@ -1514,17 +1535,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - return -BCH_ERR_ENOMEM_gc_alloc_start; + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + goto err; } buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; rcu_assign_pointer(ca->buckets_gc, buckets); - }; - - bch2_trans_init(&trans, c, 0, 0); + } - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); @@ -1545,13 +1565,11 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) g->stripe_redundancy = a->stripe_redundancy; } } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - + bch2_trans_iter_exit(trans, &iter); +err: + bch2_trans_put(trans); if (ret) - bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); - + bch_err_fn(c, ret); return ret; } @@ -1574,7 +1592,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) g->dirty_sectors = 0; g->cached_sectors = 0; } - }; + } } static int bch2_gc_write_reflink_key(struct btree_trans *trans, @@ -1603,6 +1621,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, } if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + reflink_v_refcount_wrong, "reflink key has wrong refcount:\n" " %s\n" " should be %u", @@ -1626,7 +1645,7 @@ fsck_err: static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; size_t idx = 0; @@ -1635,23 +1654,23 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx)); c->reflink_gc_nr = 0; - bch2_trans_exit(&trans); + 
bch2_trans_put(trans); return ret; } static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; @@ -1660,10 +1679,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c, if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1681,9 +1700,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c, r->size = k.k->size; r->refcount = 0; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1728,7 +1747,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, if (bad) bch2_bkey_val_to_text(&buf, c, k); - if (fsck_err_on(bad, c, "%s", buf.buf)) { + if (fsck_err_on(bad, c, stripe_sector_count_wrong, + "%s", buf.buf)) { struct bkey_i_stripe *new; new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); @@ -1750,7 +1770,7 @@ fsck_err: static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -1758,15 +1778,15 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_stripes_key(&trans, &iter, k)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_stripes_key(trans, &iter, k)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1778,6 +1798,12 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) /** * bch2_gc - walk _all_ references to buckets, and recompute them: * + * @c: filesystem object + * @initial: are we in recovery? + * @metadata_only: are we just checking metadata references, or everything? 
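
The reflink and stripe passes above all share one loop shape: for_each_btree_key_commit() walks a btree and runs the per-key fixup inside its own transaction commit, so corrections land incrementally rather than in one giant transaction. Ignoring restart and error plumbing, the macro behaves roughly like the following sketch (a simplification, not the real expansion in btree_iter.h):

    /* approximate behavior of for_each_btree_key_commit() */
    for_each_btree_key(trans, iter, btree_id, start, iter_flags, k, ret) {
            ret =   do_key(trans, &iter, k) ?:
                    bch2_trans_commit(trans, disk_res, journal_seq, commit_flags);
            if (ret)
                    break;
    }

Here do_key stands for the statement argument (e.g. bch2_gc_write_reflink_key(...)), and the NULL, NULL arguments seen in the callers are the disk reservation and the journal-sequence out-parameter.
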
+ * + * Returns: 0 on success, or standard errcode on failure + * * Order matters here: * - Concurrent GC relies on the fact that we have a total ordering for * everything that GC walks - see gc_will_visit_node(), @@ -1946,7 +1972,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i int bch2_gc_gens(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; @@ -1964,22 +1990,20 @@ int bch2_gc_gens(struct bch_fs *c) trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); for_each_member_device(ca, c, i) { - struct bucket_gens *gens; + struct bucket_gens *gens = bucket_gens(ca); BUG_ON(ca->oldest_gen); - ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { percpu_ref_put(&ca->ref); ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } - gens = bucket_gens(ca); - for (b = gens->first_bucket; b < gens->nbuckets; b++) ca->oldest_gen[b] = gens->b[b]; @@ -1987,33 +2011,31 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (btree_type_has_ptrs(i)) { - struct btree_iter iter; - struct bkey_s_c k; - c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = for_each_btree_key_commit(&trans, iter, i, + + ret = for_each_btree_key_commit(trans, iter, i, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, - BTREE_INSERT_NOFAIL, - gc_btree_gens_key(&trans, &iter, k)); + BCH_TRANS_COMMIT_no_enospc, + gc_btree_gens_key(trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) goto err; } - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(&trans, &iter, k)); + BCH_TRANS_COMMIT_no_enospc, + bch2_alloc_write_oldest_gen(trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) goto err; @@ -2030,7 +2052,7 @@ err: ca->oldest_gen = NULL; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); up_read(&c->gc_lock); mutex_unlock(&c->gc_gens_lock); return ret; @@ -2085,7 +2107,7 @@ static int bch2_gc_thread(void *arg) ret = bch2_gc_gens(c); #endif if (ret < 0) - bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); debug_check_no_locks_held(); } @@ -2115,7 +2137,7 @@ int bch2_gc_thread_start(struct bch_fs *c) p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) { - bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); + bch_err_fn(c, PTR_ERR(p)); return PTR_ERR(p); } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index b45e382..607575f 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BTREE_GC_H #define _BCACHEFS_BTREE_GC_H +#include "bkey.h" #include "btree_types.h" int bch2_check_topology(struct bch_fs *); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index c049876..1f73ee0 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -14,9 +14,10 @@ #include "debug.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" 
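
The btree_io.c changes that follow retire the file-private enum btree_err_type in favor of real error codes: __btree_err() now takes a -BCH_ERR_btree_node_read_err_* value plus a bch_sb_error_id, so read-time repair policy and superblock error accounting share one classification. The categories keep the meaning the old enum's comments gave them; as a sketch:

    /* How the read path reacts to each class (simplified): */
    switch (ret) {
    case -BCH_ERR_btree_node_read_err_fixable:
            /* repairable locally, after the checksum check:
             * no need to try another replica */
            break;
    case -BCH_ERR_btree_node_read_err_want_retry:
            /* repairable if we must, but prefer another replica */
            break;
    case -BCH_ERR_btree_node_read_err_must_retry:
            /* need another replica, else the whole node is bad */
            break;
    case -BCH_ERR_btree_node_read_err_bad_node:
    case -BCH_ERR_btree_node_read_err_incompatible:
            /* node is unusable as-is; may escalate to topology
             * repair or fail the mount */
            break;
    }
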
+#include "recovery.h" #include "super-io.h" #include "trace.h" @@ -105,8 +106,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, vpfree(p, size); } -static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, - bool *used_mempool) +static void *btree_bounce_alloc(struct bch_fs *c, size_t size, + bool *used_mempool) { unsigned flags = memalloc_nofs_save(); void *p; @@ -114,7 +115,7 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, BUG_ON(size > btree_bytes(c)); *used_mempool = false; - p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); + p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -122,8 +123,6 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, memalloc_nofs_restore(flags); return p; } -#define btree_bounce_alloc(_c, _size, _used_mempool) \ - alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) static void sort_bkey_ptrs(const struct btree *bt, struct bkey_packed **ptrs, unsigned nr) @@ -185,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) k = new_whiteouts; while (ptrs != ptrs_end) { - bkey_copy(k, *ptrs); + bkey_p_copy(k, *ptrs); k = bkey_p_next(k); ptrs++; } @@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) n = bkey_p_next(k); if (!bkey_deleted(k)) { - bkey_copy(out, k); + bkey_p_copy(out, k); out = bkey_p_next(out); } else { BUG_ON(k->needs_whiteout); @@ -293,7 +292,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bool filter_whiteouts) { struct btree_node *out; - struct sort_iter sort_iter; + struct sort_iter_stack sort_iter; struct bset_tree *t; struct bset *start_bset = bset(b, &b->set[start_idx]); bool used_mempool = false; @@ -302,13 +301,13 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bool sorting_entire_node = start_idx == 0 && end_idx == b->nsets; - sort_iter_init(&sort_iter, b); + sort_iter_stack_init(&sort_iter, b); for (t = b->set + start_idx; t < b->set + end_idx; t++) { u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, btree_bkey_first(b, t), btree_bkey_last(b, t)); } @@ -321,7 +320,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); @@ -337,7 +336,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_bset->journal_seq = cpu_to_le64(seq); if (sorting_entire_node) { - unsigned u64s = le16_to_cpu(out->keys.u64s); + u64s = le16_to_cpu(out->keys.u64s); BUG_ON(bytes != btree_bytes(c)); @@ -411,8 +410,6 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_verify_btree_nr_keys(dst); } -#define SORT_CRIT (4096 / sizeof(u64)) - /* * We're about to add another bset to the btree node, so if there's currently * too many bsets - sort some of them together: @@ -513,16 +510,6 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bch2_trans_node_reinit_iter(trans, b); } -static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) -{ - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_ids[b->c.btree_id], - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -} - static void btree_err_msg(struct printbuf 
*out, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, @@ -535,7 +522,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); - btree_pos_to_text(out, c, b); + bch2_btree_pos_to_text(out, c, b); prt_printf(out, "\n node offset %u", b->written); if (i) @@ -543,42 +530,19 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -enum btree_err_type { - /* - * We can repair this locally, and we're after the checksum check so - * there's no need to try another replica: - */ - BTREE_ERR_FIXABLE, - /* - * We can repair this if we have to, but we should try reading another - * replica if we can: - */ - BTREE_ERR_WANT_RETRY, - /* - * Read another replica if we have one, otherwise consider the whole - * node bad: - */ - BTREE_ERR_MUST_RETRY, - BTREE_ERR_BAD_NODE, - BTREE_ERR_INCOMPATIBLE, -}; - -enum btree_validate_ret { - BTREE_RETRY_READ = 64, -}; - -static int __btree_err(enum btree_err_type type, +__printf(9, 10) +static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, int write, bool have_retry, + enum bch_sb_error_id err_type, const char *fmt, ...) { struct printbuf out = PRINTBUF; va_list args; - int ret = -BCH_ERR_fsck_fix; btree_err_msg(&out, c, ca, b, i, b->written, write); @@ -594,27 +558,32 @@ static int __btree_err(enum btree_err_type type, goto out; } - if (!have_retry && type == BTREE_ERR_WANT_RETRY) - type = BTREE_ERR_FIXABLE; - if (!have_retry && type == BTREE_ERR_MUST_RETRY) - type = BTREE_ERR_BAD_NODE; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) + ret = -BCH_ERR_btree_node_read_err_fixable; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) + ret = -BCH_ERR_btree_node_read_err_bad_node; + + if (ret != -BCH_ERR_btree_node_read_err_fixable) + bch2_sb_error_count(c, err_type); - switch (type) { - case BTREE_ERR_FIXABLE: - mustfix_fsck_err(c, "%s", out.buf); + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: + ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore) + goto fsck_err; ret = -BCH_ERR_fsck_fix; break; - case BTREE_ERR_WANT_RETRY: - case BTREE_ERR_MUST_RETRY: + case -BCH_ERR_btree_node_read_err_want_retry: + case -BCH_ERR_btree_node_read_err_must_retry: bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = BTREE_RETRY_READ; break; - case BTREE_ERR_BAD_NODE: + case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); bch2_topology_error(c); ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; break; - case BTREE_ERR_INCOMPATIBLE: + case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; @@ -627,12 +596,17 @@ fsck_err: return ret; } -#define btree_err(type, c, ca, b, i, msg, ...) \ +#define btree_err(type, c, ca, b, i, _err_type, msg, ...) 
\ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + BCH_FSCK_ERR_##_err_type, \ + msg, ##__VA_ARGS__); \ \ - if (_ret != -BCH_ERR_fsck_fix) \ + if (_ret != -BCH_ERR_fsck_fix) { \ + ret = _ret; \ goto fsck_err; \ + } \ + \ *saw_error = true; \ }) @@ -646,9 +620,6 @@ __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { struct bset_tree *t; - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; for_each_bset(b, t) { struct bset *i = bset(b, t); @@ -684,6 +655,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); BUG_ON(bpos_gt(k.k->p, b->data->max_key)); @@ -696,19 +670,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); - const char *err; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; btree_err_on(!bch2_version_compatible(version), - BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, + c, ca, b, i, + btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); if (btree_err_on(version < c->sb.version_min, - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { mutex_lock(&c->sb_lock); @@ -719,7 +696,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { mutex_lock(&c->sb_lock); @@ -729,11 +708,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, + c, ca, b, i, + btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_past_end_of_btree_node, "bset past end of btree node")) { i->u64s = 0; ret = 0; @@ -741,12 +724,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(offset && !i->u64s, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_empty, "empty bset"); - btree_err_on(BSET_OFFSET(i) && - BSET_OFFSET(i) != offset, - BTREE_ERR_WANT_RETRY, c, ca, b, i, + btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_wrong_sector_offset, "bset at wrong sector offset"); if (!offset) { @@ -760,16 +746,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + bset_bad_seq, "incorrect sequence number (wrong btree 
node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_level, "incorrect level"); if (!write) @@ -786,7 +778,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -795,7 +789,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -804,10 +800,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); - err = bch2_bkey_format_validate(&bn->format); - btree_err_on(err, - BTREE_ERR_BAD_NODE, c, ca, b, i, - "invalid bkey format: %s", err); + btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), + -BCH_ERR_btree_node_read_err_bad_node, + c, ca, b, i, + btree_node_bad_format, + "invalid bkey format: %s\n %s", buf1.buf, + (printbuf_reset(&buf2), + bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); + printbuf_reset(&buf1); compat_bformat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, @@ -826,7 +826,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, struct printbuf *err) { return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: (rw == WRITE ? 
bch2_bkey_val_invalid(c, k, READ, err) : 0); } @@ -847,14 +847,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey tmp; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -873,12 +877,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, printbuf_reset(&buf); if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid bkey: "); bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bad_bkey, + "invalid bkey: %s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -902,7 +908,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_dump_bset(c, b, i, 0); - if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { + if (btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_out_of_order, + "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -931,7 +940,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; @@ -941,51 +949,65 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->written = 0; iter = mempool_alloc(&c->fill_iter, GFP_NOFS); - sort_iter_init(iter, b); - iter->size = (btree_blocks(c) + 1) * 2; + sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); if (bch2_meta_read_fault("btree")) - btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + btree_err(-BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); - btree_err_on(!b->data->keys.seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad btree header: seq 0"); - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_seq, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); + } else { + btree_err_on(!b->data->keys.seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_seq, + "bad btree 
header: seq 0"); } while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; struct nonce nonce; - struct bch_csum csum; bool first = !b->written; + bool csum_bad; if (!b->written) { i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, ca, b, i, - "unknown checksum type %llu", - BSET_CSUM_TYPE(i)); + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - btree_err_on(bch2_crc_cmp(csum, b->data->csum), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + csum_bad = bch2_crc_cmp(b->data->csum, + csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_bad_csum, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -995,7 +1017,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_incompatible, + c, NULL, b, NULL, + btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -1007,15 +1031,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, ca, b, i, - "unknown checksum type %llu", - BSET_CSUM_TYPE(i)); + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - btree_err_on(bch2_crc_cmp(csum, bne->csum), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + csum_bad = bch2_crc_cmp(bne->csum, + csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_bad_csum, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -1048,12 +1078,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, true); btree_err_on(blacklisted && first, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); @@ -1066,13 +1100,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, 0), vstruct_last(i)); - - nonblacklisted_written = b->written; } if (ptr_written) { btree_err_on(b->written < ptr_written, - BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, NULL, + btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); } else { @@ -1083,20 +1117,10 @@ int bch2_btree_node_read_done(struct bch_fs 
*c, struct bch_dev *ca, !bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), true), - BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, NULL, + btree_node_bset_after_end, "found bset signature after last bset"); - - /* - * Blacklisted bsets are those that were written after the most recent - * (flush) journal write. Since there wasn't a flush, they may not have - * made it to all devices - which means we shouldn't write new bsets - * after them, as that could leave a gap and then reads from that device - * wouldn't find all the bsets in that btree node - which means it's - * important that we start writing new bsets after the most recent _non_ - * blacklisted bset: - */ - blacklisted_written = b->written; - b->written = nonblacklisted_written; } sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); @@ -1137,7 +1161,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bad_bkey, + "%s", buf.buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1164,9 +1191,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_rw) + if (ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } @@ -1177,7 +1204,8 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) retry_read = 1; else set_btree_node_read_error(b); @@ -1216,8 +1244,9 @@ static void btree_node_read_work(struct work_struct *work) } start: printbuf_reset(&buf); - btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); @@ -1247,19 +1276,17 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); - printbuf_exit(&buf); if (saw_error && !btree_node_read_error(b)) { - struct printbuf buf = PRINTBUF; - + printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", - __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); - printbuf_exit(&buf); + __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); bch2_btree_node_rewrite_async(c, b); } + printbuf_exit(&buf); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1363,14 +1390,20 @@ static void btree_node_read_all_replicas_done(struct closure *cl) } written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, 
ra->buf[i]), - BTREE_ERR_FIXABLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - BTREE_ERR_FIXABLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; @@ -1565,7 +1598,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct printbuf buf = PRINTBUF; prt_str(&buf, "btree node read error: no device to read from\n at "); - btree_pos_to_text(&buf, c, b); + bch2_btree_pos_to_text(&buf, c, b); bch_err(c, "%s", buf.buf); if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && @@ -1669,8 +1702,7 @@ err: int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, const struct bkey_i *k, unsigned level) { - return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); - + return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, @@ -1732,15 +1764,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) static void btree_node_write_done(struct bch_fs *c, struct btree *b) { - struct btree_trans trans; - - bch2_trans_init(&trans, c, 0, 0); + struct btree_trans *trans = bch2_trans_get(c); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); } static void btree_node_write_work(struct work_struct *work) @@ -1769,11 +1799,11 @@ static void btree_node_write_work(struct work_struct *work) } } else { ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, + bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_reclaim| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW, + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw, !wbio->wbio.failed.nr)); if (ret) goto err; @@ -1803,7 +1833,8 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); @@ -1874,7 +1905,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - struct sort_iter sort_iter; + struct sort_iter_stack sort_iter; struct nonce nonce; unsigned bytes_to_write, sectors_to_write, bytes, u64s; u64 seq = 0; @@ -1947,7 +1978,7 @@ do_write: bch2_sort_whiteouts(c, b); - sort_iter_init(&sort_iter, b); + sort_iter_stack_init(&sort_iter, b); bytes = !b->written ? 
sizeof(struct btree_node) @@ -1962,7 +1993,7 @@ do_write: continue; bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, btree_bkey_first(b, t), btree_bkey_last(b, t)); seq = max(seq, le64_to_cpu(i->journal_seq)); @@ -1991,14 +2022,14 @@ do_write: i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, unwritten_whiteouts_start(c, b), unwritten_whiteouts_end(c, b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; - u64s = bch2_sort_keys(i->start, &sort_iter, false); + u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); le16_add_cpu(&i->u64s, u64s); BUG_ON(!b->written && i->u64s != b->data->keys.u64s); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 0cadf65..7e03dd7 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -7,7 +7,7 @@ #include "btree_locking.h" #include "checksum.h" #include "extents.h" -#include "io_types.h" +#include "io_write_types.h" struct bch_fs; struct btree_write; @@ -143,8 +143,8 @@ enum btree_write_flags { __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, __BTREE_WRITE_ALREADY_STARTED, }; -#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) -#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) +#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) +#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index dfb77b2..3128695 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -5,6 +5,7 @@ #include "bkey_buf.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" @@ -12,9 +13,8 @@ #include "error.h" #include "extents.h" #include "journal.h" -#include "recovery.h" #include "replicas.h" -#include "subvolume.h" +#include "snapshot.h" #include "trace.h" #include @@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && - !btree_type_has_snapshots(iter->btree_id)); + !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) bch2_btree_path_verify(trans, iter->update_path); @@ -362,7 +362,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, bch2_bpos_to_text(&buf, pos); panic("not locked: %s %s%s\n", - bch2_btree_ids[id], buf.buf, + bch2_btree_id_str(id), buf.buf, key_cache ? 
" cached" : ""); } @@ -488,7 +488,6 @@ fixup_done: if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && b->c.level) { - struct bset_tree *t; struct bkey_packed *k, *k2, *p; k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -689,7 +688,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) if (t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, t); } bch2_btree_path_level_init(trans, path, b); @@ -764,7 +763,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(trans, path, path->level, lock_type); + mark_btree_node_locked(trans, path, path->level, + (enum btree_node_locked_type) lock_type); bch2_btree_path_level_init(trans, path, b); return 0; } @@ -936,7 +936,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - mark_btree_node_locked(trans, path, level, lock_type); + mark_btree_node_locked(trans, path, level, + (enum btree_node_locked_type) lock_type); path->level = level; bch2_btree_path_level_init(trans, path, b); @@ -1008,7 +1009,7 @@ retry_all: /* * We used to assert that all paths had been traversed here * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since - * path->Should_be_locked is not set yet, we we might have unlocked and + * path->should_be_locked is not set yet, we might have unlocked and * then failed to relock a path - that's fine. */ err: @@ -1108,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, if (unlikely(ret)) goto out; + if (unlikely(!trans->srcu_held)) + bch2_trans_srcu_lock(trans); + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -1210,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, unsigned long ip, int cmp) { - unsigned level = path->level; - bch2_trans_verify_not_in_restart(trans); EBUG_ON(!path->ref); @@ -1227,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - level = btree_path_up_until_good_node(trans, path, cmp); + unsigned level = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, level)) { struct btree_path_level *l = &path->l[level]; @@ -1341,14 +1343,14 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p __bch2_path_free(trans, path); } -void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) { panic("trans->restart_count %u, should be %u, last restarted by %pS\n", trans->restart_count, restart_count, (void *) trans->last_begin_ip); } -void bch2_trans_in_restart_error(struct btree_trans *trans) +void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { panic("in transaction restart: %s, last restarted by %pS\n", bch2_err_str(trans->restarted), @@ -1370,7 +1372,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) struct bkey_s_c old = { &i->old_k, i->old_v }; prt_printf(buf, "update: btree=%s cached=%u %pS", - bch2_btree_ids[i->btree_id], + 
bch2_btree_id_str(i->btree_id), i->cached, (void *) i->ip_allocated); prt_newline(buf); @@ -1386,7 +1388,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) trans_for_each_wb_update(trans, wb) { prt_printf(buf, "update: btree=%s wb=1 %pS", - bch2_btree_ids[wb->btree], + bch2_btree_id_str(wb->btree), (void *) i->ip_allocated); prt_newline(buf); @@ -1415,7 +1417,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) path->idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', - bch2_btree_ids[path->btree_id], + bch2_btree_id_str(path->btree_id), path->level); bch2_bpos_to_text(out, path->pos); @@ -1493,7 +1495,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) static noinline void btree_path_overflow(struct btree_trans *trans) { bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); + panic("trans path overflow\n"); } static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, @@ -1522,6 +1524,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; + path->alloc_seq++; btree_path_list_add(trans, pos, path); trans->paths_sorted = false; @@ -1597,7 +1600,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > path->locks_want) - bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); return path; } @@ -1794,23 +1797,15 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_eq(pos, SPOS_MAX) - : bkey_eq(pos, SPOS_MAX)); - - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; - } else { - if (!btree_path_node(iter->path, iter->path->level)) - return true; + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); - iter->advanced = true; - return false; - } + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -2046,8 +2041,12 @@ out: } /** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position + * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys less than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) { @@ -2057,7 +2056,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); if (iter->update_path) { @@ -2184,102 +2182,11 @@ end: } /** - * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal - * to iterator's current position, returning keys from every level of the btree. 
- * For keys at different levels of the btree that compare equal, the key from - * the lower level (leaf) is returned first. - */ -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - struct bkey_s_c k; - int ret; - - EBUG_ON(iter->path->cached); - bch2_btree_iter_verify(iter); - BUG_ON(iter->path->level < iter->min_depth); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); - goto out_no_locked; - } - - /* Already at end? */ - if (!btree_path_node(iter->path, iter->path->level)) { - k = bkey_s_c_null; - goto out_no_locked; - } - - k = btree_path_level_peek_all(trans->c, - &iter->path->l[iter->path->level], &iter->k); - - /* Check if we should go up to the parent node: */ - if (!k.k || - (iter->advanced && - bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { - iter->pos = path_l(iter->path)->b->key.k.p; - btree_path_set_level_up(trans, iter->path); - iter->advanced = false; - continue; - } - - /* - * Check if we should go back down to a leaf: - * If we're not in a leaf node, we only return the current key - * if it exactly matches iter->pos - otherwise we first have to - * go back to the leaf: - */ - if (iter->path->level != iter->min_depth && - (iter->advanced || - !k.k || - !bpos_eq(iter->pos, k.k->p))) { - btree_path_set_level_down(trans, iter->path, iter->min_depth); - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - /* Check if we should go to the next key: */ - if (iter->path->level == iter->min_depth && - iter->advanced && - k.k && - bpos_eq(iter->pos, k.k->p)) { - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - if (iter->advanced && - iter->path->level == iter->min_depth && - !bpos_eq(k.k->p, iter->pos)) - iter->advanced = false; - - BUG_ON(iter->advanced); - BUG_ON(!k.k); - break; - } - - iter->pos = k.k->p; - btree_path_set_should_be_locked(iter->path); -out_no_locked: - bch2_btree_iter_verify(iter); - - return k; -} - -/** - * bch2_btree_iter_next: returns first key greater than iterator's current + * bch2_btree_iter_next() - returns first key greater than iterator's current * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { @@ -2290,8 +2197,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) } /** - * bch2_btree_iter_peek_prev: returns first key less than or equal to + * bch2_btree_iter_peek_prev() - returns first key less than or equal to * iterator's current position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { @@ -2414,8 +2324,11 @@ out_no_locked: } /** - * bch2_btree_iter_prev: returns first key less than iterator's current + * bch2_btree_iter_prev() - returns first key less than iterator's current * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). 
*/ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { @@ -2434,7 +2347,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ @@ -2722,7 +2634,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) void bch2_trans_iter_init_outlined(struct btree_trans *trans, struct btree_iter *iter, - unsigned btree_id, struct bpos pos, + enum btree_id btree_id, struct bpos pos, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, @@ -2738,9 +2650,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - flags |= BTREE_ITER_NOT_EXTENTS; - flags |= __BTREE_ITER_ALL_SNAPSHOTS; - flags |= BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_NOT_EXTENTS; + flags |= __BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_ALL_SNAPSHOTS; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, __bch2_btree_iter_flags(trans, btree_id, flags), @@ -2812,24 +2724,44 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) +static inline void check_srcu_held_too_long(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct btree_path *path; + WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), + "btree trans held srcu lock (delaying memory reclaim) for %lu seconds", + (jiffies - trans->srcu_lock_time) / HZ); +} - trans_for_each_path(trans, path) - if (path->cached && !btree_node_locked(path, 0)) - path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); +void bch2_trans_srcu_unlock(struct btree_trans *trans) +{ + if (trans->srcu_held) { + struct bch_fs *c = trans->c; + struct btree_path *path; - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; + trans_for_each_path(trans, path) + if (path->cached && !btree_node_locked(path, 0)) + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); + + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + trans->srcu_held = false; + } +} + +void bch2_trans_srcu_lock(struct btree_trans *trans) +{ + if (!trans->srcu_held) { + trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; + } } /** * bch2_trans_begin() - reset a transaction after an interrupted attempt * @trans: transaction to reset * + * Returns: current restart counter, to be used with trans_was_restarted() + * * While iterating over nodes or updating nodes an attempt to lock a btree node * may return BCH_ERR_transaction_restart when the trylock fails. When this * occurs, bch2_trans_begin() should be called and the transaction retried.
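A minimal sketch of the retry idiom that comment describes, not part of the patch itself; it uses the bch2_trans_get()/bch2_trans_put() lifecycle this release introduces, and example_read_inode() plus the choice of BTREE_ID_inodes are illustrative only:

    int example_read_inode(struct bch_fs *c, struct bpos pos)
    {
            struct btree_trans *trans = bch2_trans_get(c);
            struct btree_iter iter;
            struct bkey_s_c k;
            int ret;
    retry:
            /* reset the transaction; any held locks were dropped on restart */
            bch2_trans_begin(trans);

            bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
            k = bch2_btree_iter_peek_slot(&iter);
            ret = bkey_err(k);
            /* ... inspect k here, while the iterator still pins it ... */
            bch2_trans_iter_exit(trans, &iter);

            if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                    goto retry;

            bch2_trans_put(trans);
            return ret;
    }

In practice callers wrap the body in lockrestart_do() or commit_do() rather than open-coding the goto, but the control flow is the same.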
@@ -2875,8 +2807,9 @@ u32 bch2_trans_begin(struct btree_trans *trans) } trans->last_begin_time = now; - if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) - bch2_trans_reset_srcu_lock(trans); + if (unlikely(trans->srcu_held && + time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) + bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; if (trans->restarted) { @@ -2887,26 +2820,23 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) +static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) { - size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; - size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; - void *p = NULL; + struct btree_trans *trans; - BUG_ON(trans->used_mempool); + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) + return trans; + } -#ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); -#endif - if (!p) - p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); /* - * paths need to be zeroed, bch2_check_for_deadlock looks at paths in - * other threads + * paths need to be zeroed, bch2_check_for_deadlock looks at + * paths in other threads */ - - trans->paths = p; p += paths_bytes; - trans->updates = p; p += updates_bytes; + memset(&trans->paths, 0, sizeof(trans->paths)); + return trans; } const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; @@ -2926,13 +2856,16 @@ unsigned bch2_trans_get_fn_idx(const char *fn) return i; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) +struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { + struct btree_trans *trans; struct btree_transaction_stats *s; bch2_assert_btree_nodes_not_locked(); + trans = bch2_trans_alloc(c); + memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) @@ -2944,8 +2877,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); closure_init_stack(&trans->ref); - bch2_trans_alloc_paths(trans, c); - s = btree_trans_stats(trans); if (s && s->max_mem) { unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); @@ -2965,8 +2896,9 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ trans->wb_updates_size = s->wb_updates_size; } - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; + trans->srcu_held = true; if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { struct btree_trans *pos; @@ -2991,6 +2923,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ list_add_done: seqmutex_unlock(&c->btree_trans_lock); } + + return trans; } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -3008,14 +2942,14 @@ leaked: trans_for_each_path(trans, path) if (path->ref) printk(KERN_ERR " btree %s %pS\n", - bch2_btree_ids[path->btree_id], + bch2_btree_id_str(path->btree_id), (void *) path->ip_allocated); /* Be noisy about this: */ bch2_fatal_error(c); #endif } -void bch2_trans_exit(struct btree_trans *trans) +void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { struct 
btree_insert_entry *i; @@ -3041,9 +2975,10 @@ void bch2_trans_exit(struct btree_trans *trans) check_btree_paths_leaked(trans); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - - bch2_journal_preres_put(&c->journal, &trans->journal_preres); + if (trans->srcu_held) { + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + } kfree(trans->extra_journal_entries.data); @@ -3061,18 +2996,11 @@ void bch2_trans_exit(struct btree_trans *trans) else kfree(trans->mem); -#ifdef __KERNEL__ - /* - * Userspace doesn't have a real percpu implementation: - */ - trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); -#endif - - if (trans->paths) - mempool_free(trans->paths, &c->btree_paths_pool); - - trans->mem = (void *) 0x1; - trans->paths = (void *) 0x1; + /* Userspace doesn't have a real percpu implementation: */ + if (IS_ENABLED(__KERNEL__)) + trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); + if (trans) + mempool_free(trans, &c->btree_trans_pool); } static void __maybe_unused @@ -3090,7 +3018,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, prt_tab(out); prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', - b->level, bch2_btree_ids[b->btree_id]); + b->level, bch2_btree_id_str(b->btree_id)); bch2_bpos_to_text(out, btree_node_pos(b)); prt_tab(out); @@ -3120,7 +3048,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) path->idx, path->cached ? 'c' : 'b', path->level, - bch2_btree_ids[path->btree_id]); + bch2_btree_id_str(path->btree_id)); bch2_bpos_to_text(out, path->pos); prt_newline(out); @@ -3150,6 +3078,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) void bch2_fs_btree_iter_exit(struct bch_fs *c) { struct btree_transaction_stats *s; + struct btree_trans *trans; + int cpu; + + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); + if (trans) + panic("%s leaked btree_trans\n", trans->fn); + + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); + free_percpu(c->btree_trans_bufs); for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); @@ -3161,13 +3100,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_paths_pool); + mempool_exit(&c->btree_trans_pool); } int bch2_fs_btree_iter_init(struct bch_fs *c) { struct btree_transaction_stats *s; - unsigned nr = BTREE_ITER_MAX; int ret; for (s = c->btree_transaction_stats; @@ -3180,9 +3118,12 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); seqmutex_init(&c->btree_trans_lock); - ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, - sizeof(struct btree_path) * nr + - sizeof(struct btree_insert_entry) * nr) ?: + c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); + if (!c->btree_trans_bufs) + return -ENOMEM; + + ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, + sizeof(struct btree_trans)) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX) ?: init_srcu_struct(&c->btree_trans_barrier); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index c472aa8..e5b989a 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -221,6 +221,22 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum 
btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); @@ -258,14 +274,17 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +void bch2_trans_unlock_long(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); -static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) +static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { - return restart_count != trans->restart_count; + return restart_count != trans->restart_count + ? -BCH_ERR_transaction_restart_nested + : 0; } -void bch2_trans_restart_error(struct btree_trans *, u32); +void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) @@ -274,7 +293,7 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, bch2_trans_restart_error(trans, restart_count); } -void bch2_trans_in_restart_error(struct btree_trans *); +void __noreturn bch2_trans_in_restart_error(struct btree_trans *); static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) { @@ -329,8 +348,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); - static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { return bch2_btree_iter_peek_upto(iter, SPOS_MAX); @@ -389,15 +406,12 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (flags & BTREE_ITER_ALL_LEVELS) - flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && - btree_node_type_is_extents(btree_id)) + btree_id_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && - !btree_type_has_snapshots(btree_id)) + !btree_type_has_snapshot_field(btree_id)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && @@ -447,7 +461,7 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, } void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, - unsigned, struct bpos, unsigned); + enum btree_id, struct bpos, unsigned); static inline void bch2_trans_iter_init(struct btree_trans *trans, struct btree_iter *iter, @@ -561,6 +575,9 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ KEY_TYPE_##_type, sizeof(*_val), _val) +void bch2_trans_srcu_unlock(struct btree_trans *); +void bch2_trans_srcu_lock(struct btree_trans *);
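A rough usage sketch for the iterator helpers declared above, not from the patch; example_count_extents() is a hypothetical helper, and a real caller would wrap it in the lockrestart_do() macro defined just below so transaction restarts are retried:

    static int example_count_extents(struct btree_trans *trans, u64 inum, u64 *nr)
    {
            struct btree_iter iter;
            struct bkey_s_c k;
            int ret = 0;

            bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                                 POS(inum, 0), 0);

            /* walk [inum:0, inum:U64_MAX]; stop at end of range or on error */
            while ((k = bch2_btree_iter_peek_upto(&iter, POS(inum, U64_MAX))).k &&
                   !(ret = bkey_err(k))) {
                    (*nr)++;
                    bch2_btree_iter_advance(&iter);
            }

            bch2_trans_iter_exit(trans, &iter);
            return ret;
    }

The for_each_btree_key2() family below packages exactly this peek/advance loop, including restart handling, so open-coded loops like this are mostly confined to older call sites.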
+ u32 bch2_trans_begin(struct btree_trans *); /* @@ -584,8 +601,6 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - BUG_ON(flags & BTREE_ITER_ALL_LEVELS); - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } @@ -593,8 +608,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter * static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : - flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -656,17 +670,17 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, #define lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count; \ - int _ret; \ + int _ret2; \ \ do { \ _restart_count = bch2_trans_begin(_trans); \ - _ret = (_do); \ - } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ + _ret2 = (_do); \ + } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ \ - if (!_ret) \ + if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret; \ + _ret2; \ }) /* @@ -681,26 +695,23 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, #define nested_lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count, _orig_restart_count; \ - int _ret; \ + int _ret2; \ \ _restart_count = _orig_restart_count = (_trans)->restart_count; \ \ - while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ + while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ _restart_count = bch2_trans_begin(_trans); \ \ - if (!_ret) \ + if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ - _ret = -BCH_ERR_transaction_restart_nested; \ - \ - _ret; \ + _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) #define for_each_btree_key2(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -708,15 +719,15 @@ while (1) { \ u32 _restart_count = bch2_trans_begin(_trans); \ \ - _ret = 0; \ + _ret3 = 0; \ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ if (!(_k).k) \ break; \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_advance(&(_iter))) \ @@ -724,13 +735,13 @@ } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -738,15 +749,15 @@ while (1) { \ u32 _restart_count = bch2_trans_begin(_trans); \ \ - _ret = 0; \ + _ret3 = 0; \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ if 
(!(_k).k) \ break; \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_advance(&(_iter))) \ @@ -754,13 +765,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -769,14 +780,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, u32 _restart_count = bch2_trans_begin(_trans); \ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ if (!(_k).k) { \ - _ret = 0; \ + _ret3 = 0; \ break; \ } \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_rewind(&(_iter))) \ @@ -784,7 +795,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key_commit(_trans, _iter, _btree_id, \ @@ -900,21 +911,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); -void bch2_trans_exit(struct btree_trans *); + +struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); +void bch2_trans_put(struct btree_trans *); extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *); -#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ -do { \ +#define bch2_trans_get(_c) \ +({ \ static unsigned trans_fn_idx; \ \ if (unlikely(!trans_fn_idx)) \ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ - \ - __bch2_trans_init(_trans, _c, trans_fn_idx); \ -} while (0) + __bch2_trans_get(_c, trans_fn_idx); \ +}) void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index f7c001d..8e80a5b 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -90,10 +90,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - if (ck->c.lock.readers) + if (ck->c.lock.readers) { list_move_tail(&ck->list, &bc->freed_pcpu); - else + bc->nr_freed_pcpu++; + } else { list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -110,6 +113,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, { struct bkey_cached *pos; + bc->nr_freed_nonpcpu++; + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, pos->btree_trans_barrier_seq)) { @@ -159,6 +164,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache 
*bc, #else mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; mutex_unlock(&bc->lock); #endif } else { @@ -218,6 +224,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, f->nr < ARRAY_SIZE(f->objs) / 2) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; f->objs[f->nr++] = ck; } @@ -230,6 +237,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_nonpcpu)) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; } mutex_unlock(&bc->lock); #endif @@ -243,8 +251,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } if (ck) { - int ret; - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); if (unlikely(ret)) { bkey_cached_move_to_freelist(bc, ck); @@ -253,7 +259,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, path->l[0].b = (void *) ck; path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { @@ -327,11 +333,11 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_ids[path->btree_id]); + bch2_btree_id_str(path->btree_id)); return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; @@ -410,7 +416,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[ck->key.btree_id], new_u64s); + bch2_btree_id_str(ck->key.btree_id), new_u64s); ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; goto err; } @@ -479,7 +485,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -497,7 +503,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } path->l[0].lock_seq = six_lock_seq(&ck->c.lock); @@ -511,7 +518,7 @@ fill: * path->uptodate yet: */ if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1)) { + !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; @@ -579,7 +586,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } path->l[0].lock_seq = six_lock_seq(&ck->c.lock); @@ -649,8 +657,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| 
(ck->journal.seq == journal_last_seq(j) ? BCH_WATERMARK_reclaim : 0)| @@ -665,7 +673,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; bch2_journal_pin_drop(j, &ck->journal); - bch2_journal_preres_put(j, &ck->res); BUG_ON(!btree_node_locked(c_iter.path, 0)); @@ -705,13 +712,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -728,13 +733,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = commit_do(&trans, NULL, NULL, 0, - btree_key_cache_flush_pos(&trans, key, seq, - BTREE_INSERT_JOURNAL_RECLAIM, false)); + ret = commit_do(trans, NULL, NULL, 0, + btree_key_cache_flush_pos(trans, key, seq, + BCH_TRANS_COMMIT_journal_reclaim, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -765,18 +770,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - int difference; - - BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - - difference = jset_u64s(insert->k.u64s) - ck->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - ck->res.u64s += difference; - } - } - bkey_copy(ck->k, insert); ck->valid = true; @@ -854,6 +847,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ + scanned += bc->nr_freed_nonpcpu; + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -863,13 +858,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_nonpcpu--; } if (scanned >= nr) goto out; + scanned += bc->nr_freed_pcpu; + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -879,8 +876,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_pcpu--; } if (scanned >= nr) @@ -987,6 +984,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #endif + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); list_splice(&bc->freed_nonpcpu, &items); @@ -996,7 +996,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); - bch2_journal_preres_put(&c->journal, &ck->res); list_del(&ck->list); kfree(ck->k); @@ -1058,14 +1057,14 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->shrink.count_objects = 
bch2_btree_key_cache_count; bc->shrink.scan_objects = bch2_btree_key_cache_scan; bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; - if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) + if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); + prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); prt_newline(out); prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); prt_newline(out); diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 0b0f9d6..59c57c5 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -430,7 +430,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, static inline bool btree_path_get_locks(struct btree_trans *trans, struct btree_path *path, - bool upgrade) + bool upgrade, + struct get_locks_fail *f) { unsigned l = path->level; int fail_idx = -1; @@ -441,8 +442,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; + : bch2_btree_node_relock(trans, path, l))) { + fail_idx = l; + + if (f) { + f->l = l; + f->b = path->l[l].b; + } + } l++; } while (l < path->locks_want); @@ -583,7 +590,9 @@ __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - return btree_path_get_locks(trans, path, false); + struct get_locks_fail f; + + return btree_path_get_locks(trans, path, false, &f); } int __bch2_btree_path_relock(struct btree_trans *trans, @@ -599,22 +608,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans, bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, struct btree_path *path, - unsigned new_locks_want) + unsigned new_locks_want, + struct get_locks_fail *f) { EBUG_ON(path->locks_want >= new_locks_want); path->locks_want = new_locks_want; - return btree_path_get_locks(trans, path, true); + return btree_path_get_locks(trans, path, true, f); } bool __bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, - unsigned new_locks_want) + unsigned new_locks_want, + struct get_locks_fail *f) { struct btree_path *linked; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) return true; /* @@ -643,7 +654,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); + btree_path_get_locks(trans, linked, true, NULL); } return false; @@ -653,7 +664,10 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - unsigned l; + unsigned l, old_locks_want = path->locks_want; + + if (trans->restarted) + return; EBUG_ON(path->locks_want < new_locks_want); @@ -673,6 +687,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } bch2_btree_path_verify_locks(path); + + path->downgrade_seq++; + trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } /* Btree transaction locking: */ @@ -681,6 +698,9 @@ void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; + if 
(trans->restarted) + return; + trans_for_each_path(trans, path) bch2_btree_path_downgrade(trans, path); } @@ -739,6 +759,12 @@ void bch2_trans_unlock(struct btree_trans *trans) bch2_assert_btree_nodes_not_locked(); } +void bch2_trans_unlock_long(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + bch2_trans_srcu_unlock(trans); +} + bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index ce3c7d9..11b0a2c 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -10,9 +10,8 @@ * updating the iterator state */ -#include - #include "btree_iter.h" +#include "six.h" void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); @@ -92,7 +91,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, static inline void mark_btree_node_locked(struct btree_trans *trans, struct btree_path *path, unsigned level, - enum six_lock_type type) + enum btree_node_locked_type type) { mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS @@ -356,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ + +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned); + struct btree_path *, unsigned, + struct get_locks_fail *); + bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); + struct btree_path *, unsigned, + struct get_locks_fail *); static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { + struct get_locks_fail f; unsigned old_locks_want = path->locks_want; new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); if (path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + ? 
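bch2_trans_unlock_long() above pairs the normal transaction unlock with dropping the SRCU read lock, for callers about to block for a long time. A usage sketch under that assumption; wait_for_something() is a hypothetical stand-in for any long blocking operation, and the relock afterwards follows the usual pattern:

extern void wait_for_something(void);

static int slow_operation(struct btree_trans *trans)
{
	/* Drop node locks and the SRCU read lock so a long sleep doesn't
	 * hold up reclaim of freed btree nodes: */
	bch2_trans_unlock_long(trans);

	wait_for_something();

	/* May return a transaction restart error: */
	return bch2_trans_relock(trans);
}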
__bch2_btree_path_upgrade(trans, path, new_locks_want, &f) : path->uptodate == BTREE_ITER_UPTODATE) return 0; trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want); + old_locks_want, new_locks_want, &f); return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); } diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 6b6333d..e58575a 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -4,14 +4,14 @@ #include #include -#include -//#include "bkey_methods.h" +#include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" #include "errcode.h" #include "journal_types.h" #include "replicas_types.h" +#include "six.h" struct open_bucket; struct btree_update; @@ -194,34 +194,33 @@ struct btree_node_iter { /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -static const u16 BTREE_ITER_SLOTS = 1 << 0; -static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; +static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -static const u16 BTREE_ITER_INTENT = 1 << 2; +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -static const u16 BTREE_ITER_PREFETCH = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; -static const u16 BTREE_ITER_CACHED = 1 << 6; -static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; -static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -#define __BTREE_ITER_FLAGS_END 16 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; +#define __BTREE_ITER_FLAGS_END 15 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -238,6 +237,8 @@ struct btree_path { u8 sorted_idx; u8 ref; u8 intent_ref; + u32 alloc_seq; + u32 downgrade_seq; /* btree_iter_copy starts here: */ struct bpos pos; @@ -320,31 +321,6 @@ struct btree_iter { #endif }; -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - -struct btree_key_cache { - struct mutex lock; - struct rhashtable 
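A failed upgrade above now returns BCH_ERR_transaction_restart_upgrade after tracing the failure details from get_locks_fail. Restart errors are meant to unwind to the top of the transaction and retry; a simplified sketch of that retry loop (the real helpers are commit_do()/nested_lockrestart_do()):

static int run_with_restarts(struct btree_trans *trans,
			     int (*fn)(struct btree_trans *))
{
	int ret;

	do {
		bch2_trans_begin(trans);	/* reset paths before retrying */
		ret = fn(trans);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	return ret;
}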
table; - bool table_init_done; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct shrinker shrink; - unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - - atomic_long_t nr_freed; - atomic_long_t nr_keys; - atomic_long_t nr_dirty; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 @@ -360,7 +336,6 @@ struct bkey_cached { struct rhash_head hash; struct list_head list; - struct journal_preres res; struct journal_entry_pin journal; u64 seq; @@ -390,7 +365,6 @@ struct btree_insert_entry { u8 old_btree_u64s; struct bkey_i *k; struct btree_path *path; - u64 seq; /* key being overwritten: */ struct bkey old_k; const struct bch_val *old_v; @@ -430,6 +404,7 @@ struct btree_trans { u8 nr_updates; u8 nr_wb_updates; u8 wb_updates_size; + bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; @@ -438,6 +413,7 @@ struct btree_trans { bool journal_replay_not_finished:1; bool is_initial_gc:1; bool notrace_relock_fail:1; + bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_begin_ip; @@ -459,8 +435,8 @@ struct btree_trans { void *mem; u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path *paths; - struct btree_insert_entry *updates; + struct btree_path paths[BTREE_ITER_MAX]; + struct btree_insert_entry updates[BTREE_ITER_MAX]; struct btree_write_buffered_key *wb_updates; /* update path: */ @@ -469,11 +445,9 @@ struct btree_trans { struct journal_entry_pin *journal_pin; struct journal_res journal_res; - struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; - unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; @@ -643,16 +617,17 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val, ...) BKEY_TYPE_##kwd = val, + BKEY_TYPE_btree, +#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, BCH_BTREE_IDS() #undef x - BKEY_TYPE_btree, + BKEY_TYPE_NR }; /* Type of a key in btree @id at level @level: */ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return level ? BKEY_TYPE_btree : (enum btree_node_type) id; + return level ? 
BKEY_TYPE_btree : (unsigned) id + 1; } /* Type of keys @b contains: */ @@ -661,19 +636,21 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } +const char *bch2_btree_node_type_str(enum btree_node_type); + #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - (BIT(BKEY_TYPE_extents)| \ - BIT(BKEY_TYPE_alloc)| \ - BIT(BKEY_TYPE_inodes)| \ - BIT(BKEY_TYPE_stripes)| \ - BIT(BKEY_TYPE_reflink)| \ - BIT(BKEY_TYPE_btree)) + (BIT_ULL(BKEY_TYPE_extents)| \ + BIT_ULL(BKEY_TYPE_alloc)| \ + BIT_ULL(BKEY_TYPE_inodes)| \ + BIT_ULL(BKEY_TYPE_stripes)| \ + BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ - (BIT(BKEY_TYPE_alloc)| \ - BIT(BKEY_TYPE_inodes)| \ - BIT(BKEY_TYPE_stripes)| \ - BIT(BKEY_TYPE_snapshots)) + (BIT_ULL(BKEY_TYPE_alloc)| \ + BIT_ULL(BKEY_TYPE_inodes)| \ + BIT_ULL(BKEY_TYPE_stripes)| \ + BIT_ULL(BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -681,13 +658,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { - return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); + return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); } static inline bool btree_node_type_is_extents(enum btree_node_type type) { const unsigned mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) BCH_BTREE_IDS() #undef x ; @@ -697,7 +674,7 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type) static inline bool btree_id_is_extents(enum btree_id btree) { - return btree_node_type_is_extents((enum btree_node_type) btree); + return btree_node_type_is_extents(__btree_node_type(0, btree)); } static inline bool btree_type_has_snapshots(enum btree_id id) @@ -711,6 +688,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1U << id) & mask; } +static inline bool btree_type_has_snapshot_field(enum btree_id id) +{ + const unsigned mask = 0 +#define x(name, nr, flags, ...) 
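With BKEY_TYPE_btree now value 0 and leaf key types shifted up by one, the trigger masks switch from BIT() to BIT_ULL() so they stay correct as the number of node types grows past 32. An illustrative compile-time guard for the new layout, not part of the patch:

static inline void check_btree_node_type_layout(void)
{
	/* Interior nodes map to 0; leaf keys for btree id N map to N + 1: */
	BUILD_BUG_ON(BKEY_TYPE_btree != 0);

	/* The masks above are built with BIT_ULL(), so 64 types is the limit: */
	BUILD_BUG_ON(BKEY_TYPE_NR > 64);
}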
|((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; +} + static inline bool btree_type_has_ptrs(enum btree_id id) { const unsigned mask = 0 diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 2281140..14a2315 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -4,7 +4,6 @@ #include "btree_iter.h" #include "journal.h" -#include "journal.h" struct bch_fs; struct btree; @@ -22,50 +21,42 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, struct bkey_i *, u64); -enum btree_insert_flags { +#define BCH_TRANS_COMMIT_FLAGS() \ + x(no_enospc, "don't check for enospc") \ + x(no_check_rw, "don't attempt to take a ref on c->writes") \ + x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ + x(no_journal_res, "don't take a journal reservation, instead " \ + "pin journal entry referred to by trans->journal_res.seq") \ + x(journal_reclaim, "operation required for journal reclaim; may return error " \ + "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + +enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ - __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, - __BTREE_INSERT_NOCHECK_RW, - __BTREE_INSERT_LAZY_RW, - __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RECLAIM, - __BTREE_INSERT_NOWAIT, - __BTREE_INSERT_GC_LOCK_HELD, - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, + __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, +#define x(n, ...) __BCH_TRANS_COMMIT_##n, + BCH_TRANS_COMMIT_FLAGS() +#undef x }; -/* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) - -#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) -#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) - -/* Insert is for journal replay - don't get journal reservations: */ -#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) - -/* Insert is being called from journal reclaim path: */ -#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) - -/* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) -#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) - -#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) +enum bch_trans_commit_flags { +#define x(n, ...) 
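The BCH_TRANS_COMMIT_FLAGS() table above uses the x-macro idiom: one list expands into both the bit-number enum and the flag masks, so names, bit positions, and help strings cannot drift apart. A generic sketch of the idiom with illustrative names:

#define EXAMPLE_FLAGS()			\
	x(first,  "first flag")		\
	x(second, "second flag")

enum __example_flags {
#define x(n, ...)	__EXAMPLE_##n,
	EXAMPLE_FLAGS()
#undef x
};

enum example_flags {
#define x(n, ...)	EXAMPLE_##n = 1U << __EXAMPLE_##n,
	EXAMPLE_FLAGS()
#undef x
};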
BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), + BCH_TRANS_COMMIT_FLAGS() +#undef x +}; int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); +int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); -int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, +int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, u64 *, int flags); + struct disk_reservation *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); @@ -114,8 +105,8 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); -int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -int bch2_journal_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions @@ -145,30 +136,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ -({ \ - struct btree_trans trans; \ - int _ret; \ - \ - bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ - bch2_trans_exit(&trans); \ - \ - _ret; \ -}) - #define bch2_trans_run(_c, _do) \ ({ \ - struct btree_trans trans; \ - int _ret; \ - \ - bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = (_do); \ - bch2_trans_exit(&trans); \ - \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = (_do); \ + bch2_trans_put(trans); \ _ret; \ }) +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ + bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) + #define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ @@ -268,10 +246,10 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags|BTREE_ITER_INTENT, type); - struct bkey_i *ret = unlikely(IS_ERR(k.k)) + struct bkey_i *ret = IS_ERR(k.k) ? 
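bch2_trans_run() and bch2_trans_do() are now thin wrappers over the heap-allocated transaction API, replacing the old on-stack bch2_trans_init()/bch2_trans_exit() pairs. A usage sketch; do_work() is a hypothetical transactional helper:

extern int do_work(struct btree_trans *);

static int example(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = do_work(trans);

	bch2_trans_put(trans);
	return ret;
}

/* Equivalent, using the macro above: bch2_trans_run(c, do_work(trans)) */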
ERR_CAST(k.k) : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); - if (unlikely(IS_ERR(ret))) + if (IS_ERR(ret)) bch2_trans_iter_exit(trans, iter); return ret; } diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index f42ef46..18e5a75 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -5,6 +5,7 @@ #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -17,7 +18,6 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" -#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -143,10 +143,15 @@ static size_t btree_node_u64s_with_format(struct btree *b, } /** - * btree_node_format_fits - check if we could rewrite node with a new format + * bch2_btree_node_format_fits - check if we could rewrite node with a new format * - * This assumes all keys can pack with the new format -- it just checks if - * the re-packed keys would fit inside the node itself. + * @c: filesystem handle + * @b: btree node to rewrite + * @new_f: bkey format to translate keys to + * + * Returns: true if all re-packed keys will be able to fit in a new node. + * + * Assumes all keys will successfully pack with the new format. */ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, struct bkey_format *new_f) @@ -244,7 +249,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct write_point *wp; struct btree *b; BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets ob = { .nr = 0 }; + struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim @@ -257,7 +262,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - ob = a->ob; + obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); goto mem_alloc; @@ -292,7 +297,7 @@ retry: bkey_btree_ptr_v2_init(&tmp.k); bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); - bch2_open_bucket_get(c, wp, &ob); + bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(trans, interior_node); @@ -304,7 +309,7 @@ mem_alloc: BUG_ON(b->ob.nr); bkey_copy(&b->key, &tmp.k); - b->ob = ob; + b->ob = obs; return b; } @@ -470,9 +475,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: - * - * BTREE_INSERT_NOWAIT only applies to btree node allocation, not - * blocking on this lock: */ ret = bch2_btree_cache_cannibalize_lock(c, cl); if (ret) @@ -482,9 +484,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, struct prealloc_nodes *p = as->prealloc_nodes + interior; while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, - flags & BTREE_INSERT_NOWAIT ? 
NULL : cl, - interior, flags); + b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, + interior, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -508,8 +509,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * up_read(&c->gc_lock); as->took_gc_lock = false; - bch2_journal_preres_put(&c->journal, &as->journal_preres); - bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -592,12 +591,11 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct btree *b; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); u64 journal_seq = 0; unsigned i; int ret; - bch2_trans_init(&trans, c, 0, 512); /* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree @@ -618,7 +616,7 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? b->data->keys.seq : 0; six_unlock_read(&b->c.lock); @@ -640,13 +638,13 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. */ - ret = commit_do(&trans, &as->disk_res, &journal_seq, + ret = commit_do(trans, &as->disk_res, &journal_seq, BCH_WATERMARK_reclaim| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM, - btree_update_nodes_written_trans(&trans, as)); - bch2_trans_unlock(&trans); + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_journal_reclaim, + btree_update_nodes_written_trans(trans, as)); + bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s(): error %s", __func__, bch2_err_str(ret)); @@ -655,7 +653,7 @@ err: struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); + path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); /* * @b is the node we did the final insert into: * @@ -678,13 +676,13 @@ err: * we may rarely end up with a locked path besides the one we * have here: */ - bch2_trans_unlock(&trans); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + bch2_trans_unlock(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; - bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + bch2_btree_node_lock_write_nofail(trans, path, &b->c); mutex_lock(&c->btree_interior_update_lock); @@ -697,15 +695,15 @@ err: * btree_interior_update_lock: */ if (as->b == b) { - struct bset *i = btree_bset_last(b); - BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); if (!ret) { - i->journal_seq = cpu_to_le64( + struct bset *last = btree_bset_last(b); + + last->journal_seq = cpu_to_le64( max(journal_seq, - le64_to_cpu(i->journal_seq))); + le64_to_cpu(last->journal_seq))); bch2_btree_add_journal_pin(c, b, journal_seq); } else { @@ -724,14 +722,12 @@ err: six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); - btree_node_unlock(&trans, path, b->c.level); 
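Throughout these hunks, mark_btree_node_locked() now takes the btree_node_locked_type enum (BTREE_NODE_INTENT_LOCKED) rather than a raw six-lock type. The three-call sequence for pointing a fresh path at an already intent-locked node recurs several times; condensed into one helper purely for illustration:

static void adopt_locked_node(struct btree_trans *trans,
			      struct btree_path *path, struct btree *b)
{
	/* We already hold an intent lock on @b: take another reference on the
	 * lock, then record the lock state and level in the new path */
	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, path, b);
}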
- bch2_path_put(&trans, path, true); + btree_node_unlock(trans, path, b->c.level); + bch2_path_put(trans, path, true); } bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_preres_put(&c->journal, &as->journal_preres); - mutex_lock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -745,7 +741,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -753,8 +749,8 @@ err: for (i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - bch2_btree_update_free(as, &trans); - bch2_trans_exit(&trans); + bch2_btree_update_free(as, trans); + bch2_trans_put(trans); } static void btree_interior_update_work(struct work_struct *work) @@ -814,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_unlock(&c->btree_interior_update_lock); } +static int bch2_update_reparent_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + static void btree_update_reparent(struct btree_update *as, struct btree_update *child) { @@ -824,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, + bch2_update_reparent_journal_pin_flush); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -933,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b b->ob.v[--b->ob.nr]; } +static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_updates - redirect @b's @@ -984,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, * when the new nodes are persistent and reachable on disk: */ w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); @@ -1038,12 +1049,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct bch_fs *c = trans->c; struct btree_update *as; u64 start_time = local_clock(); - int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? 
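bch2_journal_pin_copy() calls above now always pass an explicit flush callback; pins that have nothing of their own to flush get a trivial callback instead of NULL. A sketch of that convention, with copy_pin() as an illustrative wrapper:

static int noop_journal_pin_flush(struct journal *j,
				  struct journal_entry_pin *pin, u64 seq)
{
	return 0;	/* the pin only holds the journal sequence; nothing to write */
}

static void copy_pin(struct bch_fs *c, struct journal_entry_pin *dst,
		     struct journal_entry_pin *src)
{
	bch2_journal_pin_copy(&c->journal, dst, src, noop_journal_pin_flush);
}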
BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; @@ -1057,10 +1067,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - journal_flags |= watermark; - while (1) { nr_nodes[!!update_level] += 1 + split; update_level++; @@ -1083,9 +1089,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) { + if (!down_read_trylock(&c->gc_lock)) { ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); if (ret) { up_read(&c->gc_lock); @@ -1099,7 +1103,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; INIT_LIST_HEAD(&as->list); @@ -1125,27 +1129,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret) { - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - goto err; - } - - ret = drop_locks_do(trans, - bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags)); - if (ret == -BCH_ERR_journal_preres_get_blocked) { - trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - } - if (ret) - goto err; - } - ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, @@ -1163,7 +1146,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * flag */ if (bch2_err_matches(ret, ENOSPC) && - (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; @@ -1216,18 +1199,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -/** - * bch_btree_set_root - update the root in memory and on disk - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. However, you must hold an intent lock on the - * old root. - * - * Note: This allocates a journal entry but doesn't add any keys to - * it. All the btree roots are part of every journal write, so there - * is nothing new to be done. This just guarantees that there is a - * journal write. 
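When the gc_lock trylock above fails, drop_locks_do() evaluates the blocking expression with all btree locks dropped and then relocks the transaction. A sketch of the pattern as used in bch2_btree_update_start():

static int take_gc_lock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	if (down_read_trylock(&c->gc_lock))
		return 0;

	/* Roughly: bch2_trans_unlock(trans); expr; bch2_trans_relock(trans),
	 * where the relock may return a transaction restart error */
	return drop_locks_do(trans, (down_read(&c->gc_lock), 0));
}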
- */ static void bch2_btree_set_root(struct btree_update *as, struct btree_trans *trans, struct btree_path *path, @@ -1282,14 +1253,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { printbuf_reset(&buf); prt_printf(&buf, "inserting invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); prt_printf(&buf, "\n "); bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); bch2_fs_inconsistent(c, "%s", buf.buf); dump_stack(); @@ -1341,12 +1312,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, ; while (!bch2_keylist_empty(keys)) { - struct bkey_i *k = bch2_keylist_front(keys); + insert = bch2_keylist_front(keys); - if (bpos_gt(k->k.p, b->key.k.p)) + if (bpos_gt(insert->k.p, b->key.k.p)) break; - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); bch2_keylist_pop_front(keys); } } @@ -1513,12 +1484,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path1, n1); path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path2, n2); /* @@ -1539,7 +1510,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path2->locks_want++; BUG_ON(btree_node_locked(path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path2, n3); n3->sib_u64s[0] = U16_MAX; @@ -1563,7 +1534,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path1, n1); if (parent) @@ -1661,12 +1632,16 @@ bch2_btree_insert_keys_interior(struct btree_update *as, } /** - * bch_btree_insert_node - insert bkeys into a given btree node + * bch2_btree_insert_node - insert bkeys into a given btree node * - * @iter: btree iterator + * @as: btree_update object + * @trans: btree_trans object + * @path: path that points to current node + * @b: node to insert keys into * @keys: list of keys to insert - * @hook: insert callback - * @persistent: if not null, @persistent will wait on journal write + * @flags: transaction commit flags + * + * Returns: 0 on success, typically transaction restart error on failure * * Inserts as many keys as it 
can into a given btree node, splitting it if full. * If a split occurred, this function will return early. This can only happen @@ -1859,7 +1834,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(path, b); as = bch2_btree_update_start(trans, path, level, false, - BTREE_INSERT_NOFAIL|flags); + BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -1890,7 +1865,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); bkey_init(&delete.k); @@ -1934,9 +1909,6 @@ err_free_update: goto out; } -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - */ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, @@ -1948,7 +1920,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_update *as; int ret; - flags |= BTREE_INSERT_NOFAIL; + flags |= BCH_TRANS_COMMIT_no_enospc; parent = btree_node_parent(iter->path, b); as = bch2_btree_update_start(trans, iter->path, b->c.level, @@ -1967,7 +1939,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); trace_and_count(c, btree_node_rewrite, c, b); @@ -1994,7 +1966,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, out: if (new_path) bch2_path_put(trans, new_path, true); - bch2_btree_path_downgrade(trans, iter->path); + bch2_trans_downgrade(trans); return ret; err: bch2_btree_node_free_never_used(as, trans, n); @@ -2055,9 +2027,9 @@ static void async_btree_node_rewrite_work(struct work_struct *work) int ret; ret = bch2_trans_do(c, NULL, NULL, 0, - async_btree_node_rewrite_trans(&trans, a)); + async_btree_node_rewrite_trans(trans, a)); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2096,8 +2068,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) ret = bch2_fs_read_write_early(c); if (ret) { - bch_err(c, "%s: error going read-write: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "going read-write"); kfree(a); return; } @@ -2372,7 +2343,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) { - bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2385,7 +2356,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) as, as->mode, as->nodes_written, - atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + closure_nr_remaining(&as->cl), as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); } @@ -2419,30 +2390,24 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry r->level = entry->level; r->alive = true; - bkey_copy(&r->key, 
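The error paths above move to the bch_err_fn()/bch_err_msg() helpers, which prepend the calling function's name and format the error code via bch2_err_str(). A usage sketch; do_work() is hypothetical and the exact log format is an assumption:

extern int do_work(struct bch_fs *);

static int example_error_logging(struct bch_fs *c)
{
	int ret = bch2_fs_read_write_early(c);
	if (ret) {
		/* logs roughly "<caller>: going read-write: <error string>" */
		bch_err_msg(c, ret, "going read-write");
		return ret;
	}

	ret = do_work(c);
	if (ret)
		bch_err_fn(c, ret);	/* function name plus bch2_err_str(ret) */
	return ret;
}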
&entry->start[0]); + bkey_copy(&r->key, (struct bkey_i *) entry->start); mutex_unlock(&c->btree_root_lock); } struct jset_entry * bch2_btree_roots_to_journal_entries(struct bch_fs *c, - struct jset_entry *start, - struct jset_entry *end) + struct jset_entry *end, + unsigned long skip) { - struct jset_entry *entry; - unsigned long have = 0; unsigned i; - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root) - __set_bit(entry->btree_id, &have); - mutex_lock(&c->btree_root_lock); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (r->alive && !test_bit(i, &have)) { + if (r->alive && !test_bit(i, &skip)) { journal_entry_set(end, BCH_JSET_ENTRY_btree_root, i, r->level, &r->key, r->key.k.u64s); end = vstruct_next(end); diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 5e0a467..031076e 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -55,7 +55,6 @@ struct btree_update { unsigned update_level; struct disk_reservation disk_res; - struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: @@ -271,7 +270,7 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); + __bch_btree_u64s_remaining(c, b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -303,7 +302,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(c, b), &k); } /* @@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *); void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, - struct jset_entry *, struct jset_entry *); + struct jset_entry *, unsigned long); void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c deleted file mode 100644 index 369e37a..0000000 --- a/libbcachefs/btree_update_leaf.c +++ /dev/null @@ -1,2107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "extent_update.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "recovery.h" -#include "subvolume.h" -#include "replicas.h" -#include "trace.h" - -#include -#include - -/* - * bch2_btree_path_peek_slot() for a cached iterator might return a key in a - * different snapshot: - */ -static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -{ - struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); - - if (k.k && bpos_eq(path->pos, k.k->p)) - return k; - - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -static void verify_update_old_key(struct btree_trans *trans, struct 
btree_insert_entry *i) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); - - if (j_k) - k = bkey_i_to_s_c(j_k); - } - - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); -#endif -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags, - unsigned long ip); - -static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) -{ - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -} - -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) -{ - return i->path->l + i->level; -} - -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; -} - -static inline bool same_leaf_as_next(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; -} - -inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - if (unlikely(btree_node_just_written(b)) && - bch2_btree_post_write_cleanup(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(trans, b); -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bkey_packed *k; - unsigned clobber_u64s = 0, new_u64s = 0; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) - k = NULL; - - /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_deleted(k)); - - /* Deleting, but not found? 
nothing to do: */ - if (bkey_deleted(&insert->k) && !k) - return false; - - if (bkey_deleted(&insert->k)) { - /* Deleting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - goto fix_iter; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - - return true; - } - - if (k) { - /* Overwriting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - goto overwrite; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - } - - k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - new_u64s = k->u64s; -fix_iter: - if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, path, b, node_iter, k, - clobber_u64s, new_u64s); - return true; -} - -static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans trans; - unsigned long old, new, v; - unsigned idx = w - b->writes; - - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); - - do { - old = new = v; - - if (!(old & (1 << BTREE_NODE_dirty)) || - !!(old & (1 << BTREE_NODE_write_idx)) != idx || - w->journal.seq != seq) - break; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_journal_reclaim; - new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); - - btree_node_write_if_need(c, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_exit(&trans); - return 0; -} - -int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -inline void bch2_btree_add_journal_pin(struct bch_fs *c, - struct btree *b, u64 seq) -{ - struct btree_write *w = btree_current_write(b); - - bch2_journal_pin_add(&c->journal, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? 
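__btree_node_flush() in the file being deleted updates the node flags with a cmpxchg() retry loop. A generic, self-contained sketch of that read-modify-write pattern (the helper name is illustrative):

static void update_flags_atomically(unsigned long *flags,
				    unsigned long clear, unsigned long set)
{
	unsigned long old, new, v = READ_ONCE(*flags);

	do {
		old = new = v;
		new &= ~clear;
		new |= set;

		if (new == old)
			break;		/* no change needed, skip the write */
	} while ((v = cmpxchg(flags, old, new)) != old);
}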
bch2_btree_node_flush0 - : bch2_btree_node_flush1); -} - -/** - * btree_insert_key - insert a key one key into a leaf node - */ -inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert, - u64 journal_seq) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(path)->b; - struct bset_tree *t = bset_tree_last(b); - struct bset *i = bset(b, t); - int old_u64s = bset_u64s(t); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, - &path_l(path)->iter, insert))) - return; - - i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - - if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - set_btree_node_dirty_acct(c, b); - } - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) bset_u64s(t) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); -} - -/* Cached btree updates: */ - -/* Normal update interface: */ - -static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); - EBUG_ON(!i->level && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && - i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); -} - -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - -static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) -{ - return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); -} - -#define JSET_ENTRY_LOG_U64s 4 - -static noinline void journal_transaction_name(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct jset_entry *entry = - bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_log, 0, 0, - JSET_ENTRY_LOG_U64s); - struct jset_entry_log *l = - container_of(entry, struct jset_entry_log, entry); - - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -} - -static inline int btree_key_can_insert(struct btree_trans *trans, - struct btree *b, unsigned u64s) -{ - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; - - return 0; -} - -static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned u64s) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; - struct btree_insert_entry *i; - unsigned new_u64s; - struct bkey_i *new_k; - - EBUG_ON(path->level); - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - 
bch2_btree_key_cache_must_wait(c) && - !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) - return -BCH_ERR_btree_insert_need_journal_reclaim; - - /* - * bch2_varint_decode can read past the end of the buffer by at most 7 - * bytes (it won't be used): - */ - u64s += 1; - - if (u64s <= ck->u64s) - return 0; - - new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[path->btree_id], new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -/* Triggers: */ - -static int run_one_mem_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - unsigned flags) -{ - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) - return 0; - - if (old_ops->atomic_trigger == new_ops->atomic_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, i->btree_id, i->level, - old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - - _deleted.p = i->path->pos; - - ret = bch2_mark_key(trans, i->btree_id, i->level, - deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, i->btree_id, i->level, - old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) -{ - /* - * Transactional triggers create new btree_insert_entries, so we can't - * pass them a pointer to a btree_insert_entry, that memory is going to - * move: - */ - struct bkey old_k = i->old_k; - struct bkey_s_c old = { &old_k, i->old_v }; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - - if (!i->insert_trigger_run && - !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE| - i->flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { - i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; - } else { - return 0; - } -} - -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) -{ - struct btree_insert_entry *i; - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 1; 
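btree_key_can_insert_cached() in the deleted file grows the cached key buffer in powers of two, keeping one u64 of slack because bch2_varint_decode() may read up to 7 bytes past the end of the buffer. A condensed sketch of that growth step, with error handling simplified:

static int grow_cached_key(struct bkey_cached *ck, unsigned u64s)
{
	struct bkey_i *new_k;
	unsigned new_u64s;

	u64s += 1;			/* slack for the varint decode overread */
	if (u64s <= ck->u64s)
		return 0;

	new_u64s = roundup_pow_of_two(u64s);
	new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
	if (!new_k)
		return -ENOMEM;

	ck->u64s = new_u64s;
	ck->k = new_k;
	return 0;
}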
overwrite >= 0; --overwrite) { - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->btree_id != btree_id) - continue; - - ret = run_one_trans_trigger(trans, i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - } - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - unsigned btree_id = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); - if (ret) - return ret; - } - - trans_for_each_update(trans, i) { - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); - if (ret) - return ret; - break; - } - } - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); -#endif - return 0; -} - -static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); - if (ret) - break; - } - } - - return ret; -} - -static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - struct btree_trans_commit_hook *h; - unsigned u64s = 0; - bool marking = false; - int ret; - - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); - } - - /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: - */ - - prefetch(&trans->c->journal.flags); - - trans_for_each_update(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = !i->cached - ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); - if (ret) { - *stopped_at = i; - return ret; - } - - if (btree_node_type_needs_gc(i->bkey_type)) - marking = true; - } - - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; - - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - ret = bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_NONBLOCK); - if (ret) - return ret; - - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - } else { - trans->journal_res.seq = c->journal.replay_journal_seq; - } - - /* - * Not allowed to fail after we've gotten our journal reservation - we - * have to use it: - */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (bch2_journal_seq_verify) - trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) - trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; - } - - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - if (trans->nr_wb_updates) { - EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } - - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - goto revert_fs_usage; - h = h->next; - } - - trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); - if (ret) - goto fatal_err; - } - - if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; - } - - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - struct journal *j = &c->journal; - struct jset_entry *entry; - - trans_for_each_update(trans, i) { - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_NOJOURNAL) - continue; - - verify_update_old_key(trans, i); - - if (trans->journal_transaction_names) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble(&entry->start[0], - (struct bkey_s_c) { &i->old_k, i->old_v }); - } - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy(&entry->start[0], i->k); - } - - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy(&entry->start[0], &wb->k); - } - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } - - trans_for_each_update(trans, i) { - i->k->k.needs_whiteout = false; - - if (!i->cached) { - u64 seq = trans->journal_res.seq; - - if (i->flags & BTREE_UPDATE_PREJOURNAL) - seq = i->seq; - - bch2_btree_insert_key_leaf(trans, 
i->path, i->k, seq); - } else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i); - else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); - } - } - - return 0; -fatal_err: - bch2_fatal_error(c); -revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); - return ret; -} - -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - -static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - - trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); -} - -#ifdef CONFIG_BCACHEFS_DEBUG -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(err); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - printbuf_exit(err); - - return -EINVAL; -} -#endif - -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0, u64s_delta = 0; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } -#endif - - trans_for_each_update(trans, i) { - if (i->cached) - continue; - - u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; - u64s_delta -= i->old_btree_u64s; - - if (!same_leaf_as_next(trans, i)) { - if (u64s_delta <= 0) { - ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, flags); - if (unlikely(ret)) - return ret; - } - - u64s_delta = 0; - } - } - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - - ret = trans_lock_write(trans); - if (unlikely(ret)) - return ret; - - ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); - - if (!ret && unlikely(trans->journal_replay_not_finished)) - bch2_drop_overwrites_from_journal(trans); - - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); - - if (!ret && trans->journal_pin) - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, NULL); - - /* - * Drop journal reservation after dropping write locks, since dropping - * the journal reservation may kick off a journal write: - */ - bch2_journal_res_put(&c->journal, &trans->journal_res); - - if (unlikely(ret)) - return ret; - - bch2_trans_downgrade(trans); - - return 0; -} - -static int journal_reclaim_wait_done(struct bch_fs *c) -{ - int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); - - if (!ret) - journal_reclaim_kick(&c->journal); - return ret; -} - -static noinline -int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - int ret, unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); - break; - case -BCH_ERR_journal_res_get_blocked: - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; - } - - ret = drop_locks_do(trans, - bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); - break; - case -BCH_ERR_btree_insert_need_journal_reclaim: - bch2_trans_unlock(trans); - - trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); - - wait_event_freezable(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); - if (ret < 0) - break; - - ret = bch2_trans_relock(trans); - break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - 
ret = bch2_trans_relock(trans); - } - } - break; - } - default: - BUG_ON(ret >= 0); - break; - } - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(flags & BTREE_INSERT_NOWAIT) && - (flags & BTREE_INSERT_NOFAIL), c, - "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); - - return ret; -} - -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || - test_bit(BCH_FS_STARTED, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - -/* - * This is for updates done in the early part of fsck - btree_gc - before we've - * gone RW. we only add the new key to the list of keys for journal replay to - * do. - */ -static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - - trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) - break; - } - - return ret; -} - -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; - unsigned u64s; - int ret = 0; - - if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) - goto out_reset; - - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out_reset; - - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - - if (!(flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; - } - - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); - - trans_for_each_update(trans, i) { - EBUG_ON(!i->path->should_be_locked); - - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); - if (unlikely(ret)) - goto out; - - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); - - if (i->key_cache_already_flushed) - continue; - - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - - if (i->flags & BTREE_UPDATE_NOJOURNAL) - continue; - - trans->journal_u64s += 
u64s; - - /* and we're also going to log the overwrite: */ - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(i->old_k.u64s); - } - - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { - ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, - (flags & BTREE_INSERT_NOFAIL) - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto err; - } -retry: - bch2_trans_verify_not_in_restart(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - - ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); - - /* make sure we didn't drop or screw up locks: */ - bch2_trans_verify_locks(trans); - - if (ret) - goto err; - - trace_and_count(c, transaction_commit, trans, _RET_IP_); -out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); -out_reset: - bch2_trans_reset_updates(trans); - - return ret; -err: - ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); - if (ret) - goto out; - - goto retry; -} - -static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) - return 0; - - return __check_pos_snapshot_overwritten(trans, id, pos); -} - -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bkey_i **insert, - enum btree_update_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - update = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) - return 0; - - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); - if (ret < 0) - return ret; - if (ret) - return 0; - - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; - - *insert = update; - return 0; -} - -static noinline int extent_back_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret; - - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); - if (ret < 0) - return ret; - if (ret) - return 0; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); - return 0; -} - -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; 
- u32 snapshot = pos.snapshot;
- int ret;
-
- if (!bch2_snapshot_parent(trans->c, pos.snapshot))
- return 0;
-
- pos.snapshot++;
-
- for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOPRESERVE, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (bch2_snapshot_is_ancestor(trans->c, snapshot,
- k.k->p.snapshot)) {
- ret = !bkey_whiteout(k.k);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = { NULL };
- struct bkey_s_c old_k, new_k;
- snapshot_id_list s;
- struct bkey_i *update;
- int ret;
-
- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
- return 0;
-
- darray_init(&s);
-
- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
- !(ret = bkey_err(old_k)) &&
- bkey_eq(old_pos, old_k.k->p)) {
- struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
- continue;
-
- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- ret = bkey_err(new_k);
- if (ret)
- break;
-
- if (new_k.k->type == KEY_TYPE_deleted) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- break;
-
- bkey_init(&update->k);
- update->k.p = whiteout_pos;
- update->k.type = KEY_TYPE_whiteout;
-
- ret = bch2_trans_update(trans, &new_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- }
- bch2_trans_iter_exit(trans, &new_iter);
-
- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &new_iter);
- bch2_trans_iter_exit(trans, &old_iter);
- darray_exit(&s);
-
- return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_update_flags flags,
- struct bkey_s_c old,
- struct bkey_s_c new)
-{
- enum btree_id btree_id = iter->btree_id;
- struct bkey_i *update;
- struct bpos new_start = bkey_start_pos(new.k);
- bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
- bool back_split = bkey_gt(old.k->p, new.k->p);
- int ret = 0, compressed_sectors;
-
- /*
- * If we're going to be splitting a compressed extent, note it
- * so that __bch2_trans_commit() can increase our disk
- * reservation:
- */
- if (((front_split && back_split) ||
- ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
- (compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_journal_res += compressed_sectors;
-
- if (front_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_back(new_start, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
- if (ret)
- return ret;
- }
-
- /* If we're overwriting in a different snapshot - middle split: */
- if (old.k->p.snapshot != new.k->p.snapshot &&
- (front_split || back_split)) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
- - bch2_cut_front(new_start, update); - bch2_cut_back(new.k->p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - if (bkey_le(old.k->p, new.k->p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = old.k->p; - update->k.p.snapshot = new.k->p.snapshot; - - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 0) - return ret; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new.k->p, update); - - ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags, _RET_IP_); - if (ret) - return ret; - } - - return 0; -} - -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_update_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - - if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, &iter, k, &insert, flags); - if (ret) - goto err; - } - - goto next; - } - - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); - - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - goto err; - - if (done) - goto out; -next: - bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - } - - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = extent_back_merge(trans, &iter, insert, k); - if (ret) - goto err; - } -out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_path *path, - struct btree_insert_entry *i, - enum btree_update_flags flags, - unsigned long ip) -{ - struct btree_path *btree_path; - struct bkey k; - int ret; - - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); - if (ret) - goto out; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. 
- */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; - - btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); -out: - bch2_path_put(trans, btree_path, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i, n; - u64 seq = 0; - int cmp; - - EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - EBUG_ON(!bpos_eq(k->k.p, path->pos)); - - /* - * The transaction journal res hasn't been allocated at this point. - * That occurs at commit time. Reuse the seq field to pass in the seq - * of a prejournaled key. - */ - if (flags & BTREE_UPDATE_PREJOURNAL) - seq = trans->journal_res.seq; - - n = (struct btree_insert_entry) { - .flags = flags, - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, - .cached = path->cached, - .path = path, - .k = k, - .seq = seq, - .ip_allocated = ip, - }; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif - - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - trans_for_each_update(trans, i) { - cmp = btree_insert_entry_cmp(&n, i); - if (cmp <= 0) - break; - } - - if (!cmp && i < trans->updates + trans->nr_updates) { - EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - - bch2_path_put(trans, i->path, true); - i->flags = n.flags; - i->cached = n.cached; - i->k = n.k; - i->path = n.path; - i->seq = n.seq; - i->ip_allocated = n.ip_allocated; - } else { - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - - i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; - i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } - - __btree_path_get(i->path, true); - - /* - * If a key is present in the key cache, it must also exist in the - * btree - this is necessary for cache coherency. 
When iterating over - * a btree that's cached in the key cache, the btree iter code checks - * the key cache - but the key has to exist in the btree for that to - * work: - */ - if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); - - return 0; -} - -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) -{ - struct btree_path *path = iter->update_path ?: iter->path; - struct bkey_cached *ck; - int ret; - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return bch2_trans_update_extent(trans, iter, k, flags); - - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - k->k.type = KEY_TYPE_whiteout; - } - - /* - * Ensure that updates to cached btrees go to the key cache: - */ - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - !path->cached && - !path->level && - btree_id_cached(trans->c, path->btree_id)) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, k->k.p)) { - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); - if (unlikely(ret)) - return ret; - - ck = (void *) iter->key_cache_path->l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(iter->key_cache_path); - } - - path = iter->key_cache_path; - } - - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); -} - -/* - * Add a transaction update for a key that has already been journaled. 
- */ -int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, - struct btree_iter *iter, struct bkey_i *k, - enum btree_update_flags flags) -{ - trans->journal_res.seq = seq; - return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| - BTREE_UPDATE_PREJOURNAL); -} - -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } - } - - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; - - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } - - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; - } - - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; - - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; - - return 0; -} - -int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) -{ - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); - if (ret) - goto err; - - bch2_btree_iter_advance(iter); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - BUG_ON(k.k->type != KEY_TYPE_deleted); - - if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; - goto err; - } - - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -void bch2_trans_commit_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - h->next = trans->hooks; - trans->hooks = h; -} - -int bch2_btree_insert_nonextent(struct btree_trans *trans, - enum btree_id btree, struct bkey_i *k, - enum btree_update_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/** - * bch2_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct 
bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) -{ - return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, k, 0)); -} - -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = iter->pos; - bch2_key_resize(&k->k, len); - return bch2_trans_update(trans, iter, k, update_flags); -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -} - -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - u32 restart_count = trans->restart_count; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto err; - - bkey_init(&delete.k); - - /* - * This could probably be more efficient for extents: - */ - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). 
- */ - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_IS_EXTENTS) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: - bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(trans->c, &disk_res); -err: - /* - * the bch2_trans_begin() call is in a weird place because we - * need to call it after every transaction commit, to avoid path - * overflow, but don't want to call it if the delete operation - * is a no-op and we have no work to do: - */ - bch2_trans_begin(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; -} - -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(&trans, id, start, end, - update_flags, journal_seq)); - if (ret == -BCH_ERR_transaction_restart_nested) - ret = 0; - return ret; -} - -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct bkey_i *k; - int ret = 0; - - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; - - return bch2_trans_update_buffered(trans, btree, k); -} - -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) -{ - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); - if (ret) - goto err; - - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; -} - -static int -__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - va_list args) -{ - int ret; - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); - } else { - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); - } - - return ret; -} - -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, 0, fmt, args); - va_end(args); - return ret; -} - -/* - * Use for logging messages during recovery to enable reserved space and avoid - * blocking. - */ -int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
- va_end(args);
- return ret;
-}
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 5f96db5..a6bf6ed 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -11,6 +11,9 @@
 #include <linux/sort.h>
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
 static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
 {
 const struct btree_write_buffered_key *l = _l;
@@ -45,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 if (ret)
 return ret;
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (iter->path->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
 path = iter->path;
 if (!*write_locked) {
@@ -64,23 +74,18 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
 (*fast)++;
-
- if (path->ref > 1) {
- /*
- * We can't clone a path that has write locks: if the path is
- * shared, unlock before set_pos(), traverse():
- */
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- *write_locked = false;
- }
 return 0;
 trans_commit:
- return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?:
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
 bch2_trans_commit(trans, NULL, NULL, commit_flags|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
 }
 static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@@ -123,8 +128,11 @@ btree_write_buffered_insert(struct btree_trans *trans,
 bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
 BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ trans->journal_res.seq = wb->journal_seq;
+
 ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0);
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 bch2_trans_iter_exit(trans, &iter);
 return ret;
 }
@@ -148,7 +156,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 if (!locked && !mutex_trylock(&wb->flush_lock))
 return 0;
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
 bch2_journal_pin_drop(j, &wb->journal_pin);
 s = btree_write_buffer_switch(wb);
@@ -166,7 +175,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 * However, since we're not flushing in the order they appear in the
 * journal we won't be able to drop our journal pin until everything is
 * flushed - which means this could deadlock the journal if we weren't
- * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
 * if it would block taking a journal reservation.
 *
 * If that happens, simply skip the key so we can optimistically insert
@@ -193,7 +202,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 if (!iter.path || iter.path->btree_id != i->btree) {
 bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
 }
 bch2_btree_iter_set_pos(&iter, i->k.k.p);
@@ -249,21 +259,14 @@ slowpath:
 if (!i->journal_seq)
 continue;
- if (i->journal_seq > pin.seq) {
- struct journal_entry_pin pin2;
-
- memset(&pin2, 0, sizeof(pin2));
-
- bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin);
- bch2_journal_pin_copy(j, &pin, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin2);
- }
+ bch2_journal_pin_update(j, i->journal_seq, &pin,
+ bch2_btree_write_buffer_journal_flush);
 ret = commit_do(trans, NULL, NULL, commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
 btree_write_buffered_insert(trans, i));
 if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
 break;
@@ -293,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
 mutex_lock(&wb->flush_lock);
 return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+ __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
 }
 static inline u64 btree_write_buffer_ref(int idx)
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 7bb7f0c..58d8c6f 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -367,12 +367,11 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
 struct printbuf buf = PRINTBUF;
 percpu_down_read(&c->mark_lock);
- buf.atomic++;
 idx = bch2_replicas_entry_idx(c, r);
 if (idx < 0 &&
- fsck_err(c, "no replicas entry\n"
- " while marking %s",
+ fsck_err(c, ptr_to_missing_replicas_entry,
+ "no replicas entry\n while marking %s",
 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 percpu_up_read(&c->mark_lock);
 ret = bch2_mark_replicas(c, r);
@@ -474,8 +473,9 @@ static inline int update_replicas_list(struct btree_trans *trans,
 d = trans->fs_usage_deltas;
 n = (void *) d->d + d->used;
 n->delta = sectors;
- memcpy((void *) n + offsetof(struct replicas_delta, r),
- r, replicas_entry_bytes(r));
+ unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
+ r, replicas_entry_bytes(r),
+ "flexible array member embedded in struct with padding");
 bch2_replicas_entry_sort(&n->r);
 d->used += b;
 return 0;
@@ -680,7 +680,7 @@ static int check_bucket_ref(struct btree_trans *trans,
 struct bch_fs *c = trans->c;
 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- u16 bucket_sectors = !ptr->cached
+ u32 bucket_sectors = !ptr->cached
 ?
dirty_sectors : cached_sectors; struct printbuf buf = PRINTBUF; @@ -695,6 +695,7 @@ static int check_bucket_ref(struct btree_trans *trans, if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -707,6 +708,7 @@ static int check_bucket_ref(struct btree_trans *trans, if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -720,6 +722,7 @@ static int check_bucket_ref(struct btree_trans *trans, if (b_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -741,6 +744,7 @@ static int check_bucket_ref(struct btree_trans *trans, ptr_data_type && bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -752,9 +756,10 @@ static int check_bucket_ref(struct btree_trans *trans, goto err; } - if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + if ((u64) bucket_sectors + sectors > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + BCH_FSCK_ERR_bucket_sector_count_overflow, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], @@ -795,7 +800,6 @@ static int mark_stripe_bucket(struct btree_trans *trans, /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); - buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); if (g->dirty_sectors || @@ -936,14 +940,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, return 0; } -int bch2_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +static int __mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1019,6 +1021,14 @@ int bch2_mark_extent(struct btree_trans *trans, return 0; } +int bch2_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); +} + int bch2_mark_stripe(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -1125,13 +1135,11 @@ int bch2_mark_stripe(struct btree_trans *trans, return 0; } -int bch2_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +static int __mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bch_fs_usage *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; @@ -1158,6 +1166,14 @@ int bch2_mark_reservation(struct btree_trans *trans, return 0; } +int bch2_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); +} + static s64 __bch2_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 start, u64 end, @@ -1184,7 +1200,8 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans, *idx = r->offset; return 0; not_found: - if (fsck_err(c, "pointer to missing indirect extent\n" + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" " %s\n" " missing range %llu-%llu", (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), @@ -1201,7 +1218,7 @@ not_found: new->k.p = bkey_start_pos(p.k); new->k.p.offset += *idx - start; bch2_key_resize(&new->k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_NORUN); } @@ -1212,13 +1229,11 @@ fsck_err: return ret; } -int bch2_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +static int __mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; @@ -1252,6 +1267,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } +int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); +} + void bch2_trans_fs_usage_revert(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1299,8 +1322,8 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, struct bch_fs *c = trans->c; static int warned_disk_usage = 0; bool warn = false; - unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; - struct replicas_delta *d = deltas->d, *d2; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + struct replicas_delta *d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; s64 added = 0, should_not_have_added; @@ -1358,7 +1381,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) bch2_trans_inconsistent(trans, - "disk usage increased %lli more than %u sectors reserved)", + "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); return 0; need_mark: @@ -1453,15 +1476,11 @@ err: return ret; } -int bch2_trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +static int __trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? old - : bkey_i_to_s_c(new); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1518,6 +1537,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans, return ret; } +int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - + (int) bch2_bkey_needs_rebalance(c, old); + + if (mod) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); + if (ret) + return ret; + } + + return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); +} + static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, struct bkey_s_c_stripe s, unsigned idx, bool deleting) @@ -1671,15 +1708,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -int bch2_trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +static int __trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? 
old - : bkey_i_to_s_c(new); unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; @@ -1701,7 +1733,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans, return 0; } -static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, +int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); +} + +static int trans_mark_reflink_p_segment(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 *idx, unsigned flags) { @@ -1768,35 +1809,38 @@ err: return ret; } -int bch2_trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +static int __trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? old - : bkey_i_to_s_c(new); struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx, end_idx; int ret = 0; - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - - v->front_pad = v->back_pad = 0; - } - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); end_idx = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); while (idx < end_idx && !ret) - ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); - + ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); return ret; } +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); +} + static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, @@ -1819,6 +1863,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, @@ -1826,16 +1871,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_types[type], bch2_data_types[type]); ret = -EIO; - goto out; + goto err; } - a->v.data_type = type; - a->v.dirty_sectors = sectors; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto out; -out: + if (a->v.data_type != type || + a->v.dirty_sectors != sectors) { + a->v.data_type = type; + a->v.dirty_sectors = sectors; + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } +err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1923,12 +1968,29 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); + if (ret) bch_err_fn(c, ret); return ret; } +int bch2_trans_mark_dev_sbs(struct bch_fs *c) 
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ int ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
 /* Disk reservations: */
 #define SECTORS_CACHE 1024
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index a418f66..21f6cb3 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -10,21 +10,75 @@
 #include "buckets_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+ return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+ return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+ u32 remainder;
+
+ div_u64_rem(s, ca->mi.bucket_size, &remainder);
+ return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
 #define for_each_bucket(_b, _buckets) \
 for (_b = (_buckets)->b + (_buckets)->first_bucket; \
 _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + static inline void bucket_unlock(struct bucket *b) { - smp_store_release(&b->lock, 0); + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); } static inline void bucket_lock(struct bucket *b) { - while (xchg(&b->lock, 1)) - cpu_relax(); + wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); } static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) @@ -156,7 +210,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma switch (watermark) { case BCH_WATERMARK_NR: - unreachable(); + BUG(); case BCH_WATERMARK_stripe: reserved += ca->mi.nbuckets >> 6; fallthrough; @@ -285,12 +339,48 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ +({ \ + int ret = 0; \ + \ + if (_old.k->type) \ + ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + if (!ret && _new.k->type) \ + ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ + ret; \ +}) + +#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ + mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) + void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, size_t, enum bch_data_type, unsigned); int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_dev_sbs(struct bch_fs *); + +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} /* disk reservations: */ diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c index 81ab685..ec1b636 100644 --- a/libbcachefs/buckets_waiting_for_journal.c +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -133,7 +133,7 @@ retry_rehash: b->t = n; kvfree(t); - pr_debug("took %zu rehashes, table at %zu/%zu elements", + pr_debug("took %zu rehashes, table at %zu/%lu elements", nr_rehashes, nr_elements, 1UL << b->t->bits); out: mutex_unlock(&b->lock); diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index fb603df..4bb88ae 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -86,10 +86,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) devs[i] = 
strndup_user((const char __user *)(unsigned long) user_devs[i], PATH_MAX); - if (!devs[i]) { - ret = -ENOMEM; + ret= PTR_ERR_OR_ZERO(devs[i]); + if (ret) goto err; - } } c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); @@ -117,8 +116,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; err = bch2_fs_open_incremental(path); kfree(path); @@ -149,9 +149,10 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) static long bch2_ioctl_query_uuid(struct bch_fs *c, struct bch_ioctl_query_uuid __user *user_arg) { - return copy_to_user(&user_arg->uuid, - &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); + if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid))) + return -EFAULT; + return 0; } #if 0 @@ -188,8 +189,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; ret = bch2_dev_add(c, path); kfree(path); @@ -230,8 +232,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; ret = bch2_dev_online(c, path); kfree(path); @@ -329,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.btree_id, - .p.pos = ctx->stats.pos, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), .p.sectors_total = bch2_fs_usage_read_short(c).used, }; @@ -338,7 +341,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (len < sizeof(e)) return -EINVAL; - return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); + if (copy_to_user(buf, &e, sizeof(e))) + return -EFAULT; + + return sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@ -417,7 +423,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); + arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); if (!arg) return -ENOMEM; @@ -466,9 +472,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, percpu_up_read(&c->mark_lock); kfree(src); - if (!ret) - ret = copy_to_user(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes); + if (ret) + goto err; + if (copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes)) + ret = -EFAULT; err: kfree(arg); return ret; @@ -513,7 +521,10 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, percpu_ref_put(&ca->ref); - return copy_to_user(user_arg, &arg, sizeof(arg)); + if (copy_to_user(user_arg, &arg, sizeof(arg))) + return -EFAULT; + + return 0; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -550,8 +561,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, goto err; } - ret = copy_to_user((void __user *)(unsigned long)arg.sb, - sb, vstruct_bytes(sb)); + if (copy_to_user((void __user *)(unsigned long)arg.sb, 
sb, + vstruct_bytes(sb))) + ret = -EFAULT; err: if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); @@ -617,6 +629,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, arg.pad) return -EINVAL; + if (arg.nbuckets > U32_MAX) + return -EINVAL; + ca = bch2_device_lookup(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); diff --git a/libbcachefs/chardev.h b/libbcachefs/chardev.h index 3a4890d..0f563ca 100644 --- a/libbcachefs/chardev.h +++ b/libbcachefs/chardev.h @@ -17,7 +17,7 @@ int __init bch2_chardev_init(void); static inline long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user * arg) { - return -ENOSYS; + return -ENOTTY; } static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index a08997a..3c761ad 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -139,7 +139,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, for (i = 0; i < pages; i++) { unsigned offset = offset_in_page(buf); - unsigned pg_len = min(len, PAGE_SIZE - offset); + unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); buf += pg_len; @@ -159,15 +159,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; - if (!chacha20) { - pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); - return PTR_ERR(chacha20); + ret = PTR_ERR_OR_ZERO(chacha20); + if (ret) { + pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); + return ret; } ret = crypto_skcipher_setkey(&chacha20->base, (void *) key, sizeof(*key)); if (ret) { - pr_err("crypto_skcipher_setkey() error: %i", ret); + pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); goto err; } @@ -265,9 +266,10 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + bch2_checksum_update(&state, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -287,10 +289,10 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; crypto_shash_update(desc, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -360,18 +362,18 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, state.type = type; bch2_checksum_init(&state); - state.seed = a.lo; + state.seed = le64_to_cpu(a.lo); BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { - unsigned b = min_t(unsigned, b_len, PAGE_SIZE); + unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); bch2_checksum_update(&state, - page_address(ZERO_PAGE(0)), b); - b_len -= b; + page_address(ZERO_PAGE(0)), page_len); + b_len -= page_len; } - a.lo = bch2_checksum_final(&state); + a.lo = cpu_to_le64(bch2_checksum_final(&state)); a.lo ^= b.lo; a.hi ^= b.hi; return a; @@ -394,9 +396,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, unsigned csum_type; struct bch_csum csum; } splits[3] = { - { crc_a, len_a, new_csum_type }, - { crc_b, len_b, new_csum_type }, - { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + { crc_a, 
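		/*
		 * Editor's note: the "{ 0 }" members added to this initializer
		 * only spell out the previously implicit zero-initialization of
		 * the trailing csum field - presumably to silence
		 * missing-field-initializer warnings; behaviour is unchanged.
		 */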
len_a, new_csum_type, { 0 }}, + { crc_b, len_b, new_csum_type, { 0 } }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, }, *i; bool mergeable = crc_old.csum_type == new_csum_type && bch2_checksum_mergeable(new_csum_type); @@ -426,9 +428,10 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, merged = bch2_checksum_bio(c, crc_old.csum_type, extent_nonce(version, crc_old), bio); - if (bch2_crc_cmp(merged, crc_old.csum)) { - bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" + if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { + bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", + __func__, crc_old.csum.hi, crc_old.csum.lo, merged.hi, @@ -458,6 +461,48 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, return 0; } +/* BCH_SB_FIELD_crypt: */ + +static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + if (BCH_CRYPT_KDF_TYPE(crypt)) { + prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + return 0; +} + +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_newline(out); + prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + prt_newline(out); + prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + prt_newline(out); + prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, +}; + #ifdef __KERNEL__ static int __bch2_request_key(char *key_description, struct bch_key *key) { @@ -489,16 +534,31 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) { key_serial_t key_id; + key_id = request_key("user", key_description, NULL, + KEY_SPEC_SESSION_KEYRING); + if (key_id >= 0) + goto got_key; + key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_KEYRING); - if (key_id < 0) - return -errno; + if (key_id >= 0) + goto got_key; + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_SESSION_KEYRING); + if (key_id >= 0) + goto got_key; + + return -errno; +got_key: if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) return -1; return 0; } + +#include "../crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) @@ -511,9 +571,43 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ret = __bch2_request_key(key_description.buf, key); printbuf_exit(&key_description); + +#ifndef __KERNEL__ + if (ret) { + char *passphrase = read_passphrase("Enter passphrase: "); + struct bch_encrypted_key sb_key; + + bch2_passphrase_check(sb, passphrase, + key, &sb_key); + ret = 0; + } +#endif + + /* stash with memfd, pass memfd fd to mount */ + return ret; } +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *sb) +{ + key_serial_t key_id; + struct printbuf key_description = PRINTBUF; + + prt_printf(&key_description, "bcachefs:"); + 
pr_uuid(&key_description, sb->user_uuid.b); + + key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); + printbuf_exit(&key_description); + if (key_id < 0) + return errno; + + keyctl_revoke(key_id); + + return 0; +} +#endif + int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) @@ -534,7 +628,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, /* decrypt real key: */ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &sb_key, sizeof(sb_key)); + &sb_key, sizeof(sb_key)); if (ret) goto err; @@ -584,7 +678,7 @@ int bch2_disable_encryption(struct bch_fs *c) mutex_lock(&c->sb_lock); - crypt = bch2_sb_get_crypt(c->disk_sb.sb); + crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; @@ -597,7 +691,7 @@ int bch2_disable_encryption(struct bch_fs *c) if (ret) goto out; - crypt->key.magic = BCH_KEY_MAGIC; + crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); crypt->key.key = key; SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); @@ -618,14 +712,14 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) mutex_lock(&c->sb_lock); /* Do we already have an encryption key? */ - if (bch2_sb_get_crypt(c->disk_sb.sb)) + if (bch2_sb_field_get(c->disk_sb.sb, crypt)) goto err; ret = bch2_alloc_ciphers(c); if (ret) goto err; - key.magic = BCH_KEY_MAGIC; + key.magic = cpu_to_le64(BCH_KEY_MAGIC); get_random_bytes(&key.key, sizeof(key.key)); if (keyed) { @@ -646,7 +740,8 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) if (ret) goto err; - crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); + crypt = bch2_sb_field_resize(&c->disk_sb, crypt, + sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -BCH_ERR_ENOSPC_sb_crypt; goto err; @@ -687,7 +782,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) goto out; } - crypt = bch2_sb_get_crypt(c->disk_sb.sb); + crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 1ad1d5f..1399838 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -40,14 +40,16 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, */ #define csum_vstruct(_c, _type, _nonce, _i) \ ({ \ - const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ - const void *end = vstruct_end(_i); \ + const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ \ - bch2_checksum(_c, _type, _nonce, start, end - start); \ + bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ }) int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *); +#endif int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); @@ -72,6 +74,8 @@ static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, : 0; } +extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; + int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index c9ca7cc..a8b148e 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -3,7 +3,6 @@ #include "checksum.h" #include "compress.h" #include "extents.h" -#include "io.h" #include "super-io.h" #include @@ -571,7 +570,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; - bool 
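	/*
	 * Editor's note: this variable was only ever written, never read, so
	 * this hunk deletes it together with its dead store further down.
	 */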
decompress_workspace_needed; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); struct { @@ -581,7 +579,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t decompress_workspace; } compression_types[] = { { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), + 0 }, { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, @@ -620,9 +619,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!(features & (1 << i->feature))) continue; - if (i->decompress_workspace) - decompress_workspace_needed = true; - if (mempool_initialized(&c->compress_workspace[i->type])) continue; @@ -643,7 +639,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) static u64 compression_opt_to_feature(unsigned v) { unsigned type = bch2_compression_decode(v).type; - return 1ULL << bch2_compression_opt_to_feature[type]; + + return BIT_ULL(bch2_compression_opt_to_feature[type]); } int bch2_fs_compress_init(struct bch_fs *c) @@ -700,14 +697,32 @@ err: return ret; } +void bch2_compression_opt_to_text(struct printbuf *out, u64 v) +{ + struct bch_compression_opt opt = bch2_compression_decode(v); + + if (opt.type < BCH_COMPRESSION_OPT_NR) + prt_str(out, bch2_compression_opts[opt.type]); + else + prt_printf(out, "(unknown compression opt %u)", opt.type); + if (opt.level) + prt_printf(out, ":%u", opt.level); +} + void bch2_opt_compression_to_text(struct printbuf *out, struct bch_fs *c, struct bch_sb *sb, u64 v) { - struct bch_compression_opt opt = bch2_compression_decode(v); + return bch2_compression_opt_to_text(out, v); +} - prt_str(out, bch2_compression_opts[opt.type]); - if (opt.level) - prt_printf(out, ":%u", opt.level); +int bch2_opt_compression_validate(u64 v, struct printbuf *err) +{ + if (!bch2_compression_opt_valid(v)) { + prt_printf(err, "invalid compression opt %llu", v); + return -BCH_ERR_invalid_sb_opt_compression; + } + + return 0; } diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h index 052ea30..607fd5e 100644 --- a/libbcachefs/compress.h +++ b/libbcachefs/compress.h @@ -4,12 +4,18 @@ #include "extents_types.h" +static const unsigned __bch2_compression_opt_to_type[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + struct bch_compression_opt { u8 type:4, level:4; }; -static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) { return (struct bch_compression_opt) { .type = v & 15, @@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v) }; } +static inline bool bch2_compression_opt_valid(unsigned v) +{ + struct bch_compression_opt opt = __bch2_compression_decode(v); + + return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); +} + +static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +{ + return bch2_compression_opt_valid(v) + ? 
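		/*
		 * Editor's note: decoding is now guarded by
		 * bch2_compression_opt_valid(), so a corrupt on-disk option
		 * degrades to "no compression" instead of letting
		 * bch2_compression_opt_to_type() index past the end of
		 * __bch2_compression_opt_to_type[]; the new .validate hook
		 * below rejects such values at the point they are set.
		 */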
__bch2_compression_decode(v) + : (struct bch_compression_opt) { 0 }; +} + static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) { return opt.type|(opt.level << 4); } -static const unsigned __bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) { return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; @@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); int bch2_fs_compress_init(struct bch_fs *); +void bch2_compression_opt_to_text(struct printbuf *, u64); + int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +int bch2_opt_compression_validate(u64, struct printbuf *); #define bch2_opt_compression (struct bch_opt_fn) { \ - .parse = bch2_opt_compression_parse, \ - .to_text = bch2_opt_compression_to_text, \ + .parse = bch2_opt_compression_parse, \ + .to_text = bch2_opt_compression_to_text, \ + .validate = bch2_opt_compression_validate, \ } #endif /* _BCACHEFS_COMPRESS_H */ diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c index 442a9b8..02a996e 100644 --- a/libbcachefs/counters.c +++ b/libbcachefs/counters.c @@ -43,12 +43,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, prt_tab(out); prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); prt_newline(out); - }; + } }; int bch2_sb_counters_to_cpu(struct bch_fs *c) { - struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); u64 val = 0; @@ -66,13 +66,13 @@ int bch2_sb_counters_to_cpu(struct bch_fs *c) int bch2_sb_counters_from_cpu(struct bch_fs *c) { - struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_resize_counters(&c->disk_sb, + ret = bch2_sb_field_resize(&c->disk_sb, counters, sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 114f86b..43ea21a 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -8,7 +8,6 @@ * Inspired by CCAN's darray */ -#include "util.h" #include #define DARRAY(type) \ @@ -19,20 +18,25 @@ struct { \ typedef DARRAY(void) darray_void; -static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) +int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t); + +static inline int __darray_resize(darray_void *d, size_t element_size, + size_t new_size, gfp_t gfp) { - if (d->nr + more > d->size) { - size_t new_size = roundup_pow_of_two(d->nr + more); - void *data = krealloc_array(d->data, new_size, t_size, gfp); + return unlikely(new_size > d->size) + ? 
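	/*
	 * Editor's note: the krealloc_array() growth path moves out of line
	 * into __bch2_darray_resize(), declared above and presumably defined
	 * in a .c file, leaving only the cheap size check to be inlined at
	 * call sites.
	 */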
__bch2_darray_resize(d, element_size, new_size, gfp) + : 0; +} - if (!data) - return -ENOMEM; +#define darray_resize_gfp(_d, _new_size, _gfp) \ + __darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) - d->data = data; - d->size = new_size; - } +#define darray_resize(_d, _new_size) \ + darray_resize_gfp(_d, _new_size, GFP_KERNEL) - return 0; +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) +{ + return __darray_resize(d, t_size, d->nr + more, gfp); } #define darray_make_room_gfp(_d, _more, _gfp) \ @@ -41,6 +45,8 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, #define darray_make_room(_d, _more) \ darray_make_room_gfp(_d, _more, GFP_KERNEL) +#define darray_room(_d) ((_d).size - (_d).nr) + #define darray_top(_d) ((_d).data[(_d).nr]) #define darray_push_gfp(_d, _item, _gfp) \ @@ -69,9 +75,15 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, _ret; \ }) +#define darray_remove_item(_d, _pos) \ + array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + #define darray_for_each(_d, _i) \ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) +#define darray_for_each_reverse(_d, _i) \ + for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + #define darray_init(_d) \ do { \ (_d)->data = NULL; \ diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index cfc6244..55769d7 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -9,10 +9,11 @@ #include "ec.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "keylist.h" #include "move.h" #include "nocow_locking.h" +#include "rebalance.h" #include "subvolume.h" #include "trace.h" @@ -49,10 +50,6 @@ static void trace_move_extent_fail2(struct data_update *m, if (insert) { i = 0; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - struct bkey_s new_s; - new_s.k = (void *) new.k; - new_s.v = (void *) new.v; - if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) @@ -165,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); - /* - * See comment below: bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); - */ rewrites_found |= 1U << i; } i++; @@ -215,14 +208,8 @@ restart_drop_extra_replicas: if (!p.ptr.cached && durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); - /* - * Currently, we're dropping unneeded replicas - * instead of marking them as cached, since - * cached data in stripe buckets prevents them - * from being reused: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); - */ goto restart_drop_extra_replicas; } } @@ -252,19 +239,47 @@ restart_drop_extra_replicas: next_pos = insert->k.p; + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to 
insert invalid key in data update path");
+		prt_str(&buf, "\nold: ");
+		bch2_bkey_val_to_text(&buf, c, old);
+		prt_str(&buf, "\nk: ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		prt_str(&buf, "\nnew: ");
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+		bch2_print_string_as_lines(KERN_ERR, buf.buf);
+		printbuf_exit(&buf);
+
+		bch2_fatal_error(c);
+		goto out;
+	}
+
 		ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, bkey_start_pos(&insert->k)) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-						k.k->p, insert->k.p);
-		if (ret)
-			goto err;
-
-		ret = bch2_trans_update(trans, &iter, insert,
+						k.k->p, insert->k.p) ?:
+			bch2_bkey_set_needs_rebalance(c, insert,
+						op->opts.background_target,
+						op->opts.background_compression) ?:
+			bch2_trans_update(trans, &iter, insert,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
			bch2_trans_commit(trans, &op->res, NULL,
-				BTREE_INSERT_NOCHECK_RW|
-				BTREE_INSERT_NOFAIL|
+				BCH_TRANS_COMMIT_no_check_rw|
+				BCH_TRANS_COMMIT_no_enospc|
				m->data_opts.btree_insert_flags);
 		if (!ret) {
			bch2_btree_iter_set_pos(&iter, next_pos);
@@ -285,11 +300,11 @@ next:
 		}
 		continue;
 nowork:
-	if (m->ctxt && m->ctxt->stats) {
+	if (m->stats) {
 		BUG_ON(k.k->p.offset <= iter.pos.offset);
-		atomic64_inc(&m->ctxt->stats->keys_raced);
+		atomic64_inc(&m->stats->keys_raced);
 		atomic64_add(k.k->p.offset - iter.pos.offset,
-			     &m->ctxt->stats->sectors_raced);
+			     &m->stats->sectors_raced);
 	}
 
	this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@@ -307,7 +322,7 @@ out:
 
 int bch2_data_update_index_update(struct bch_write_op *op)
 {
-	return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
+	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
 }
 
 void bch2_data_update_read_done(struct data_update *m,
@@ -415,7 +430,7 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
			break;
 		}
 
-	if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
+	if (closure_nr_remaining(&cl) != 1) {
 		bch2_trans_unlock(trans);
 		closure_sync(&cl);
 	}
@@ -443,6 +458,8 @@ int bch2_data_update_init(struct btree_trans *trans,
	bch2_bkey_buf_reassemble(&m->k, c, k);
	m->btree_id	= btree_id;
	m->data_opts	= data_opts;
+	m->ctxt		= ctxt;
+	m->stats	= ctxt ?
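		/*
		 * Editor's note: struct data_update now carries its own stats
		 * pointer, snapshotted from ctxt here, so the keys_raced /
		 * sectors_raced accounting in the nowork path above no longer
		 * reaches through m->ctxt.
		 */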
ctxt->stats : NULL; bch2_write_op_init(&m->op, c, io_opts); m->op.pos = bkey_start_pos(k.k); @@ -491,7 +508,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (c->opts.nocow_enabled) { if (ctxt) { - move_ctxt_wait_event(ctxt, trans, + move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || !atomic_read(&ctxt->read_sectors)); diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index 49e9055..9dc17b9 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -4,7 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" -#include "io_types.h" +#include "io_write_types.h" struct moving_context; @@ -23,6 +23,7 @@ struct data_update { struct bkey_buf k; struct data_update_opts data_opts; struct moving_context *ctxt; + struct bch_move_stats *stats; struct bch_write_op op; }; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index ae47e18..57c5128 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -19,7 +19,6 @@ #include "extents.h" #include "fsck.h" #include "inode.h" -#include "io.h" #include "super.h" #include @@ -154,10 +153,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) BUG_ON(b->nsets != 1); for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) - if (k->type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); - v->mem_ptr = 0; - } + if (k->type == KEY_TYPE_btree_ptr_v2) + ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; v = c->verify_data; bkey_copy(&v->key, &b->key); @@ -322,16 +319,16 @@ static ssize_t flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); - int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); - if (err) - return err; + i->ret += copied; + i->ubuf += copied; + i->size -= copied; + i->buf.pos -= copied; + memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - i->ret += bytes; - i->ubuf += bytes; - i->size -= bytes; - i->buf.pos -= bytes; - memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + if (copied != bytes) + return -EFAULT; } return i->size ? 
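	/*
	 * Editor's note: flush_buf() now advances by the bytes actually
	 * copied - copy_to_user() returns the count it could NOT copy - and
	 * only reports -EFAULT after accounting for a partial copy, instead
	 * of returning the raw copy_to_user() residue as if it were an errno.
	 */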
0 : i->ret; @@ -369,7 +366,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; ssize_t ret; @@ -382,17 +379,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (ret) return ret; - bch2_trans_init(&trans, i->c, 0, 0); - ret = for_each_btree_key2(&trans, iter, i->id, i->from, + trans = bch2_trans_get(i->c); + ret = for_each_btree_key2(trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); - drop_locks_do(&trans, flush_buf(i)); + drop_locks_do(trans, flush_buf(i)); })); i->from = iter.pos; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -411,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct btree *b; ssize_t ret; @@ -427,26 +424,26 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - bch2_trans_init(&trans, i->c, 0, 0); + trans = bch2_trans_get(i->c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { bch2_btree_node_to_text(&i->buf, i->c, b); i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; - ret = drop_locks_do(&trans, flush_buf(i)); + ret = drop_locks_do(trans, flush_buf(i)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -465,7 +462,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; ssize_t ret; @@ -478,9 +475,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (ret) return ret; - bch2_trans_init(&trans, i->c, 0, 0); + trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(&trans, iter, i->id, i->from, + ret = for_each_btree_key2(trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct btree_path_level *l = &iter.path->l[0]; @@ -493,11 +490,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, } bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(&trans, flush_buf(i)); + drop_locks_do(trans, flush_buf(i)); })); i->from = iter.pos; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -520,7 +517,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * prt_printf(out, "%px btree=%s l=%u ", b, - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level); prt_newline(out); @@ -922,18 +919,18 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_ids[bd->id], + debugfs_create_file(bch2_btree_id_str(bd->id), 0400, c->btree_debug_dir, bd, 
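		/*
		 * Editor's note: debugfs file names now come from
		 * bch2_btree_id_str() rather than indexing the bch2_btree_ids[]
		 * array directly, matching the same conversion applied
		 * throughout this patch.
		 */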
&btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", - bch2_btree_ids[bd->id]); + bch2_btree_id_str(bd->id)); debugfs_create_file(name, 0400, c->btree_debug_dir, bd, &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_ids[bd->id]); + bch2_btree_id_str(bd->id)); debugfs_create_file(name, 0400, c->btree_debug_dir, bd, &bfloat_failed_debug_ops); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 065ea59..0542d99 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -13,12 +13,25 @@ #include -unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { - unsigned len = bkey_val_bytes(d.k) - - offsetof(struct bch_dirent, d_name); + unsigned bkey_u64s = bkey_val_u64s(d.k); + unsigned bkey_bytes = bkey_u64s * sizeof(u64); + u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; +#if CPU_BIG_ENDIAN + unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; +#else + unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8; +#endif + + return bkey_bytes - + offsetof(struct bch_dirent, d_name) - + trailing_nuls; +} - return strnlen(d.v->d_name, len); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) +{ + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -41,7 +54,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + struct qstr name = bch2_dirent_get_name(d); return bch2_dirent_hash(info, &name); } @@ -49,20 +62,20 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - int len = bch2_dirent_name_bytes(l); - const struct qstr *r = _r; + const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr *r_name = _r; - return len - r->len ?: memcmp(l.v->d_name, r->name, len); + return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); } static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - int l_len = bch2_dirent_name_bytes(l); - int r_len = bch2_dirent_name_bytes(r); + const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr r_name = bch2_dirent_get_name(r); - return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); + return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); } static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -84,63 +97,62 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - unsigned len; - - len = bch2_dirent_name_bytes(d); - if (!len) { - prt_printf(err, "empty name"); - return -BCH_ERR_invalid_bkey; - } - - if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { - prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(len)); - return 
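	/*
	 * Editor's worked example, not in the patch: in the new
	 * bch2_dirent_name_bytes() above, d_name is NUL-padded out to a
	 * multiple of 8 bytes, so a name whose final u64 holds only 5 name
	 * bytes has 3 trailing NULs; on little endian those occupy the
	 * high-order bits of last_u64 and __builtin_clzll()/8 counts them,
	 * while on big endian they sit in the low-order bits and
	 * __builtin_ctzll()/8 does. A last_u64 of zero counts as all
	 * 64/8 bytes of padding.
	 */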
-BCH_ERR_invalid_bkey; - } - - if (len > BCH_NAME_MAX) { - prt_printf(err, "dirent name too big (%u > %u)", - len, BCH_NAME_MAX); - return -BCH_ERR_invalid_bkey; - } - - if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { - prt_printf(err, "invalid name"); - return -BCH_ERR_invalid_bkey; - } - - if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { - prt_printf(err, "invalid name"); - return -BCH_ERR_invalid_bkey; - } + struct qstr d_name = bch2_dirent_get_name(d); + int ret = 0; - if (memchr(d.v->d_name, '/', len)) { - prt_printf(err, "invalid name"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(!d_name.len, c, err, + dirent_empty_name, + "empty name"); - if (d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode) { - prt_printf(err, "dirent points to own directory"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, + dirent_val_too_big, + "value too big (%zu > %u)", + bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); - return 0; + /* + * Check new keys don't exceed the max length + * (older keys may be larger.) + */ + bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, + dirent_name_too_long, + "dirent name too big (%u > %u)", + d_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, + dirent_name_embedded_nul, + "dirent has stray data after name's NUL"); + + bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, + dirent_name_dot_or_dotdot, + "invalid name"); + + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, + dirent_name_has_slash, + "name with /"); + + bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, + dirent_to_itself, + "dirent points to own directory"); +fsck_err: + return ret; } void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); prt_printf(out, "%.*s -> %llu type %s", - bch2_dirent_name_bytes(d), - d.v->d_name, + d_name.len, + d_name.name, d.v->d_type != DT_SUBVOL ? 
le64_to_cpu(d.v->d_inum) : le32_to_cpu(d.v->d_child_subvol), @@ -189,7 +201,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, int flags) + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) { struct bkey_i_dirent *dirent; int ret; @@ -200,7 +213,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); + dir, &dirent->k_i, str_hash_flags); *dir_offset = dirent->k.p.offset; return ret; @@ -457,21 +470,19 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, const struct qstr *name, subvol_inum *inum) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret) - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -500,25 +511,25 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; subvol_inum target; u32 snapshot; struct bkey_buf sk; + struct qstr name; int ret; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, SPOS(inum.inum, ctx->pos, snapshot), POS(inum.inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) @@ -526,7 +537,7 @@ retry: dirent = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + ret = bch2_dirent_read_target(trans, inum, dirent, &target); if (ret < 0) break; if (ret) @@ -535,11 +546,13 @@ retry: /* dir_emit() can fault and block: */ bch2_bkey_buf_reassemble(&sk, c, k); dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); + + name = bch2_dirent_get_name(dirent); ctx->pos = dirent.k->p.offset; - if (!dir_emit(ctx, dirent.v->d_name, - bch2_dirent_name_bytes(dirent), + if (!dir_emit(ctx, name.name, + name.len, target.inum, vfs_d_type(dirent.v->d_type))) break; @@ -549,16 +562,16 @@ retry: * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it */ - ret = btree_trans_too_many_iters(&trans); + ret = btree_trans_too_many_iters(trans); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; diff --git a/libbcachefs/dirent.h 
b/libbcachefs/dirent.h index b42f4a1..8a55245 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -7,7 +7,7 @@ enum bkey_invalid_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -24,7 +24,7 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; -unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); static inline unsigned dirent_val_u64s(unsigned len) { @@ -37,7 +37,8 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum, int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, int); + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); static inline unsigned vfs_d_type(unsigned type) { diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index de14ca3..4d0cb0c 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "disk_groups.h" +#include "sb-members.h" #include "super-io.h" #include @@ -24,28 +25,27 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g, *sorted = NULL; - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); unsigned nr_groups = disk_groups_nr(groups); unsigned i, len; int ret = 0; for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - unsigned g; + struct bch_member m = bch2_sb_member_get(sb, i); + unsigned group_id; - if (!BCH_MEMBER_GROUP(m)) + if (!BCH_MEMBER_GROUP(&m)) continue; - g = BCH_MEMBER_GROUP(m) - 1; + group_id = BCH_MEMBER_GROUP(&m) - 1; - if (g >= nr_groups) { + if (group_id >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", - i, g, nr_groups); + i, group_id, nr_groups); return -BCH_ERR_invalid_sb_disk_groups; } - if (BCH_GROUP_DELETED(&groups->entries[g])) { - prt_printf(err, "disk %u has deleted label %u", i, g); + if (BCH_GROUP_DELETED(&groups->entries[group_id])) { + prt_printf(err, "disk %u has deleted label %u", i, group_id); return -BCH_ERR_invalid_sb_disk_groups; } } @@ -151,22 +151,19 @@ const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) { - struct bch_sb_field_members *mi; struct bch_sb_field_disk_groups *groups; struct bch_disk_groups_cpu *cpu_g, *old_g; unsigned i, g, nr_groups; lockdep_assert_held(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - groups = bch2_sb_get_disk_groups(c->disk_sb.sb); + groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); nr_groups = disk_groups_nr(groups); if (!groups) return 0; - cpu_g = kzalloc(sizeof(*cpu_g) + - sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); + cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) return -BCH_ERR_ENOMEM_disk_groups_to_cpu; @@ -178,17 +175,17 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) dst->deleted = BCH_GROUP_DELETED(src); dst->parent = BCH_GROUP_PARENT(src); + memcpy(dst->label, src->label, sizeof(dst->label)); } for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - struct bch_disk_group_cpu *dst = - 
&cpu_g->entries[BCH_MEMBER_GROUP(m)]; + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); + struct bch_disk_group_cpu *dst; - if (!bch2_member_exists(m)) + if (!bch2_member_exists(&m)) continue; - g = BCH_MEMBER_GROUP(m); + g = BCH_MEMBER_GROUP(&m); while (g) { dst = &cpu_g->entries[g - 1]; __set_bit(i, dst->devs.d); @@ -299,7 +296,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, const char *name, unsigned namelen) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + bch2_sb_field_get(sb->sb, disk_groups); unsigned i, nr_groups = disk_groups_nr(groups); struct bch_disk_group *g; @@ -317,7 +314,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, sizeof(struct bch_disk_group) * (nr_groups + 1)) / sizeof(u64); - groups = bch2_sb_resize_disk_groups(sb, u64s); + groups = bch2_sb_field_resize(sb, disk_groups, u64s); if (!groups) return -BCH_ERR_ENOSPC_disk_label_add; @@ -341,7 +338,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + bch2_sb_field_get(sb->sb, disk_groups); int v = -1; do { @@ -371,7 +368,7 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) if (*next == '.') next++; - groups = bch2_sb_get_disk_groups(sb->sb); + groups = bch2_sb_field_get(sb->sb, disk_groups); v = __bch2_disk_group_find(groups, parent, name, len); if (v < 0) @@ -386,10 +383,60 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + struct bch_disk_groups_cpu *groups; + struct bch_disk_group_cpu *g; + unsigned nr = 0; + u16 path[32]; + + out->atomic++; + rcu_read_lock(); + groups = rcu_dereference(c->disk_groups); + if (!groups) + goto invalid; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto invalid; + + if (v >= groups->nr) + goto invalid; + + g = groups->entries + v; + + if (g->deleted) + goto invalid; + + path[nr++] = v; + + if (!g->parent) + break; + + v = g->parent - 1; + } + + while (nr) { + v = path[--nr]; + g = groups->entries + v; + + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + prt_printf(out, "."); + } +out: + rcu_read_unlock(); + out->atomic--; + return; +invalid: + prt_printf(out, "invalid label %u", v); + goto out; +} + +void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb); + bch2_sb_field_get(sb, disk_groups); struct bch_disk_group *g; unsigned nr = 0; u16 path[32]; @@ -443,7 +490,7 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) if (ret) return ret; - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_GROUP(mi, v + 1); return 0; } @@ -497,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, return -EINVAL; } -void bch2_opt_target_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) +void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) { struct target t = target_decode(v); @@ -508,48 +552,71 @@ void bch2_opt_target_to_text(struct printbuf *out, case TARGET_NULL: 
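		/*
		 * Editor's note: bch2_opt_target_to_text() is split into this
		 * runtime variant, which walks the in-memory c->devs and
		 * disk-groups state under RCU, and a _sb variant further down
		 * that reads member info straight from the superblock for
		 * contexts where no struct bch_fs exists yet.
		 */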
prt_printf(out, "none"); break; - case TARGET_DEV: - if (c) { - struct bch_dev *ca; - - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && percpu_ref_tryget(&ca->io_ref)) { - prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); - percpu_ref_put(&ca->io_ref); - } else if (ca) { - prt_printf(out, "offline device %u", t.dev); - } else { - prt_printf(out, "invalid device %u", t.dev); - } - - rcu_read_unlock(); + case TARGET_DEV: { + struct bch_dev *ca; + + out->atomic++; + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + prt_printf(out, "offline device %u", t.dev); } else { - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_member *m = mi->members + t.dev; - - if (bch2_dev_exists(sb, mi, t.dev)) { - prt_printf(out, "Device "); - pr_uuid(out, m->uuid.b); - prt_printf(out, " (%u)", t.dev); - } else { - prt_printf(out, "Bad device %u", t.dev); - } + prt_printf(out, "invalid device %u", t.dev); } + + rcu_read_unlock(); + out->atomic--; break; + } case TARGET_GROUP: - if (c) { - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); - mutex_unlock(&c->sb_lock); + bch2_disk_path_to_text(out, c, t.group); + break; + default: + BUG(); + } +} + +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct target t = target_decode(v); + + switch (t.type) { + case TARGET_NULL: + prt_printf(out, "none"); + break; + case TARGET_DEV: { + struct bch_member m = bch2_sb_member_get(sb, t.dev); + + if (bch2_dev_exists(sb, t.dev)) { + prt_printf(out, "Device "); + pr_uuid(out, m.uuid.b); + prt_printf(out, " (%u)", t.dev); } else { - bch2_disk_path_to_text(out, sb, t.group); + prt_printf(out, "Bad device %u", t.dev); } break; + } + case TARGET_GROUP: + bch2_disk_path_to_text_sb(out, sb, t.group); + break; default: BUG(); } } + +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + if (c) + bch2_target_to_text(out, c, v); + else + bch2_target_to_text_sb(out, sb, v); +} diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index bd77117..441826f 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_DISK_GROUPS_H #define _BCACHEFS_DISK_GROUPS_H +#include "disk_groups_types.h" + extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) @@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *); /* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); +void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); + +void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index f58e84a..c730f09 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -11,10 +11,11 @@ #include "btree_update.h" #include 
"btree_write_buffer.h" #include "buckets.h" +#include "checksum.h" #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_read.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" @@ -104,29 +105,26 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + int ret = 0; - if (bkey_eq(k.k->p, POS_MIN)) { - prt_printf(err, "stripe at POS_MIN"); - return -BCH_ERR_invalid_bkey; - } - - if (k.k->p.inode) { - prt_printf(err, "nonzero inode field"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || + bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, + stripe_pos_bad, + "stripe at bad pos"); - if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { - prt_printf(err, "incorrect value size (%zu < %u)", - bkey_val_u64s(k.k), stripe_val_u64s(s)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, + stripe_val_size_bad, + "incorrect value size (%zu < %u)", + bkey_val_u64s(k.k), stripe_val_u64s(s)); - return bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -152,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); if (i < nr_data) prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + prt_printf(out, " gen %u", ptr->gen); if (ptr_stale(ca, ptr)) prt_printf(out, " stale"); } @@ -305,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - struct printbuf buf2 = PRINTBUF; + struct printbuf err = PRINTBUF; + struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + + prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", + want.hi, want.lo, + got.hi, got.lo, + bch2_csum_types[v->csum_type]); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); - - bch_err_ratelimited(c, - "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", - (void *) _RET_IP_, i, j, v->csum_type, - want.lo, got.lo, buf2.buf); - printbuf_exit(&buf2); clear_bit(i, buf->valid); + + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); break; } @@ -372,7 +376,11 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, + bio_data_dir(bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "erasure coding %s error: %s", bio_data_dir(bio) ? 
"write" : "read", bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); @@ -473,14 +481,10 @@ err: return ret; } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -{ - return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); -} - /* recovery read path: */ -int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) { + struct bch_fs *c = trans->c; struct ec_stripe_buf *buf; struct closure cl; struct bch_stripe *v; @@ -495,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) if (!buf) return -BCH_ERR_ENOMEM_ec_read_extent; - ret = get_stripe_key(c, rbio->pick.ec.idx, buf); + ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { bch_err_ratelimited(c, "error doing reconstruct read: error %i looking up stripe", ret); @@ -787,12 +791,10 @@ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret; u64 idx; - bch2_trans_init(&trans, c, 0, 0); - while (1) { mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); @@ -801,15 +803,15 @@ static void ec_stripe_delete_work(struct work_struct *work) if (!idx) break; - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_delete(&trans, idx)); + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + ec_stripe_delete(trans, idx)); if (ret) { bch_err_fn(c, ret); break; } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -981,8 +983,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b while (1) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, s, &bp_pos)); if (ret) @@ -998,24 +1000,22 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) goto err; for (i = 0; i < nr_data; i++) { - ret = ec_stripe_update_bucket(&trans, s, i); + ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1121,9 +1121,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = bch2_trans_do(c, &s->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, - ec_stripe_key_update(&trans, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, + ec_stripe_key_update(trans, bkey_i_to_stripe(&s->new_stripe.key), !s->have_existing_stripe)); if (ret) { @@ -1133,8 +1133,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = ec_stripe_update_extents(c, &s->new_stripe); if (ret) { - bch_err(c, "error creating stripe: error updating pointers: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "creating stripe: error updating pointers"); goto err; } err: @@ -1374,6 +1373,15 @@ 
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1425,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1682,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); - if (!h) - bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) return h; @@ -1822,7 +1833,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; const struct bch_stripe *s; @@ -1830,9 +1841,7 @@ int bch2_stripes_read(struct bch_fs *c) unsigned i; int ret; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { if (k.k->type != KEY_TYPE_stripe) continue; @@ -1855,9 +1864,9 @@ int bch2_stripes_read(struct bch_fs *c) bch2_stripes_heap_insert(c, m, k.k->p.offset); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 885ae5d..7d0237c 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -8,7 +8,7 @@ enum bkey_invalid_flags; -int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -199,7 +199,7 @@ struct ec_stripe_head { struct ec_stripe_new *s; }; -int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); +int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); @@ -240,7 +240,7 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, bch2_ec_do_stripe_creates(c); break; default: - unreachable(); + BUG(); } } diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c index dc906fc..d260ff9 100644 --- a/libbcachefs/errcode.c +++ b/libbcachefs/errcode.c @@ -12,8 +12,6 @@ static const char * const bch2_errcode_strs[] = { NULL }; -#define BCH_ERR_0 0 - static unsigned bch2_errcode_parents[] = { #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() @@ -61,3 +59,10 @@ int __bch2_err_class(int err) return -err; } + +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 735eb24..e5c3262 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -3,6 +3,8 @@ #define _BCACHEFS_ERRCODE_H #define BCH_ERRCODES() \ + x(ERANGE, 
ERANGE_option_too_small) \ + x(ERANGE, ERANGE_option_too_big) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -71,7 +73,6 @@ x(ENOMEM, ENOMEM_fsck_add_nlink) \ x(ENOMEM, ENOMEM_journal_key_insert) \ x(ENOMEM, ENOMEM_journal_keys_sort) \ - x(ENOMEM, ENOMEM_journal_replay) \ x(ENOMEM, ENOMEM_read_superblock_clean) \ x(ENOMEM, ENOMEM_fs_alloc) \ x(ENOMEM, ENOMEM_fs_name_alloc) \ @@ -91,6 +92,7 @@ x(ENOSPC, ENOSPC_sb_quota) \ x(ENOSPC, ENOSPC_sb_replicas) \ x(ENOSPC, ENOSPC_sb_members) \ + x(ENOSPC, ENOSPC_sb_members_v2) \ x(ENOSPC, ENOSPC_sb_crypt) \ x(ENOSPC, ENOSPC_btree_slot) \ x(ENOSPC, ENOSPC_snapshot_tree) \ @@ -99,6 +101,7 @@ x(ENOENT, ENOENT_str_hash_set_must_replace) \ x(ENOENT, ENOENT_inode) \ x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_not_directory) \ x(ENOENT, ENOENT_directory_dead) \ x(ENOENT, ENOENT_subvolume) \ x(ENOENT, ENOENT_snapshot_tree) \ @@ -211,8 +214,23 @@ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ x(BCH_ERR_invalid_sb, invalid_sb_clean) \ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid_sb, invalid_sb_errors) \ + x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ + x(0, nopromote) \ + x(BCH_ERR_nopromote, nopromote_may_not) \ + x(BCH_ERR_nopromote, nopromote_already_promoted) \ + x(BCH_ERR_nopromote, nopromote_unwritten) \ + x(BCH_ERR_nopromote, nopromote_congested) \ + x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_enomem) enum bch_errcode { BCH_ERR_START = 2048, @@ -243,4 +261,8 @@ static inline long bch2_err_class(long err) return err < 0 ? 
__bch2_err_class(err) : err; } +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +const char *bch2_blk_status_to_str(blk_status_t); + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 39009cf..7b28d37 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "io.h" #include "super.h" #define FSCK_ERR_RATELIMIT_NR 10 @@ -57,8 +56,9 @@ void bch2_io_error_work(struct work_struct *work) up_write(&c->state_lock); } -void bch2_io_error(struct bch_dev *ca) +void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { + atomic64_inc(&ca->errors[type]); //queue_work(system_long_wq, &ca->io_error_work); } @@ -117,31 +117,34 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) return NULL; - list_for_each_entry(s, &c->fsck_errors, list) + list_for_each_entry(s, &c->fsck_error_msgs, list) if (s->fmt == fmt) { /* * move it to the head of the list: repeated fsck errors * are common */ - list_move(&s->list, &c->fsck_errors); + list_move(&s->list, &c->fsck_error_msgs); return s; } s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) { - if (!c->fsck_alloc_err) + if (!c->fsck_alloc_msgs_err) bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); - c->fsck_alloc_err = true; + c->fsck_alloc_msgs_err = true; return NULL; } INIT_LIST_HEAD(&s->list); s->fmt = fmt; - list_add(&s->list, &c->fsck_errors); + list_add(&s->list, &c->fsck_error_msgs); return s; } -int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) +int bch2_fsck_err(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err, + const char *fmt, ...) { struct fsck_err_state *s = NULL; va_list args; @@ -149,11 +152,13 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + bch2_sb_error_count(c, err); + va_start(args, fmt); prt_vprintf(out, fmt, args); va_end(args); - mutex_lock(&c->fsck_error_lock); + mutex_lock(&c->fsck_error_msgs_lock); s = fsck_err_get(c, fmt); if (s) { /* @@ -163,7 +168,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; - mutex_unlock(&c->fsck_error_lock); + mutex_unlock(&c->fsck_error_msgs_lock); printbuf_exit(&buf); return ret; } @@ -258,7 +263,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
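/*
 * The fsck error interface gains an error-table dimension: every message is
 * now keyed by an enum bch_sb_error_id, and bch2_fsck_err() calls
 * bch2_sb_error_count() so occurrences are tallied in the superblock before
 * the fix/ignore decision is made. A hedged sketch of a post-patch caller;
 * the key_looks_wrong condition and the foo_key_bad identifier (which the
 * macro pastes into BCH_FSCK_ERR_foo_key_bad) are hypothetical:
 */
static int example_check(struct bch_fs *c, bool key_looks_wrong)
{
	int ret = 0;

	if (fsck_err_on(key_looks_wrong, c, foo_key_bad,
			"key failed validation")) {
		/* macro evaluated true: the error is fixable, repair it here */
	}
fsck_err:
	/* fatal results from the macro jump here with ret already set */
	return ret;
}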
if (s) s->ret = ret; - mutex_unlock(&c->fsck_error_lock); + mutex_unlock(&c->fsck_error_msgs_lock); printbuf_exit(&buf); @@ -279,9 +284,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c) { struct fsck_err_state *s, *n; - mutex_lock(&c->fsck_error_lock); + mutex_lock(&c->fsck_error_msgs_lock); - list_for_each_entry_safe(s, n, &c->fsck_errors, list) { + list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { if (s->ratelimited && s->last_msg) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); @@ -290,5 +295,5 @@ void bch2_flush_fsck_errs(struct bch_fs *c) kfree(s); } - mutex_unlock(&c->fsck_error_lock); + mutex_unlock(&c->fsck_error_msgs_lock); } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 7ce9540..d167d65 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -4,6 +4,7 @@ #include #include +#include "sb-errors.h" struct bch_dev; struct bch_fs; @@ -101,18 +102,26 @@ struct fsck_err_state { char *last_msg; }; -#define FSCK_CAN_FIX (1 << 0) -#define FSCK_CAN_IGNORE (1 << 1) -#define FSCK_NEED_FSCK (1 << 2) -#define FSCK_NO_RATELIMIT (1 << 3) +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, +}; + +#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -__printf(3, 4) __cold -int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); +__printf(4, 5) __cold +int bch2_fsck_err(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); void bch2_flush_fsck_errs(struct bch_fs *); -#define __fsck_err(c, _flags, msg, ...) \ +#define __fsck_err(c, _flags, _err_type, ...) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ + int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ + __VA_ARGS__); \ \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ @@ -127,26 +136,53 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -#define __fsck_err_on(cond, c, _flags, ...) \ - (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) +#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + +#define need_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + +#define mustfix_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + +#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) -#define need_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +#define fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -#define need_fsck_err(c, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +#define fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -#define mustfix_fsck_err(c, ...) \ - __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) +static inline void bch2_bkey_fsck_err(struct bch_fs *c, + struct printbuf *err_msg, + enum bch_sb_error_id err_type, + const char *fmt, ...) +{ + va_list args; -#define mustfix_fsck_err_on(cond, c, ...) 
\ - __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) + va_start(args, fmt); + prt_vprintf(err_msg, fmt, args); + va_end(args); -#define fsck_err(c, ...) \ - __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) +} -#define fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) +#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +do { \ + prt_printf(_err_msg, __VA_ARGS__); \ + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ + ret = -BCH_ERR_invalid_bkey; \ + goto fsck_err; \ +} while (0) + +#define bkey_fsck_err_on(cond, ...) \ +do { \ + if (unlikely(cond)) \ + bkey_fsck_err(__VA_ARGS__); \ +} while (0) /* * Fatal errors: these don't indicate a bug, but we can't continue running in RW @@ -179,26 +215,26 @@ do { \ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ -void bch2_io_error(struct bch_dev *); +void bch2_io_error(struct bch_dev *, enum bch_member_error_type); -#define bch2_dev_io_err_on(cond, ca, ...) \ +#define bch2_dev_io_err_on(cond, ca, _type, ...) \ ({ \ bool _ret = (cond); \ \ if (_ret) { \ bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca); \ + bch2_io_error(ca, _type); \ } \ _ret; \ }) -#define bch2_dev_inum_io_err_on(cond, ca, ...) \ +#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ ({ \ bool _ret = (cond); \ \ if (_ret) { \ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca); \ + bch2_io_error(ca, _type); \ } \ _ret; \ }) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index c13e0af..a864de2 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "buckets.h" #include "checksum.h" +#include "compress.h" #include "debug.h" #include "disk_groups.h" #include "error.h" @@ -162,17 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { - prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, + btree_ptr_val_too_big, + "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -181,17 +184,20 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { - prt_printf(err, "value too big (%zu > %zu)", - bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, + btree_ptr_v2_val_too_big, + "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void 
bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -372,19 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + int ret = 0; - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { - prt_printf(err, "invalid nr_replicas (%u)", - r.v->nr_replicas); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, + reservation_key_nr_replicas_invalid, + "invalid nr_replicas (%u)", r.v->nr_replicas); +fsck_err: + return ret; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -517,13 +522,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); - memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum)); + dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); break; case BCH_EXTENT_ENTRY_crc64: set_common_fields(dst->crc64, src); dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + dst->crc64.csum_lo = (u64 __force) src.csum.lo; + dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); break; case BCH_EXTENT_ENTRY_crc128: set_common_fields(dst->crc128, src); @@ -757,18 +762,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -{ - union bch_extent_entry *next = extent_entry_next(entry); - - /* stripes have ptrs, but their layout doesn't work with this code */ - BUG_ON(k.k->type == KEY_TYPE_stripe); - - memmove_u64s_down(entry, next, - (u64 *) bkey_val_end(k) - (u64 *) next); - k.k->u64s -= (u64 *) next - (u64 *) entry; -} - /* * Returns pointer to the next entry after the one being dropped: */ @@ -915,11 +908,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return true; + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; return false; } else { @@ -992,10 +985,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - const struct bch_extent_stripe_ptr *ec; - struct bch_dev *ca; bool first = true; if (c) @@ -1006,9 +995,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *ptr = entry_to_ptr(entry); + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? 
bch_dev_bkey_exists(c, ptr->dev) : NULL; @@ -1030,10 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " stale"); } break; + } case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + case BCH_EXTENT_ENTRY_crc128: { + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, @@ -1042,12 +1033,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_csum_types[crc.csum_type], bch2_compression_types[crc.compression_type]); break; - case BCH_EXTENT_ENTRY_stripe_ptr: - ec = &entry->stripe_ptr; + } + case BCH_EXTENT_ENTRY_stripe_ptr: { + const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; + } + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + prt_str(out, "rebalance: target "); + if (c) + bch2_target_to_text(out, c, r->target); + else + prt_printf(out, "%u", r->target); + prt_str(out, " compression "); + bch2_compression_opt_to_text(out, r->compression); + break; + } default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1057,8 +1062,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static int extent_ptr_invalid(const struct bch_fs *c, +static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata, @@ -1069,43 +1075,44 @@ static int extent_ptr_invalid(const struct bch_fs *c, u64 bucket; u32 bucket_offset; struct bch_dev *ca; + int ret = 0; if (!bch2_dev_exists2(c, ptr->dev)) { - prt_printf(err, "pointer to invalid device (%u)", ptr->dev); - return -BCH_ERR_invalid_bkey; + /* + * If we're in the write path this key might have already been + * overwritten, and we could be seeing a device that doesn't + * exist anymore due to racing with device removal: + */ + if (flags & BKEY_INVALID_WRITE) + return 0; + + bkey_fsck_err(c, err, ptr_to_invalid_device, + "pointer to invalid device (%u)", ptr->dev); } ca = bch_dev_bkey_exists(c, ptr->dev); bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) { - prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, + ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - if (bucket >= ca->mi.nbuckets) { - prt_printf(err, "pointer past last bucket (%llu > %llu)", - bucket, ca->mi.nbuckets); - return -BCH_ERR_invalid_bkey; - } - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { - prt_printf(err, "pointer before first bucket (%llu < %u)", - bucket, ca->mi.first_bucket); - return -BCH_ERR_invalid_bkey; - } - - if (bucket_offset + size_ondisk > ca->mi.bucket_size) { - prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + ptr_after_last_bucket, + "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); + bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + ptr_before_first_bucket, + "pointer before first bucket (%llu < %u)", bucket, 
ca->mi.first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + ptr_spans_multiple_buckets, + "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, ca->mi.bucket_size); - return -BCH_ERR_invalid_bkey; - } - - return 0; +fsck_err: + return ret; } -int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { @@ -1115,48 +1122,39 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; - bool unwritten = false, have_ec = false, crc_since_last_ptr = false; - int ret; + bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret = 0; if (bkey_is_btree_ptr(k.k)) size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { - prt_printf(err, "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, + extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - if (bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry)) { - prt_printf(err, "has non ptr field"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry), c, err, + btree_ptr_has_non_ptr, + "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, - false, err); + ret = extent_ptr_invalid(c, k, flags, &entry->ptr, + size_ondisk, false, err); if (ret) return ret; - if (nr_ptrs && unwritten != entry->ptr.unwritten) { - prt_printf(err, "extent with unwritten and written ptrs"); - return -BCH_ERR_invalid_bkey; - } - - if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { - prt_printf(err, "has unwritten ptrs"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, + ptr_cached_and_erasure_coded, + "cached, erasure coded ptr"); - if (entry->ptr.cached && have_ec) { - prt_printf(err, "cached, erasure coded ptr"); - return -BCH_ERR_invalid_bkey; - } + if (!entry->ptr.unwritten) + have_written = true; + else + have_unwritten = true; - unwritten = entry->ptr.unwritten; have_ec = false; crc_since_last_ptr = false; nr_ptrs++; @@ -1166,72 +1164,77 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - if (crc.offset + crc.live_size > - crc.uncompressed_size) { - prt_printf(err, "checksum offset + key size > uncompressed size"); - return -BCH_ERR_invalid_bkey; - } - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) { - prt_printf(err, "invalid checksum type"); - return -BCH_ERR_invalid_bkey; - } - - if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { - prt_printf(err, "invalid compression type"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, + ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, + ptr_crc_csum_type_unknown, + 
"invalid checksum type"); + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, + ptr_crc_compression_type_unknown, + "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) { - prt_printf(err, "incorrect nonce"); - return -BCH_ERR_invalid_bkey; - } + else if (nonce != crc.offset + crc.nonce) + bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + "incorrect nonce"); } - if (crc_since_last_ptr) { - prt_printf(err, "redundant crc entry"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(crc_since_last_ptr, c, err, + ptr_crc_redundant, + "redundant crc entry"); crc_since_last_ptr = true; + + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + + size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - if (have_ec) { - prt_printf(err, "redundant stripe entry"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(have_ec, c, err, + ptr_stripe_redundant, + "redundant stripe entry"); have_ec = true; break; - case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + if (!bch2_compression_opt_valid(r->compression)) { + struct bch_compression_opt opt = __bch2_compression_decode(r->compression); + prt_printf(err, "invalid compression opt %u:%u", + opt.type, opt.level); + return -BCH_ERR_invalid_bkey; + } break; } + } } - if (!nr_ptrs) { - prt_str(err, "no ptrs"); - return -BCH_ERR_invalid_bkey; - } - - if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { - prt_str(err, "too many ptrs"); - return -BCH_ERR_invalid_bkey; - } - - if (crc_since_last_ptr) { - prt_printf(err, "redundant crc entry"); - return -BCH_ERR_invalid_bkey; - } - - if (have_ec) { - prt_printf(err, "redundant stripe entry"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(!nr_ptrs, c, err, + extent_ptrs_no_ptrs, + "no ptrs"); + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, + extent_ptrs_too_many_ptrs, + "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); + bkey_fsck_err_on(have_written && have_unwritten, c, err, + extent_ptrs_written_and_unwritten, + "extent with unwritten and written ptrs"); + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, + extent_ptrs_unwritten, + "has unwritten ptrs"); + bkey_fsck_err_on(crc_since_last_ptr, c, err, + extent_ptrs_redundant_crc, + "redundant crc entry"); + bkey_fsck_err_on(have_ec, c, err, + extent_ptrs_redundant_stripe, + "redundant stripe entry"); +fsck_err: + return ret; } void bch2_ptr_swab(struct bkey_s k) @@ -1272,6 +1275,125 @@ void bch2_ptr_swab(struct bkey_s k) } } +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned rewrite_ptrs = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + const union bch_extent_entry *entry; + struct 
extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) { + rewrite_ptrs = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= 1U << i; + i++; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) + rewrite_ptrs |= 1U << i; + i++; + } + } + + return rewrite_ptrs; +} + +bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + /* + * If it's an indirect extent, we don't delete the rebalance entry when + * done so that we know what options were applied - check if it still + * needs work done: + */ + if (r && + k.k->type == KEY_TYPE_reflink_v && + !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) + r = NULL; + + return r != NULL; +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, + unsigned target, unsigned compression) +{ + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *r; + bool needs_rebalance; + + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + /* get existing rebalance entry: */ + r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + if (r) { + if (k.k->type == KEY_TYPE_reflink_v) { + /* + * indirect extents: existing options take precedence, + * so that we don't move extents back and forth if + * they're referenced by different inodes with different + * options: + */ + if (r->target) + target = r->target; + if (r->compression) + compression = r->compression; + } + + r->target = target; + r->compression = compression; + } + + needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); + + if (needs_rebalance && !r) { + union bch_extent_entry *new = bkey_val_end(k); + + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; + new->rebalance.compression = compression; + new->rebalance.target = target; + new->rebalance.unused = 0; + k.k->u64s += extent_entry_u64s(new); + } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { + /* + * For indirect extents, don't delete the rebalance entry when + * we're finished so that we know we specifically moved it or + * compressed it to its current location/compression type + */ + extent_entry_drop(k, (union bch_extent_entry *) r); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 6e9d23a..a2ce8a3 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k, memcpy_u64s_small(dst, new, extent_entry_u64s(new)); } +static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; @@ -155,7 +167,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union 
bch_extent_crc *crc) common_fields(crc->crc32), }; - memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); + *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; return ret; } case BCH_EXTENT_ENTRY_crc64: { @@ -165,8 +177,8 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .csum.lo = (__force __le64) crc->crc64.csum_lo, }; - u16 hi = crc->crc64.csum_hi; - memcpy(&ret.csum.hi, &hi, sizeof(hi)); + *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; + return ret; } case BCH_EXTENT_ENTRY_crc128: { @@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); } +static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) +{ + return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); +} + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { @@ -383,12 +400,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, @@ -428,7 +445,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -520,6 +537,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) case KEY_TYPE_reflink_v: case KEY_TYPE_inline_data: case KEY_TYPE_indirect_inline_data: + case KEY_TYPE_error: return true; default: return false; @@ -632,6 +650,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { + struct bch_extent_ptr *dest; + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); switch (k->k.type) { @@ -641,10 +661,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); + dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); + *dest = ptr; k->k.u64s++; break; default: @@ -687,11 +705,19 @@ void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct 
bkey_s_c); +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, + unsigned, unsigned); +bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); + +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, + unsigned, unsigned); + /* Generic extent code: */ enum bch_extent_overlap { @@ -736,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. - */ -static inline void extent_save(struct btree *b, struct bkey_packed *dst, - struct bkey *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = *src; - else - BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -} - #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index bb53054..4496cf9 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -51,7 +51,7 @@ int bch2_create_trans(struct btree_trans *trans, bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_UNLINKED; + new_inode->bi_flags |= BCH_INODE_unlinked; ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d433f4d..b0e8144 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -12,10 +12,12 @@ #include "extent_update.h" #include "fs.h" #include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "journal.h" -#include "io.h" +#include "io_misc.h" #include "keylist.h" #include "quota.h" #include "reflink.h" @@ -31,116 +33,9 @@ #include #include #include -#include #include -static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); - -struct folio_vec { - struct folio *fv_folio; - size_t fv_offset; - size_t fv_len; -}; - -static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -{ - - struct folio *folio = page_folio(bv.bv_page); - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + - bv.bv_offset; - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); - - return (struct folio_vec) { - .fv_folio = folio, - .fv_offset = offset, - .fv_len = len, - }; -} - -static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, - struct bvec_iter iter) -{ - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -} - -#define __bio_for_each_folio(bvl, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) - -/** - * bio_for_each_folio - iterate over folios within a bio - * - * Like other non-_all versions, this iterates over what bio->bi_iter currently - * points to. This version is for drivers, where the bio may have previously - * been split or cloned. - */ -#define bio_for_each_folio(bvl, bio, iter) \ - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) - -/* - * Use u64 for the end pos and sector helpers because if the folio covers the - * max supported range of the mapping, the start offset of the next folio - * overflows loff_t. 
This breaks much of the range based processing in the - * buffered write path. - */ -static inline u64 folio_end_pos(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - -static inline size_t folio_sectors(struct folio *folio) -{ - return PAGE_SECTORS << folio_order(folio); -} - -static inline loff_t folio_sector(struct folio *folio) -{ - return folio_pos(folio) >> 9; -} - -static inline u64 folio_end_sector(struct folio *folio) -{ - return folio_end_pos(folio) >> 9; -} - -typedef DARRAY(struct folio *) folios; - -static int filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, u64 end, - int fgp_flags, gfp_t gfp, - folios *folios) -{ - struct folio *f; - u64 pos = start; - int ret = 0; - - while (pos < end) { - if ((u64) pos >= (u64) start + (1ULL << 20)) - fgp_flags &= ~FGP_CREAT; - - ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); - if (ret) - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) - break; - - BUG_ON(folios->nr && folio_pos(f) != pos); - - pos = folio_end_pos(f); - darray_push(folios, f); - } - - if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) - ret = -ENOMEM; - - return folios->nr ? 0 : ret; -} - struct nocow_flush { struct closure *cl; struct bch_dev *ca; @@ -157,9 +52,9 @@ static void nocow_flush_endio(struct bio *_bio) bio_put(&bio->bio); } -static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - struct bch_inode_info *inode, - struct closure *cl) +void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) { struct nocow_flush *bio; struct bch_dev *ca; @@ -190,2586 +85,84 @@ static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, struct nocow_flush, bio); bio->cl = cl; bio->ca = ca; - bio->bio.bi_end_io = nocow_flush_endio; - closure_bio_submit(&bio->bio, cl); - } -} - -static int bch2_inode_flush_nocow_writes(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct closure cl; - - closure_init_stack(&cl); - bch2_inode_flush_nocow_writes_async(c, inode, &cl); - closure_sync(&cl); - - return 0; -} - -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - return false; -} - -static inline struct address_space *faults_disabled_mapping(void) -{ - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -} - -static inline void set_fdm_dropped_locks(void) -{ - current->faults_disabled_mapping = - (void *) (((unsigned long) current->faults_disabled_mapping)|1); -} - -static inline bool fdm_dropped_locks(void) -{ - return ((unsigned long) current->faults_disabled_mapping) & 1; -} - -struct quota_res { - u64 sectors; -}; - -struct bch_writepage_io { - struct bch_inode_info *inode; - - /* must be last: */ - struct bch_write_op op; -}; - -struct dio_write { - struct kiocb *req; - struct address_space *mapping; - struct bch_inode_info *inode; - struct mm_struct *mm; - unsigned loop:1, - extending:1, - sync:1, - flush:1, - free_iov:1; - struct quota_res quota_res; - u64 written; - - struct iov_iter iter; - struct iovec inline_vecs[2]; - - /* must be last: */ - struct bch_write_op op; -}; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - bool should_dirty; - struct bch_read_bio rbio; -}; - -/* pagecache_block must be held */ -static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, 
loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -/* quotas */ - -#ifdef CONFIG_BCACHEFS_QUOTA - -static void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - BUG_ON(res->sectors > inode->ei_quota_reserved); - - bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); - inode->ei_quota_reserved -= res->sectors; - res->sectors = 0; -} - -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - if (res->sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_quota_reservation_put(c, inode, res); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - u64 sectors, - bool check_enospc) -{ - int ret; - - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) - return 0; - - mutex_lock(&inode->ei_quota_lock); - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); - if (likely(!ret)) { - inode->ei_quota_reserved += sectors; - res->sectors += sectors; - } - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -#else - -static void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - unsigned sectors, - bool check_enospc) -{ - return 0; -} - -#endif - -/* i_size updates: */ - -struct inode_new_size { - loff_t new_size; - u64 now; - unsigned fields; -}; - -static int inode_set_size(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_new_size *s = p; - - bi->bi_size = s->new_size; - if (s->fields & ATTR_ATIME) - bi->bi_atime = s->now; - if (s->fields & ATTR_MTIME) - bi->bi_mtime = s->now; - if (s->fields & ATTR_CTIME) - bi->bi_ctime = s->now; - - return 0; -} - -int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) -{ - struct inode_new_size s = { - .new_size = new_size, - .now = bch2_current_time(c), - .fields = fields, - }; - - return bch2_write_inode(c, inode, inode_set_size, &s, fields); -} - -static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, - "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); - inode->v.i_blocks += sectors; - -#ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && - !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && - sectors > 0) { - BUG_ON(sectors > quota_res->sectors); - BUG_ON(sectors > inode->ei_quota_reserved); - - quota_res->sectors -= sectors; - inode->ei_quota_reserved -= sectors; - } else { - bch2_quota_acct(c, 
inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); - } -#endif -} - -static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (sectors) { - mutex_lock(&inode->ei_quota_lock); - __i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_quota_lock); - } -} - -/* page state: */ - -/* stored in page->private: */ - -#define BCH_FOLIO_SECTOR_STATE() \ - x(unallocated) \ - x(reserved) \ - x(dirty) \ - x(dirty_reserved) \ - x(allocated) - -enum bch_folio_sector_state { -#define x(n) SECTOR_##n, - BCH_FOLIO_SECTOR_STATE() -#undef x -}; - -static const char * const bch2_folio_sector_states[] = { -#define x(n) #n, - BCH_FOLIO_SECTOR_STATE() -#undef x - NULL -}; - -static inline enum bch_folio_sector_state -folio_sector_dirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_dirty; - case SECTOR_reserved: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_undirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_dirty: - return SECTOR_unallocated; - case SECTOR_dirty_reserved: - return SECTOR_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_reserve(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_reserved; - case SECTOR_dirty: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -struct bch_folio_sector { - /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; -}; - -struct bch_folio { - spinlock_t lock; - atomic_t write_count; - /* - * Is the sector state up to date with the btree? - * (Not the data itself) - */ - bool uptodate; - struct bch_folio_sector s[]; -}; - -static inline void folio_sector_set(struct folio *folio, - struct bch_folio *s, - unsigned i, unsigned n) -{ - s->s[i].state = n; -} - -/* file offset (to folio offset) to bch_folio_sector index */ -static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -{ - u64 f_offset = pos - folio_pos(folio); - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); - return f_offset >> SECTOR_SHIFT; -} - -static inline struct bch_folio *__bch2_folio(struct folio *folio) -{ - return folio_has_private(folio) - ? 
(struct bch_folio *) folio_get_private(folio) - : NULL; -} - -static inline struct bch_folio *bch2_folio(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - - return __bch2_folio(folio); -} - -/* for newly allocated folios: */ -static void __bch2_folio_release(struct folio *folio) -{ - kfree(folio_detach_private(folio)); -} - -static void bch2_folio_release(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - __bch2_folio_release(folio); -} - -/* for newly allocated folios: */ -static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - struct bch_folio *s; - - s = kzalloc(sizeof(*s) + - sizeof(struct bch_folio_sector) * - folio_sectors(folio), gfp); - if (!s) - return NULL; - - spin_lock_init(&s->lock); - folio_attach_private(folio, s); - return s; -} - -static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -} - -static unsigned bkey_to_sector_state(struct bkey_s_c k) -{ - if (bkey_extent_is_reservation(k)) - return SECTOR_reserved; - if (bkey_extent_is_allocation(k.k)) - return SECTOR_allocated; - return SECTOR_unallocated; -} - -static void __bch2_folio_set(struct folio *folio, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - BUG_ON(pg_offset >= sectors); - BUG_ON(pg_offset + pg_len > sectors); - - spin_lock(&s->lock); - - for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - folio_sector_set(folio, s, i, state); - } - - if (i == sectors) - s->uptodate = true; - - spin_unlock(&s->lock); -} - -/* - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the - * extents btree: - */ -static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **folios, unsigned nr_folios) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; - u64 offset = folio_sector(folios[0]); - unsigned folio_idx; - u32 snapshot; - bool need_set = false; - int ret; - - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); - if (!s) - return -ENOMEM; - - need_set |= !s->uptodate; - } - - if (!need_set) - return 0; - - folio_idx = 0; - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = folios[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - 
bch2_trans_exit(&trans); - - return ret; -} - -static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct folio_vec fv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - bio_for_each_folio(fv, bio, iter) - __bch2_folio_set(fv.fv_folio, - fv.fv_offset >> 9, - fv.fv_len >> 9, - nr_ptrs, state); -} - -static void mark_pagecache_unallocated(struct bch_inode_info *inode, - u64 start, u64 end) -{ - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - -static void mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - s64 i_sectors_delta = 0; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { - i_sectors_delta -= s->s[j].state == SECTOR_dirty; - folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); - } - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - i_sectors_acct(c, inode, NULL, i_sectors_delta); -} - -static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -{ - /* XXX: this should not be open coded */ - return inode->ei_inode.bi_data_replicas - ? 
inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; -} - -static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, - unsigned nr_replicas) -{ - return max(0, (int) nr_replicas - - s->nr_replicas - - s->replicas_reserved); -} - -static int bch2_get_folio_disk_reservation(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, bool check_enospc) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res = { 0 }; - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - for (i = 0; i < sectors; i++) - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); - - if (!disk_res_sectors) - return 0; - - ret = bch2_disk_reservation_get(c, &disk_res, - disk_res_sectors, 1, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); - if (unlikely(ret)) - return ret; - - for (i = 0; i < sectors; i++) - s->s[i].replicas_reserved += - sectors_to_reserve(&s->s[i], nr_replicas); - - return 0; -} - -struct bch2_folio_reservation { - struct disk_reservation disk; - struct quota_res quota; -}; - -static void bch2_folio_reservation_init(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - memset(res, 0, sizeof(*res)); - - res->disk.nr_replicas = inode_nr_replicas(c, inode); -} - -static void bch2_folio_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - bch2_disk_reservation_put(c, &res->disk); - bch2_quota_reservation_put(c, inode, &res->quota); -} - -static int bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned i, disk_sectors = 0, quota_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - BUG_ON(!s->uptodate); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - - if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); - if (unlikely(ret)) - return ret; - } - - if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); - if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; - - bch2_disk_reservation_put(c, &tmp); - res->disk.sectors -= disk_sectors; - return ret; - } - } - - return 0; -} - -static void bch2_clear_folio_bits(struct folio *folio) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(folio); - struct disk_reservation disk_res = { 0 }; - int i, sectors = folio_sectors(folio), dirty_sectors = 0; - - if (!s) - return; - - EBUG_ON(!folio_test_locked(folio)); - EBUG_ON(folio_test_writeback(folio)); - - for (i = 0; i < sectors; i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - dirty_sectors -= s->s[i].state == SECTOR_dirty; - folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); - } - - bch2_disk_reservation_put(c, &disk_res); - - i_sectors_acct(c, inode, NULL, dirty_sectors); - - bch2_folio_release(folio); -} - -static void bch2_set_folio_dirty(struct bch_fs *c, - struct 
bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, dirty_sectors = 0; - - WARN_ON((u64) folio_pos(folio) + offset + len > - round_up((u64) i_size_read(&inode->v), block_bytes(c))); - - BUG_ON(!s->uptodate); - - spin_lock(&s->lock); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - unsigned sectors = sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - - /* - * This can happen if we race with the error path in - * bch2_writepage_io_done(): - */ - sectors = min_t(unsigned, sectors, res->disk.sectors); - - s->s[i].replicas_reserved += sectors; - res->disk.sectors -= sectors; - - dirty_sectors += s->s[i].state == SECTOR_unallocated; - - folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); - } - - spin_unlock(&s->lock); - - i_sectors_acct(c, inode, &res->quota, dirty_sectors); - - if (!folio_test_dirty(folio)) - filemap_dirty_folio(inode->v.i_mapping, folio); -} - -vm_fault_t bch2_page_fault(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct address_space *fdm = faults_disabled_mapping(); - struct bch_inode_info *inode = file_bch_inode(file); - vm_fault_t ret; - - if (fdm == mapping) - return VM_FAULT_SIGBUS; - - /* Lock ordering: */ - if (fdm > mapping) { - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - - if (bch2_pagecache_add_tryget(inode)) - goto got_lock; - - bch2_pagecache_block_put(fdm_host); - - bch2_pagecache_add_get(inode); - bch2_pagecache_add_put(inode); - - bch2_pagecache_block_get(fdm_host); - - /* Signal that lock has been dropped: */ - set_fdm_dropped_locks(); - return VM_FAULT_SIGBUS; - } - - bch2_pagecache_add_get(inode); -got_lock: - ret = filemap_fault(vmf); - bch2_pagecache_add_put(inode); - - return ret; -} - -vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -{ - struct folio *folio = page_folio(vmf->page); - struct file *file = vmf->vma->vm_file; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - unsigned len; - loff_t isize; - vm_fault_t ret; - - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); - file_update_time(file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * write_invalidate_inode_pages_range() - can drop this if/when we get - * a write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - bch2_pagecache_add_get(inode); - - folio_lock(folio); - isize = i_size_read(&inode->v); - - if (folio->mapping != mapping || folio_pos(folio) >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - - bch2_set_folio_dirty(c, inode, folio, &res, 0, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); - ret = VM_FAULT_LOCKED; -out: - bch2_pagecache_add_put(inode); - sb_end_pagefault(inode->v.i_sb); - - return ret; -} - -void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -{ - if (offset || length < 
folio_size(folio)) - return; - - bch2_clear_folio_bits(folio); -} - -bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -{ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return false; - - bch2_clear_folio_bits(folio); - return true; -} - -/* readpage(s): */ - -static void bch2_readpages_end_io(struct bio *bio) -{ - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } - - bio_put(bio); -} - -struct readpages_iter { - struct address_space *mapping; - unsigned idx; - folios folios; -}; - -static int readpages_iter_init(struct readpages_iter *iter, - struct readahead_control *ractl) -{ - struct folio **fi; - int ret; - - memset(iter, 0, sizeof(*iter)); - - iter->mapping = ractl->mapping; - - ret = filemap_get_contig_folios_d(iter->mapping, - ractl->_index << PAGE_SHIFT, - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, - 0, mapping_gfp_mask(iter->mapping), - &iter->folios); - if (ret) - return ret; - - darray_for_each(iter->folios, fi) { - ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); - folio_put(*fi); - folio_put(*fi); - } - - return 0; -} - -static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -{ - if (iter->idx >= iter->folios.nr) - return NULL; - return iter->folios.data[iter->idx]; -} - -static inline void readpage_iter_advance(struct readpages_iter *iter) -{ - iter->idx++; -} - -static bool extent_partial_reads_expensive(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc.csum_type || crc.compression_type) - return true; - return false; -} - -static int readpage_bio_extend(struct btree_trans *trans, - struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) -{ - /* Don't hold btree locks while allocating memory: */ - bch2_trans_unlock(trans); - - while (bio_sectors(bio) < sectors_this_extent && - bio->bi_vcnt < bio->bi_max_vecs) { - struct folio *folio = readpage_iter_peek(iter); - int ret; - - if (folio) { - readpage_iter_advance(iter); - } else { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - - if (!get_more) - break; - - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); - if (!folio) - break; - - if (!__bch2_folio_create(folio, GFP_KERNEL)) { - folio_put(folio); - break; - } - - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); - if (ret) { - __bch2_folio_release(folio); - folio_put(folio); - break; - } - - folio_put(folio); - } - - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); - } - - return bch2_trans_relock(trans); -} - -static void bchfs_read(struct btree_trans *trans, - struct bch_read_bio *rbio, - subvol_inum inum, - struct readpages_iter *readpages_iter) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; - u32 snapshot; - int ret = 0; - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - iter = (struct 
btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - sectors = min(sectors, k.k->size - offset_into_extent); - - if (readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); - if (ret) - break; - } - - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } - - bch2_bkey_buf_exit(&sk, c); -} - -void bch2_readahead(struct readahead_control *ractl) -{ - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct btree_trans trans; - struct folio *folio; - struct readpages_iter readpages_iter; - int ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - ret = readpages_iter_init(&readpages_iter, ractl); - BUG_ON(ret); - - bch2_trans_init(&trans, c, 0, 0); - - bch2_pagecache_add_get(inode); - - while ((folio = readpage_iter_peek(&readpages_iter))) { - unsigned n = min_t(unsigned, - readpages_iter.folios.nr - - readpages_iter.idx, - BIO_MAX_VECS); - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), - opts); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(&trans, rbio, inode_inum(inode), - &readpages_iter); - bch2_trans_unlock(&trans); - } - - bch2_pagecache_add_put(inode); - - bch2_trans_exit(&trans); - darray_exit(&readpages_iter.folios); -} - -static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct folio *folio) -{ - struct btree_trans trans; - - bch2_folio_create(folio, __GFP_NOFAIL); - - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - 
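/*
 * [Editor's note - bchfs_read() above is one instance of the standard
 * bcachefs transaction-restart idiom: (re)begin the transaction, do the
 * work, and if any step returns a BCH_ERR_transaction_restart subclass,
 * jump back and redo the whole pass. Schematic sketch only;
 * do_btree_work() is a hypothetical stand-in for the loop body:]
 */
	int ret;
retry:
	bch2_trans_begin(trans);

	ret = do_btree_work(trans);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;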
rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bch2_trans_init(&trans, c, 0, 0); - bchfs_read(&trans, rbio, inum, NULL); - bch2_trans_exit(&trans); -} - -static void bch2_read_single_folio_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int bch2_read_single_folio(struct folio *folio, - struct address_space *mapping) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); - rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - folio_mark_uptodate(folio); - return 0; -} - -int bch2_read_folio(struct file *file, struct folio *folio) -{ - int ret; - - ret = bch2_read_single_folio(folio, folio->mapping); - folio_unlock(folio); - return bch2_err_class(ret); -} - -/* writepages: */ - -struct bch_writepage_state { - struct bch_writepage_io *io; - struct bch_io_opts opts; - struct bch_folio_sector *tmp; - unsigned tmp_sectors; -}; - -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - -static void bch2_writepage_io_done(struct bch_write_op *op) -{ - struct bch_writepage_io *io = - container_of(op, struct bch_writepage_io, op); - struct bch_fs *c = io->op.c; - struct bio *bio = &io->op.wbio.bio; - struct folio_iter fi; - unsigned i; - - if (io->op.error) { - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - folio_set_error(fi.folio); - mapping_set_error(fi.folio->mapping, -EIO); - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - */ - WARN_ON_ONCE(io->op.i_sectors_delta > 0); - - /* - * (error (due to going RO) halfway through a page can screw that up - * slightly) - * XXX wtf? 
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); - */ - - /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: - */ - i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s = __bch2_folio(fi.folio); - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(fi.folio); - } - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch2_writepage_io_alloc(struct bch_fs *c, - struct writeback_control *wbc, - struct bch_writepage_state *w, - struct bch_inode_info *inode, - u64 sector, - unsigned nr_replicas) -{ - struct bch_write_op *op; - - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, - REQ_OP_WRITE, - GFP_KERNEL, - &c->writepage_bioset), - struct bch_writepage_io, op.wbio.bio); - - w->io->inode = inode; - op = &w->io->op; - bch2_write_op_init(op, c, w->opts); - op->target = w->opts.foreground_target; - op->nr_replicas = nr_replicas; - op->res.nr_replicas = nr_replicas; - op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; - op->pos = POS(inode->v.i_ino, sector); - op->end_io = bch2_writepage_io_done; - op->devs_need_flush = &inode->ei_devs_need_flush; - op->wbio.bio.bi_iter.bi_sector = sector; - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -} - -static int __bch2_writepage(struct folio *folio, - struct writeback_control *wbc, - void *data) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_writepage_state *w = data; - struct bch_folio *s; - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; - loff_t i_size = i_size_read(&inode->v); - int ret; - - EBUG_ON(!folio_test_uptodate(folio)); - - /* Is the folio fully inside i_size? */ - if (folio_end_pos(folio) <= i_size) - goto do_io; - - /* Is the folio fully outside i_size? (truncate in progress) */ - if (folio_pos(folio) >= i_size) { - folio_unlock(folio); - return 0; - } - - /* - * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the folio size. For a file that is not a multiple of - * the folio size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
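 * [Editor's note: the sentence in quotes appears to be adapted from the
 * mmap(2) man page. The practical consequence: with a writable mapping,
 * userspace can scribble on the part of the folio past EOF at any time,
 * so the tail has to be re-zeroed on every writepage pass - zeroing it
 * once at fault time would not be enough.]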
- */ - folio_zero_segment(folio, - i_size - folio_pos(folio), - folio_size(folio)); -do_io: - f_sectors = folio_sectors(folio); - s = bch2_folio(folio); - - if (f_sectors > w->tmp_sectors) { - kfree(w->tmp); - w->tmp = kzalloc(sizeof(struct bch_folio_sector) * - f_sectors, __GFP_NOFAIL); - w->tmp_sectors = f_sectors; - } - - /* - * Things get really hairy with errors during writeback: - */ - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); - BUG_ON(ret); - - /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; - - s->s[i].replicas_reserved = 0; - folio_sector_set(folio, s, i, SECTOR_allocated); - } - spin_unlock(&s->lock); - - BUG_ON(atomic_read(&s->write_count)); - atomic_set(&s->write_count, 1); - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - - folio_unlock(folio); - - offset = 0; - while (1) { - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; - u64 sector; - - while (offset < f_sectors && - w->tmp[offset].state < SECTOR_dirty) - offset++; - - if (offset == f_sectors) - break; - - while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_dirty) { - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; - sectors++; - } - BUG_ON(!sectors); - - sector = folio_sector(folio) + offset; - - if (w->io && - (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, sectors << 9) || - w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= - (BIO_MAX_VECS * PAGE_SIZE) || - bio_end_sector(&w->io->op.wbio.bio) != sector)) - bch2_writepage_do_io(w); - - if (!w->io) - bch2_writepage_io_alloc(c, wbc, w, inode, sector, - nr_replicas_this_write); - - atomic_inc(&s->write_count); - - BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; - - offset += sectors; - } - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); - - return 0; -} - -int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); - return bch2_err_class(ret); -} - -/* buffered writes: */ - -int bch2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct 
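/*
 * [Editor's note - __bch2_writepage() above issues one write per
 * contiguous run of dirty sectors. A self-contained model of that scan,
 * with the SECTOR_* state machine reduced to a dirty flag; standalone
 * illustration, not kernel code:]
 */
#include <stdio.h>

int main(void)
{
	int dirty[8] = { 0, 1, 1, 0, 1, 1, 1, 0 };
	unsigned offset = 0, f_sectors = 8;

	while (1) {
		unsigned sectors = 0;

		while (offset < f_sectors && !dirty[offset])
			offset++;
		if (offset == f_sectors)
			break;
		while (offset + sectors < f_sectors && dirty[offset + sectors])
			sectors++;

		/* prints "offset 1, 2 sectors" then "offset 4, 3 sectors" */
		printf("write run: offset %u, %u sectors\n", offset, sectors);
		offset += sectors;
	}
	return 0;
}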
page **pagep, void **fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res; - struct folio *folio; - unsigned offset; - int ret = -ENOMEM; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return -ENOMEM; - - bch2_folio_reservation_init(c, inode, res); - *fsdata = res; - - bch2_pagecache_add_get(inode); - - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) - goto err_unlock; - - if (folio_test_uptodate(folio)) - goto out; - - offset = pos - folio_pos(folio); - len = min_t(size_t, len, folio_end_pos(folio) - pos); - - /* If we're writing entire folio, don't need to read it in first: */ - if (!offset && len == folio_size(folio)) - goto out; - - if (!offset && pos + len >= inode->v.i_size) { - folio_zero_segment(folio, len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } - - if (folio_pos(folio) >= inode->v.i_size) { - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } -readpage: - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto err; -out: - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); - if (ret) { - if (!folio_test_uptodate(folio)) { - /* - * If the folio hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the folio is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *pagep = &folio->page; - return 0; -err: - folio_unlock(folio); - folio_put(folio); - *pagep = NULL; -err_unlock: - bch2_pagecache_add_put(inode); - kfree(res); - *fsdata = NULL; - return bch2_err_class(ret); -} - -int bch2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); - unsigned offset = pos - folio_pos(folio); - - lockdep_assert_held(&inode->v.i_rwsem); - BUG_ON(offset + copied > folio_size(folio)); - - if (unlikely(copied < len && !folio_test_uptodate(folio))) { - /* - * The folio needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - folio_zero_range(folio, 0, folio_size(folio)); - flush_dcache_folio(folio); - copied = 0; - } - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - - if (copied) { - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); - - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); - - inode->ei_last_dirtied = (unsigned long) current; - } - - folio_unlock(folio); - folio_put(folio); - bch2_pagecache_add_put(inode); - - bch2_folio_reservation_put(c, inode, res); - kfree(res); - - return copied; -} - -static noinline void folios_trunc(folios *folios, struct folio **fi) -{ - while (folios->data + folios->nr > fi) { - struct folio *f = darray_pop(folios); - - folio_unlock(f); - folio_put(f); - } -} - -static int __bch2_buffered_write(struct bch_inode_info *inode, - struct address_space 
*mapping, - struct iov_iter *iter, - loff_t pos, unsigned len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - folios folios; - struct folio **fi, *f; - unsigned copied = 0, f_offset; - u64 end = pos + len, f_pos; - loff_t last_folio_pos = inode->v.i_size; - int ret = 0; - - BUG_ON(!len); - - bch2_folio_reservation_init(c, inode, &res); - darray_init(&folios); - - ret = filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &folios); - if (ret) - goto out; - - BUG_ON(!folios.nr); - - f = darray_first(folios); - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - f = darray_last(folios); - end = min(end, folio_end_pos(f)); - last_folio_pos = folio_pos(f); - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { - if (end >= inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - } else { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - } - - ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); - if (ret) - goto out; - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. - * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&folios, fi); - if (!folios.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(folios))); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (mapping_writably_mapped(mapping)) - darray_for_each(folios, fi) - flush_dcache_folio(*fi); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); - - if (!f_copied) { - folios_trunc(&folios, fi); - break; - } - - if (!folio_test_uptodate(f) && - f_copied != folio_size(f) && - pos + copied + f_copied < inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&folios, fi); - break; - } - - flush_dcache_folio(f); - copied += f_copied; - - if (f_copied != f_len) { - folios_trunc(&folios, fi + 1); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (!copied) - goto out; - - end = pos + copied; - - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - - if (!folio_test_uptodate(f)) - folio_mark_uptodate(f); - - bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - inode->ei_last_dirtied = (unsigned long) current; -out: - darray_for_each(folios, fi) { - folio_unlock(*fi); - folio_put(*fi); - } - - /* - * If the last folio added to the mapping starts beyond current EOF, we - * performed a short write but left around at least one post-EOF folio. - * Clean up the mapping before we return. 
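 * [Editor's note: a concrete, illustrative case - i_size is 4096 and a
 * buffered write at pos 8192 copies nothing before failing; the loop
 * above has already added a folio at 8192 to the mapping. Since
 * last_folio_pos >= i_size, the truncate_pagecache() call below drops
 * that never-written folio so later reads can't find it.]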
- */ - if (last_folio_pos >= inode->v.i_size) - truncate_pagecache(&inode->v, inode->v.i_size); - - darray_exit(&folios); - bch2_folio_reservation_put(c, inode, &res); - - return copied ?: ret; -} - -static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; - - bch2_pagecache_add_get(inode); - - do { - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = iov_iter_count(iter); -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE - offset); - - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - ret = -EFAULT; - break; - } - } - - if (unlikely(fatal_signal_pending(current))) { - ret = -EINTR; - break; - } - - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); - if (unlikely(ret < 0)) - break; - - cond_resched(); - - if (unlikely(ret == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(iter)); - goto again; - } - pos += ret; - written += ret; - ret = 0; - - balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); - - bch2_pagecache_add_put(inode); - - return written ? 
written : ret; -} - -/* O_DIRECT reads */ - -static void bio_check_or_release(struct bio *bio, bool check_dirty) -{ - if (check_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static void bch2_dio_read_complete(struct closure *cl) -{ - struct dio_read *dio = container_of(cl, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret); - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -} - -static void bch2_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_status) - dio->ret = blk_status_to_errno(bio->bi_status); - - closure_put(&dio->cl); -} - -static void bch2_direct_IO_read_split_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - bch2_direct_IO_read_endio(bio); - bio_check_or_release(bio, should_dirty); -} - -static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - size_t shorten; - ssize_t ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - if ((offset|iter->count) & (block_bytes(c) - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); - - if (!ret) - return ret; - - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); - iter->count -= shorten; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->dio_read_bioset); - - bio->bi_end_io = bch2_direct_IO_read_endio; - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - } - - dio->req = req; - dio->ret = ret; - /* - * This is one of the sketchier things I've encountered: we have to skip - * the dirtying of requests that are internal from the kernel (i.e. from - * loopback), because we'll deadlock on page_lock. 
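 * [Editor's note: the assignment just below - should_dirty =
 * iter_is_iovec(iter) - implements this: only user-memory (iovec)
 * iterators get the bio_set_pages_dirty()/bio_check_pages_dirty()
 * treatment, while kernel-internal iterators (e.g. loopback's bvec
 * iters) are released via bio_release_pages() without dirtying; see
 * bio_check_or_release() above.]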
- */ - dio->should_dirty = iter_is_iovec(iter); - - goto start; - while (iter->count) { - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; -start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - - if (dio->should_dirty) - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); - } - - iter->count += shorten; - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(iter); - ssize_t ret; - - if (!count) - return 0; /* skip atime */ - - if (iocb->ki_flags & IOCB_DIRECT) { - struct blk_plug plug; - - if (unlikely(mapping->nrpages)) { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; - } - - file_accessed(file); - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(iocb, iter); - blk_finish_plug(&plug); - - if (ret >= 0) - iocb->ki_pos += ret; - } else { - bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(inode); - } -out: - return bch2_err_class(ret); -} - -/* O_DIRECT writes */ - -static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - u64 end = offset + size; - u32 snapshot; - bool ret = true; - int err; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (err) - goto err; - - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) - break; - - if (k.k->p.snapshot != snapshot || - nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_exit(&trans); - - return err ? 
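/*
 * [Editor's note - bch2_check_range_allocated() above answers one
 * question for the O_DIRECT write path: when a disk reservation can't
 * be taken, is it still safe to overwrite in place? Restated as a
 * per-key predicate (sketch only; names as used above, with the
 * compressed-extent restriction matching the check in the loop):]
 */
static bool key_allows_inplace_overwrite(struct bch_fs *c, struct bkey_s_c k,
					 u32 snapshot, unsigned nr_replicas,
					 bool compressed)
{
	return k.k->p.snapshot == snapshot &&
	       nr_replicas <= bch2_bkey_replicas(c, k) &&
	       (compressed || !bch2_bkey_sectors_compressed(k));
}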
false : ret; -} - -static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - return bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *); -static __always_inline long bch2_dio_write_done(struct dio_write *dio); - -/* - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the - * caller's stack, we're not guaranteed that it will live for the duration of - * the IO: - */ -static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -{ - struct iovec *iov = dio->inline_vecs; - - /* - * iov_iter has a single embedded iovec - nothing to do: - */ - if (iter_is_ubuf(&dio->iter)) - return 0; - - /* - * We don't currently handle non-iovec iov_iters here - return an error, - * and we'll fall back to doing the IO synchronously: - */ - if (!iter_is_iovec(&dio->iter)) - return -1; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) - return -ENOMEM; - - dio->free_iov = true; - } - - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.__iov = iov; - return 0; -} - -static void bch2_dio_write_flush_done(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); - struct bch_fs *c = dio->op.c; - - closure_debug_destroy(cl); - - dio->op.error = bch2_journal_error(&c->journal); - - bch2_dio_write_done(dio); -} - -static noinline void bch2_dio_write_flush(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_unpacked inode; - int ret; - - dio->flush = 0; - - closure_init(&dio->op.cl, NULL); - - if (!dio->op.error) { - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) { - dio->op.error = ret; - } else { - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); - } - } - - if (dio->sync) { - closure_sync(&dio->op.cl); - closure_debug_destroy(&dio->op.cl); - } else { - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); - } -} - -static __always_inline long bch2_dio_write_done(struct dio_write *dio) -{ - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - bool sync = dio->sync; - long ret; - - if (unlikely(dio->flush)) { - bch2_dio_write_flush(dio); - if (!sync) - return -EIOCBQUEUED; - } - - bch2_pagecache_block_put(inode); - - if (dio->free_iov) - kfree(dio->iter.__iov); - - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; - } - return ret; -} - -static __always_inline void bch2_dio_write_end(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - - if (dio->extending) { - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - 
i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); - } - - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { - mutex_lock(&inode->ei_quota_lock); - __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - __bch2_quota_reservation_put(c, inode, &dio->quota_res); - mutex_unlock(&inode->ei_quota_lock); - } - - bio_release_pages(bio, false); - - if (unlikely(dio->op.error)) - set_bit(EI_INODE_ERROR, &inode->ei_flags); -} - -static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct address_space *mapping = dio->mapping; - struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; - struct bio *bio = &dio->op.wbio.bio; - unsigned unaligned, iter_count; - bool sync = dio->sync, dropped_locks; - long ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - while (1) { - iter_count = dio->iter.count; - - EBUG_ON(current->faults_disabled_mapping); - current->faults_disabled_mapping = mapping; - - ret = bio_iov_iter_get_pages(bio, &dio->iter); - - dropped_locks = fdm_dropped_locks(); - - current->faults_disabled_mapping = NULL; - - /* - * If the fault handler returned an error but also signalled - * that it dropped & retook ei_pagecache_lock, we just need to - * re-shoot down the page cache and retry: - */ - if (dropped_locks && ret) - ret = 0; - - if (unlikely(ret < 0)) - goto err; - - if (unlikely(dropped_locks)) { - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter_count - 1); - if (unlikely(ret)) - goto err; - - if (!bio->bi_iter.bi_size) - continue; - } - - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); - bio->bi_iter.bi_size -= unaligned; - iov_iter_revert(&dio->iter, unaligned); - - if (!bio->bi_iter.bi_size) { - /* - * bio_iov_iter_get_pages was only able to get < - * blocksize worth of pages: - */ - ret = -EFAULT; - goto err; - } - - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = sync - ? 
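/*
 * [Editor's note - the alignment fixup above trims the bio to a whole
 * number of filesystem blocks and hands the remainder back to the
 * iterator via iov_iter_revert(). The mask arithmetic only works
 * because block_bytes() is a power of two; standalone illustration:]
 */
#include <stdio.h>

int main(void)
{
	unsigned block_bytes = 4096;		/* power of two */
	unsigned bio_size    = 10000;		/* bytes gathered so far */
	unsigned unaligned   = bio_size & (block_bytes - 1);

	/* prints "submit 8192, revert 1808" */
	printf("submit %u, revert %u\n", bio_size - unaligned, unaligned);
	return 0;
}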
NULL - : bch2_dio_write_loop_async; - dio->op.target = dio->op.opts.foreground_target; - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); - if (unlikely(ret)) - goto err; - - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_dio_write_check_allocated(dio)) - goto err; - - task_io_account_write(bio->bi_iter.bi_size); - - if (unlikely(dio->iter.count) && - !dio->sync && - !dio->loop && - bch2_dio_write_copy_iov(dio)) - dio->sync = sync = true; - - dio->loop = true; - closure_call(&dio->op.cl, bch2_write, NULL, NULL); - - if (!sync) - return -EIOCBQUEUED; - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - break; - - bio_reset(bio, NULL, REQ_OP_WRITE); + bio->bio.bi_end_io = nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); } -out: - return bch2_dio_write_done(dio); -err: - dio->op.error = ret; - - bio_release_pages(bio, false); - - bch2_quota_reservation_put(c, inode, &dio->quota_res); - goto out; } -static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) { - struct mm_struct *mm = dio->mm; + struct closure cl; - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); - if (mm) - kthread_use_mm(mm); - bch2_dio_write_loop(dio); - if (mm) - kthread_unuse_mm(mm); + return 0; } -static void bch2_dio_write_loop_async(struct bch_write_op *op) -{ - struct dio_write *dio = container_of(op, struct dio_write, op); - - bch2_dio_write_end(dio); +/* i_size updates: */ - if (likely(!dio->iter.count) || dio->op.error) - bch2_dio_write_done(dio); - else - bch2_dio_write_continue(dio); -} +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; -static noinline -ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +static int inode_set_size(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct file *file = req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct dio_write *dio; - struct bio *bio; - bool locked = true, extending; - ssize_t ret; - - prefetch(&c->opts); - prefetch((void *) &c->opts + 64); - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - - inode_lock(&inode->v); - - ret = generic_write_checks(req, iter); - if (unlikely(ret <= 0)) - goto err; - - ret = file_remove_privs(file); - if (unlikely(ret)) - goto err; - - ret = file_update_time(file); - if (unlikely(ret)) - goto err; - - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - goto err; - - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(inode); - - extending = req->ki_pos + iter->count > inode->v.i_size; - if (!extending) { - inode_unlock(&inode->v); - locked = false; - } + struct inode_new_size *s = p; - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, 
BIO_MAX_VECS), - REQ_OP_WRITE, - GFP_KERNEL, - &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, op.wbio.bio); - dio->req = req; - dio->mapping = mapping; - dio->inode = inode; - dio->mm = current->mm; - dio->loop = false; - dio->extending = extending; - dio->sync = is_sync_kiocb(req) || extending; - dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->free_iov = false; - dio->quota_res.sectors = 0; - dio->written = 0; - dio->iter = *iter; - dio->op.c = c; - - if (unlikely(mapping->nrpages)) { - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; - } + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - ret = bch2_dio_write_loop(dio); -err: - if (locked) - inode_unlock(&inode->v); - return ret; -err_put_bio: - bch2_pagecache_block_put(inode); - bio_put(bio); - inode_dio_end(&inode->v); - goto err; + return 0; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(&inode->v); - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; - ret = file_remove_privs(file); - if (ret) - goto unlock; + return bch2_write_inode(c, inode, inode_set_size, &s, fields); +} - ret = file_update_time(file); - if (ret) - goto unlock; +void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + inode->v.i_blocks += sectors; - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); - current->backing_dev_info = NULL; +#ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { + BUG_ON(sectors > quota_res->sectors); + BUG_ON(sectors > inode->ei_quota_reserved); - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: - return bch2_err_class(ret); + quota_res->sectors -= sectors; + inode->ei_quota_reserved -= sectors; + } else { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); + } +#endif } /* fsync: */ @@ -2814,31 +207,29 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); if (ret) goto err; - 
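/*
 * [Editor's note - the inode_set_size() / bch2_write_inode_size() pair
 * added above shows the bch2_write_inode() callback pattern: the caller
 * packs its update into a small struct, and the setter is applied to
 * the unpacked inode inside the btree transaction - so it can run more
 * than once on transaction restart and should stay idempotent. A
 * hypothetical setter of the same shape (sketch only):]
 */
static int inode_set_flags_sketch(struct btree_trans *trans,
				  struct bch_inode_info *inode,
				  struct bch_inode_unpacked *bi,
				  void *p)
{
	unsigned *flags = p;

	bi->bi_flags |= *flags;		/* idempotent: safe to replay */
	return 0;
}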
for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { ret = 1; break; } start = iter.pos; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -2848,8 +239,8 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_folio *s; - unsigned start_offset = start & (PAGE_SIZE - 1); - unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned start_offset; + unsigned end_offset; unsigned i; struct folio *folio; s64 i_sectors_delta = 0; @@ -2870,7 +261,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (unlikely(IS_ERR_OR_NULL(folio))) { + if (IS_ERR_OR_NULL(folio)) { ret = -ENOMEM; goto out; } @@ -2911,10 +302,10 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, s->s[i].nr_replicas = 0; i_sectors_delta -= s->s[i].state == SECTOR_dirty; - folio_sector_set(folio, s, i, SECTOR_unallocated); + bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); } - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); /* * Caller needs to know whether this folio will be written out by @@ -2998,31 +389,12 @@ static int bch2_extend(struct mnt_idmap *idmap, return bch2_setattr_nonsize(idmap, inode, iattr); } -static int bch2_truncate_finish_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - return 0; -} - -static int bch2_truncate_start_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - u64 *new_i_size = p; - - bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; - bi->bi_size = *new_i_size; - return 0; -} - -int bch2_truncate(struct mnt_idmap *idmap, +int bchfs_truncate(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_inode_unpacked inode_u; - u64 new_i_size = iattr->ia_size; s64 i_sectors_delta = 0; int ret = 0; @@ -3071,6 +443,8 @@ int bch2_truncate(struct mnt_idmap *idmap, if (unlikely(ret < 0)) goto err; + truncate_setsize(&inode->v, iattr->ia_size); + /* * When extending, we're going to write the new i_size to disk * immediately so we need to flush anything above the current on disk @@ -3092,32 +466,22 @@ int bch2_truncate(struct mnt_idmap *idmap, if (ret) goto err; - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, - &new_i_size, 0); - mutex_unlock(&inode->ei_update_lock); + ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - if (unlikely(ret)) + if (unlikely(ret)) { + /* + * If we error here, VFS caches are now inconsistent with btree + */ + set_bit(EI_INODE_ERROR, &inode->ei_flags); goto err; - - truncate_setsize(&inode->v, iattr->ia_size); - - ret = bch2_fpunch(c, inode_inum(inode), - round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + } 
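/*
 * [Editor's note - the shape above recurs throughout this file: the
 * low-level operation (bch2_truncate(), bch2_fpunch(), ...) reports the
 * sectors it added or removed through an s64 out-parameter, and the
 * caller folds that into the VFS inode via bch2_i_sectors_acct() so
 * i_blocks tracks the btree - note the accounting is applied even when
 * the operation errors partway. Caller shape (sketch):]
 */
	s64 i_sectors_delta = 0;
	int ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size,
				&i_sectors_delta);

	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);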
bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && !bch2_journal_error(&c->journal), c, "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", inode->v.i_ino, (u64) inode->v.i_blocks, inode->ei_inode.bi_sectors); - if (unlikely(ret)) - goto err; - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); - mutex_unlock(&inode->ei_update_lock); ret = bch2_setattr_nonsize(idmap, inode, iattr); err: @@ -3127,7 +491,8 @@ err: /* fallocate: */ -static int inode_update_times_fn(struct bch_inode_info *inode, +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -3159,7 +524,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len ret = bch2_fpunch(c, inode_inum(inode), block_start >> 9, block_end >> 9, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); @@ -3181,175 +546,33 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bkey_buf copy; - struct btree_trans trans; - struct btree_iter src, dst, del; - loff_t shift, new_size; - u64 src_start; + s64 i_sectors_delta = 0; int ret = 0; if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; if (insert) { - if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - return -EFBIG; - if (offset >= inode->v.i_size) return -EINVAL; - - src_start = U64_MAX; - shift = len; } else { if (offset + len >= inode->v.i_size) return -EINVAL; - - src_start = offset + len; - shift = -len; } - new_size = inode->v.i_size + shift; - - ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) return ret; - if (insert) { - i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - } else { - s64 i_sectors_delta = 0; - - ret = bch2_fpunch(c, inode_inum(inode), - offset >> 9, (offset + len) >> 9, - &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); - - if (ret) - return ret; - } - - bch2_bkey_buf_init(©); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, - POS(inode->v.i_ino, src_start >> 9), - BTREE_ITER_INTENT); - bch2_trans_copy_iter(&dst, &src); - bch2_trans_copy_iter(&del, &src); - - while (ret == 0 || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - struct bkey_s_c k; - struct bpos next_pos; - struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); - struct bpos atomic_end; - unsigned trigger_flags = 0; - u32 snapshot; - - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, - inode->ei_subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(&src, snapshot); - bch2_btree_iter_set_snapshot(&dst, snapshot); - bch2_btree_iter_set_snapshot(&del, snapshot); - - bch2_trans_begin(&trans); - - k = insert - ? 
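/*
 * [Editor's note - in the open-coded implementation being removed
 * around this point, the iteration direction depends on the mode:
 * FALLOC_FL_INSERT_RANGE shifts extents to higher offsets, so it walks
 * backwards from the end (bch2_btree_iter_peek_prev()) to avoid
 * clobbering keys it hasn't moved yet, while collapse shifts extents
 * down and can walk forward (bch2_btree_iter_peek_upto()). The rewrite
 * moves this whole loop behind the new bch2_fcollapse_finsert()
 * helper.]
 */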
bch2_btree_iter_peek_prev(&src) - : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); - if ((ret = bkey_err(k))) - continue; - - if (!k.k || k.k->p.inode != inode->v.i_ino) - break; - - if (insert && - bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) - break; -reassemble: - bch2_bkey_buf_reassemble(©, c, k); - - if (insert && - bkey_lt(bkey_start_pos(k.k), move_pos)) - bch2_cut_front(move_pos, copy.k); - - copy.k->k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); - - ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); - if (ret) - continue; - - if (!bkey_eq(atomic_end, copy.k->k.p)) { - if (insert) { - move_pos = atomic_end; - move_pos.offset -= shift >> 9; - goto reassemble; - } else { - bch2_cut_back(atomic_end, copy.k); - } - } - - bkey_init(&delete.k); - delete.k.p = copy.k->k.p; - delete.k.size = copy.k->k.size; - delete.k.p.offset -= shift >> 9; - bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); - - next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; - - if (copy.k->k.size != k.k->size) { - /* We might end up splitting compressed extents: */ - unsigned nr_ptrs = - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); - - ret = bch2_disk_reservation_get(c, &disk_res, - copy.k->k.size, nr_ptrs, - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - } - - ret = bch2_btree_iter_traverse(&del) ?: - bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(c, &disk_res); - - if (!ret) - bch2_btree_iter_set_pos(&src, next_pos); - } - bch2_trans_iter_exit(&trans, &del); - bch2_trans_iter_exit(&trans, &dst); - bch2_trans_iter_exit(&trans, &src); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(©, c); + if (insert) + i_size_write(&inode->v, inode->v.i_size + len); - if (ret) - return ret; + ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, + insert, &i_sectors_delta); + if (!ret && !insert) + i_size_write(&inode->v, inode->v.i_size - len); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - mutex_lock(&inode->ei_update_lock); - if (!insert) { - i_size_write(&inode->v, new_size); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - } else { - /* We need an inode update to update bi_journal_seq for fsync: */ - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&inode->ei_update_lock); return ret; } @@ -3357,16 +580,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); struct bch_io_opts opts; int ret = 0; bch2_inode_opts_get(&opts, c, &inode->ei_inode); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -3379,9 +601,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 hole_start, hole_end; u32 snapshot; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto bkey_err; @@ 
@@ -3410,11 +632,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 		}
 
 		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+			/*
+			 * Lock ordering - can't be holding btree locks while
+			 * blocking on a folio lock:
+			 */
 			if (bch2_clamp_data_hole(&inode->v,
 						 &hole_start,
 						 &hole_end,
 						 opts.data_replicas, true))
-				ret = drop_locks_do(&trans,
+				ret = drop_locks_do(trans,
 					(bch2_clamp_data_hole(&inode->v,
 							      &hole_start,
 							      &hole_end,
@@ -3437,16 +663,16 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 				goto bkey_err;
 		}
 
-		ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
+		ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
 					    sectors, opts, &i_sectors_delta,
 					    writepoint_hashed((unsigned long) current));
 		if (ret)
 			goto bkey_err;
 
-		i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 
-		drop_locks_do(&trans,
-			(mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+		drop_locks_do(trans,
+			(bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
 bkey_err:
 		bch2_quota_reservation_put(c, inode, &quota_res);
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -3457,14 +683,14 @@ bkey_err:
 		struct quota_res quota_res = { 0 };
 		s64 i_sectors_delta = 0;
 
-		bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+		bch2_fpunch_at(trans, &iter, inode_inum(inode),
 			       end_sector, &i_sectors_delta);
 
-		i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 		bch2_quota_reservation_put(c, inode, &quota_res);
 	}
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -3570,26 +796,24 @@ static int quota_reserve_range(struct bch_inode_info *inode,
 				u64 start, u64 end)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	u32 snapshot;
 	u64 sectors = end - start;
 	u64 pos = start;
 	int ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(inode->v.i_ino, pos, snapshot), 0);
 
-	while (!(ret = btree_trans_too_many_iters(&trans)) &&
+	while (!(ret = btree_trans_too_many_iters(trans)) &&
 	       (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
 	       !(ret = bkey_err(k))) {
 		if (bkey_extent_is_allocation(k.k)) {
@@ -3601,17 +825,14 @@ retry:
 		bch2_btree_iter_advance(&iter);
 	}
 	pos = iter.pos.offset;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
-
-	if (ret)
-		return ret;
+	bch2_trans_put(trans);
 
-	return bch2_quota_reservation_add(c, inode, res, sectors, true);
+	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
 }
 
 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
@@ -3653,7 +874,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
 	aligned_len = round_up((u64) len, block_bytes(c));
 
-	ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+	ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
 				pos_dst, pos_dst + len - 1);
 	if (ret)
 		goto err;
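quota_reserve_range() above now funnels its exit path through `return ret ?: bch2_quota_reservation_add(...)`. This relies on the GNU `a ?: b` extension: `a` is evaluated once and returned when nonzero, so an earlier error short-circuits and the quota reservation is only taken on success. A standalone sketch of the idiom, with both helpers hypothetical:

/* Illustrative sketch, not part of the patch; step_one()/step_two() are
 * hypothetical helpers standing in for the real lookup and reservation. */
static int step_one(void) { return 0; }
static int step_two(void) { return 0; }

static int chained(void)
{
	int ret = step_one();

	/* GNU extension: ret ?: x  ==  ret ? ret : x */
	return ret ?: step_two();
}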
@@ -3665,7 +886,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
 	file_update_time(file_dst);
 
-	mark_pagecache_unallocated(src, pos_src >> 9,
+	bch2_mark_pagecache_unallocated(src, pos_src >> 9,
 				   (pos_src + aligned_len) >> 9);
 
 	ret = bch2_remap_range(c,
@@ -3681,7 +902,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 	 */
 	ret = min((u64) ret << 9, (u64) len);
 
-	i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
 
 	spin_lock(&dst->v.i_lock);
 	if (pos_dst + ret > dst->v.i_size)
@@ -3700,73 +921,11 @@ err:
 
 /* fseek: */
 
-static int folio_data_offset(struct folio *folio, loff_t pos,
-			     unsigned min_replicas)
-{
-	struct bch_folio *s = bch2_folio(folio);
-	unsigned i, sectors = folio_sectors(folio);
-
-	if (s)
-		for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
-			if (s->s[i].state >= SECTOR_dirty &&
-			    s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
-				return i << SECTOR_SHIFT;
-
-	return -1;
-}
-
-static loff_t bch2_seek_pagecache_data(struct inode *vinode,
-				       loff_t start_offset,
-				       loff_t end_offset,
-				       unsigned min_replicas,
-				       bool nonblock)
-{
-	struct folio_batch fbatch;
-	pgoff_t start_index	= start_offset >> PAGE_SHIFT;
-	pgoff_t end_index	= end_offset >> PAGE_SHIFT;
-	pgoff_t index		= start_index;
-	unsigned i;
-	loff_t ret;
-	int offset;
-
-	folio_batch_init(&fbatch);
-
-	while (filemap_get_folios(vinode->i_mapping,
-				  &index, end_index, &fbatch)) {
-		for (i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
-
-			if (!nonblock) {
-				folio_lock(folio);
-			} else if (!folio_trylock(folio)) {
-				folio_batch_release(&fbatch);
-				return -EAGAIN;
-			}
-
-			offset = folio_data_offset(folio,
-					max(folio_pos(folio), start_offset),
-					min_replicas);
-			if (offset >= 0) {
-				ret = clamp(folio_pos(folio) + offset,
-					    start_offset, end_offset);
-				folio_unlock(folio);
-				folio_batch_release(&fbatch);
-				return ret;
-			}
-			folio_unlock(folio);
-		}
-		folio_batch_release(&fbatch);
-		cond_resched();
-	}
-
-	return end_offset;
-}
-
 static loff_t bch2_seek_data(struct file *file, u64 offset)
 {
 	struct bch_inode_info *inode = file_bch_inode(file);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	subvol_inum inum = inode_inum(inode);
@@ -3778,15 +937,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 	if (offset >= isize)
 		return -ENXIO;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
 			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
 			   POS(inode->v.i_ino, U64_MAX),
 			   0, k, ret) {
@@ -3796,12 +955,12 @@ retry:
 		} else if (k.k->p.offset >> 9 > isize)
 			break;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	if (ret)
 		return ret;
@@ -3815,93 +974,11 @@ err:
 	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
 }
 
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
-			     unsigned min_replicas, bool nonblock)
-{
-	struct folio *folio;
-	struct bch_folio *s;
-
unsigned i, sectors; - bool ret = true; - - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, - !nonblock ? FGP_LOCK : 0, 0); - if (IS_ERR_OR_NULL(folio)) - return true; - - if (nonblock && !folio_trylock(folio)) { - folio_put(folio); - return -EAGAIN; - } - - s = bch2_folio(folio); - if (!s) - goto unlock; - - sectors = folio_sectors(folio); - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty || - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { - *offset = max(*offset, - folio_pos(folio) + (i << SECTOR_SHIFT)); - goto unlock; - } - - *offset = folio_end_pos(folio); - ret = false; -unlock: - folio_unlock(folio); - folio_put(folio); - return ret; -} - -static loff_t bch2_seek_pagecache_hole(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset; - - while (offset < end_offset && - !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) - ; - - return min(offset, end_offset); -} - -static int bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas, - bool nonblock) -{ - loff_t ret; - - ret = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_start = ret; - - if (*hole_start == *hole_end) - return 0; - - ret = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_end = ret; - return 0; -} - static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; subvol_inum inum = inode_inum(inode); @@ -3913,15 +990,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { @@ -3939,12 +1016,12 @@ retry: offset = max(offset, bkey_start_offset(k.k) << 9); } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; @@ -3981,28 +1058,10 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) void bch2_fs_fsio_exit(struct bch_fs *c) { bioset_exit(&c->nocow_flush_bioset); - bioset_exit(&c->dio_write_bioset); - bioset_exit(&c->dio_read_bioset); - bioset_exit(&c->writepage_bioset); } int bch2_fs_fsio_init(struct bch_fs *c) { - if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_writepage_bioset_init; - - if (bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_read_bioset_init; - - if 
(bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_write_bioset_init; - if (bioset_init(&c->nocow_flush_bioset, 1, offsetof(struct nocow_flush, bio), 0)) return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index af90533..ca70346 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -5,32 +5,167 @@ #ifndef NO_BCACHEFS_FS #include "buckets.h" -#include "io_types.h" +#include "fs.h" +#include "io_write_types.h" +#include "quota.h" #include -struct quota_res; +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + +struct quota_res { + u64 sectors; +}; + +#ifdef CONFIG_BCACHEFS_QUOTA + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + BUG_ON(res->sectors > inode->ei_quota_reserved); + + bch2_quota_acct(c, inode->ei_qid, Q_SPC, + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); + inode->ei_quota_reserved -= res->sectors; + res->sectors = 0; +} + +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (res->sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + u64 sectors, + bool check_enospc) +{ + int ret; + + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + + mutex_lock(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); + if (likely(!ret)) { + inode->ei_quota_reserved += sectors; + res->sectors += sectors; + } + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} -int __must_check bch2_write_inode_size(struct bch_fs *, - struct bch_inode_info *, - loff_t, unsigned); +#else + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} -int bch2_read_folio(struct file *, struct folio *); +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} -int bch2_writepages(struct address_space *, struct writeback_control *); -void bch2_readahead(struct readahead_control *); +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + return 0; +} -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); +#endif -ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); +void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, + struct quota_res *, s64); + +static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + if (sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, quota_res, sectors); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + +void bch2_inode_flush_nocow_writes_async(struct bch_fs *, + struct bch_inode_info *, struct closure *); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct mnt_idmap *, +int bchfs_truncate(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); @@ -39,11 +174,6 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t bch2_llseek(struct file *, loff_t, int); -vm_fault_t bch2_page_fault(struct vm_fault *); -vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidate_folio(struct folio *, size_t, size_t); -bool bch2_release_folio(struct folio *, gfp_t); - void bch2_fs_fsio_exit(struct bch_fs *); int bch2_fs_fsio_init(struct bch_fs *); #else diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index dfa1bf7..d7c1b05 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -31,7 +31,8 @@ struct flags_set { bool projinherit; }; -static int bch2_inode_flags_set(struct bch_inode_info *inode, +static int bch2_inode_flags_set(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -44,13 +45,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, unsigned newflags = s->flags; unsigned oldflags = bi->bi_flags & s->mask; - if 
(((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && + if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; if (!S_ISREG(bi->bi_mode) && !S_ISDIR(bi->bi_mode) && - (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) return -EINVAL; if (s->set_projinherit) { @@ -121,10 +122,14 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - return copy_to_user(arg, &fa, sizeof(fa)); + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + + return 0; } -static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, +static int fssetxattr_inode_update_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -135,7 +140,7 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, bi->bi_project = s->projid; } - return bch2_inode_flags_set(inode, bi, p); + return bch2_inode_flags_set(trans, inode, bi, p); } static int bch2_ioc_fssetxattr(struct bch_fs *c, @@ -192,7 +197,8 @@ err: return ret; } -static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, +static int bch2_reinherit_attrs_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -312,8 +318,8 @@ err: return ret; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -434,36 +440,48 @@ err1: return error; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + down_write(&c->snapshot_create_lock); + long ret = __bch2_ioctl_subvolume_create(c, filp, arg); + up_write(&c->snapshot_create_lock); + + return ret; +} + static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { + struct filename *name; struct path path; struct inode *dir; + struct dentry *victim; int ret = 0; if (arg.flags) return -EINVAL; - ret = user_path_at(arg.dirfd, - (const char __user *)(unsigned long)arg.dst_ptr, - LOOKUP_FOLLOW, &path); - if (ret) - return ret; + name = getname((const char __user *)(unsigned long)arg.dst_ptr); + victim = filename_path_locked(arg.dirfd, name, &path); + putname(name); + if (IS_ERR(victim)) + return PTR_ERR(victim); - if (path.dentry->d_sb->s_fs_info != c) { + if (victim->d_sb->s_fs_info != c) { ret = -EXDEV; goto err; } - dir = path.dentry->d_parent->d_inode; - - ret = __bch2_unlink(dir, path.dentry, true); - if (ret) - goto err; - - fsnotify_rmdir(dir, path.dentry); - d_delete(path.dentry); + dir = d_inode(path.dentry); + ret = __bch2_unlink(dir, victim, true); + if (!ret) { + fsnotify_rmdir(dir, victim); + d_delete(victim); + } + inode_unlock(dir); err: + dput(victim); path_put(&path); return ret; } diff --git a/libbcachefs/fs-ioctl.h b/libbcachefs/fs-ioctl.h index f201980..d30f9bb 100644 --- a/libbcachefs/fs-ioctl.h +++ b/libbcachefs/fs-ioctl.h @@ -5,29 +5,29 @@ /* Inode flags: */ /* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, +static const __maybe_unused unsigned bch_flags_to_vfs[] = 
{ + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +static const __maybe_unused unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, +static const __maybe_unused unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_sync] = FS_XFLAG_SYNC, + [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_append] = FS_XFLAG_APPEND, + [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, + [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; }; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 8d2f388..f76d403 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -14,12 +14,16 @@ #include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" #include "journal.h" #include "keylist.h" #include "quota.h" +#include "snapshot.h" #include "super.h" #include "xattr.h" @@ -62,11 +66,11 @@ void bch2_inode_update_after_write(struct btree_trans *trans, inode->v.i_mode = bi->bi_mode; if (fields & ATTR_ATIME) - inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) - inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); if (fields & ATTR_CTIME) - inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); + inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); inode->ei_inode = *bi; @@ -78,29 +82,27 @@ int __must_check bch2_write_inode(struct bch_fs *c, inode_set_fn set, void *p, unsigned fields) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; - - bch2_trans_init(&trans, c, 0, 512); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: - (set ? set(inode, &inode_u, p) : 0) ?: - bch2_inode_write(&trans, &iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + (set ? 
set(trans, inode, &inode_u, p) : 0) ?: + bch2_inode_write(trans, &iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(&trans, inode, &inode_u, fields); + bch2_inode_update_after_write(trans, inode, &inode_u, fields); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -110,7 +112,7 @@ retry: inode_inum(inode).subvol, inode_inum(inode).inum); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret < 0 ? ret : 0; } @@ -178,7 +180,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; - struct btree_trans trans; + struct btree_trans *trans; struct bch_subvolume subvol; int ret; @@ -192,18 +194,18 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - bch2_trans_init(&trans, c, 8, 0); - ret = lockrestart_do(&trans, - bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + trans = bch2_trans_get(c); + ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); if (!ret) - bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); - bch2_trans_exit(&trans); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + bch2_trans_put(trans); if (ret) { iget_failed(&inode->v); - return ERR_PTR(ret); + return ERR_PTR(bch2_err_class(ret)); } mutex_lock(&c->vfs_inodes_lock); @@ -222,7 +224,7 @@ __bch2_create(struct mnt_idmap *idmap, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; @@ -252,13 +254,11 @@ __bch2_create(struct mnt_idmap *idmap, if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c, 8, - 2048 + (!(flags & BCH_CREATE_TMPFILE) - ? dentry->d_name.len : 0)); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_create_trans(&trans, + ret = bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? 
&dentry->d_name : NULL, @@ -274,9 +274,9 @@ retry: inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(&trans, inum.subvol, true, + ret = bch2_subvolume_get(trans, inum.subvol, true, BTREE_ITER_WITH_UPDATES, &subvol) ?: - bch2_trans_commit(&trans, NULL, &journal_seq, 0); + bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -287,13 +287,13 @@ err_before_quota: } if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&dir->ei_update_lock); } bch2_iget5_set(&inode->v, &inum); - bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -333,7 +333,7 @@ err_before_quota: unlock_new_inode(&inode->v); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); err: posix_acl_release(default_acl); posix_acl_release(acl); @@ -342,7 +342,7 @@ err_trans: if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); make_bad_inode(&inode->v); iput(&inode->v); inode = ERR_PTR(ret); @@ -397,26 +397,25 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_info *dir, struct dentry *dentry) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c, 4, 1024); - ret = commit_do(&trans, NULL, NULL, 0, - bch2_link_trans(&trans, + ret = commit_do(trans, NULL, NULL, 0, + bch2_link_trans(trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); mutex_unlock(&inode->ei_update_lock); return ret; } @@ -447,24 +446,23 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); - bch2_trans_init(&trans, c, 4, 1024); - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, deleting_snapshot)); if (unlikely(ret)) goto err; - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); if (inode_u.bi_subvol) { @@ -475,8 +473,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, set_nlink(&inode->v, 0); } err: - bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_put(trans); return ret; } @@ -539,7 +537,7 @@ static int bch2_rename2(struct 
mnt_idmap *idmap, struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u; - struct btree_trans trans; + struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode @@ -556,7 +554,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } - bch2_trans_init(&trans, c, 8, 2048); + trans = bch2_trans_get(c); bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, @@ -583,8 +581,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = commit_do(&trans, NULL, NULL, 0, - bch2_rename_trans(&trans, + ret = commit_do(trans, NULL, NULL, 0, + bch2_rename_trans(trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, &src_inode_u, @@ -599,21 +597,21 @@ static int bch2_rename2(struct mnt_idmap *idmap, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, + bch2_inode_update_after_write(trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); if (src_dir != dst_dir) - bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, + bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, + bch2_inode_update_after_write(trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) - bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, + bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_fs_quota_transfer(c, src_inode, bch_qid(&src_inode->ei_inode), @@ -676,7 +674,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; @@ -697,13 +695,13 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, if (ret) goto err; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); kfree(acl); acl = NULL; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -711,29 +709,29 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } - ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); btree_err: - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); err_trans: - bch2_trans_exit(&trans); + bch2_trans_put(trans); err: mutex_unlock(&inode->ei_update_lock); @@ -755,9 +753,9 @@ static int bch2_getattr(struct mnt_idmap *idmap, 
stat->gid = inode->v.i_gid; stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); - stat->atime = inode->v.i_atime; - stat->mtime = inode->v.i_mtime; - stat->ctime = inode->v.i_ctime; + stat->atime = inode_get_atime(&inode->v); + stat->mtime = inode_get_mtime(&inode->v); + stat->ctime = inode_get_ctime(&inode->v); stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; @@ -766,15 +764,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); } - if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) + if (inode->ei_inode.bi_flags & BCH_INODE_immutable) stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= STATX_ATTR_IMMUTABLE; - if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) + if (inode->ei_inode.bi_flags & BCH_INODE_append) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= STATX_ATTR_APPEND; - if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) + if (inode->ei_inode.bi_flags & BCH_INODE_nodump) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_NODUMP; @@ -794,7 +792,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(idmap, inode, iattr) + ? bchfs_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } @@ -875,7 +873,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; @@ -896,18 +894,18 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(ei->v.i_ino, start, snapshot), 0); - while (!(ret = btree_trans_too_many_iters(&trans)) && + while (!(ret = btree_trans_too_many_iters(trans)) && (k = bch2_btree_iter_peek_upto(&iter, end)).k && !(ret = bkey_err(k))) { enum btree_id data_btree = BTREE_ID_extents; @@ -924,7 +922,7 @@ retry: bch2_bkey_buf_reassemble(&cur, c, k); - ret = bch2_read_indirect_extent(&trans, &data_btree, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) break; @@ -943,7 +941,7 @@ retry: cur.k->k.p.offset += cur.k->k.size; if (have_extent) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 0); if (ret) @@ -957,18 +955,18 @@ retry: POS(iter.pos.inode, iter.pos.offset + sectors)); } start = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret && have_extent) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
ret : 0; @@ -1000,11 +998,16 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode_inum(inode), ctx); + ret = bch2_readdir(c, inode_inum(inode), ctx); + if (ret) + bch_err_fn(c, ret); + + return bch2_err_class(ret); } static const struct file_operations bch_file_operations = { @@ -1210,9 +1213,6 @@ static struct dentry *bch2_get_parent(struct dentry *child) .inum = inode->ei_inode.bi_dir, }; - if (!parent_inum.inum) - return NULL; - return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); } @@ -1221,7 +1221,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_inode_info *dir = to_bch_ei(parent->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter1; struct btree_iter iter2; struct bkey_s_c k; @@ -1229,29 +1229,30 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child struct bch_inode_unpacked inode_u; subvol_inum target; u32 snapshot; - unsigned name_len; + struct qstr dirent_name; + unsigned name_len = 0; int ret; if (!S_ISDIR(dir->v.i_mode)) return -EINVAL; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); - bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(&iter1, snapshot); bch2_btree_iter_set_snapshot(&iter2, snapshot); - ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; @@ -1269,7 +1270,7 @@ retry: } d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) @@ -1291,7 +1292,7 @@ retry: continue; d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret < 0) break; if (ret) @@ -1306,17 +1307,18 @@ retry: ret = -ENOENT; goto err; found: - name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + dirent_name = bch2_dirent_get_name(d); - memcpy(name, d.v->d_name, name_len); + name_len = min_t(unsigned, dirent_name.len, NAME_MAX); + memcpy(name, dirent_name.name, name_len); name[name_len] = '\0'; err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter1); - bch2_trans_iter_exit(&trans, &iter2); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter1); + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_put(trans); return ret; } @@ -1406,15 +1408,16 @@ static void bch2_destroy_inode(struct inode *vinode) call_rcu(&vinode->i_rcu, bch2_i_callback); } -static int inode_update_times_fn(struct bch_inode_info 
*inode, +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); - bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); - bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); + bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); + bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); + bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); return 0; } @@ -1589,7 +1592,7 @@ static struct bch_fs *bch2_path_to_fs(const char *path) static char **split_devs(const char *_dev_name, unsigned *nr) { char *dev_name = NULL, **devs = NULL, *s; - size_t i, nr_devs = 0; + size_t i = 0, nr_devs = 0; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) @@ -1604,9 +1607,7 @@ static char **split_devs(const char *_dev_name, unsigned *nr) return NULL; } - for (i = 0, s = dev_name; - s; - (s = strchr(s, ':')) && (*s++ = '\0')) + while ((s = strsep(&dev_name, ":"))) devs[i++] = s; *nr = nr_devs; @@ -1649,7 +1650,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) up_write(&c->state_lock); } - if (opts.errors >= 0) + if (opt_defined(opts, errors)) c->opts.errors = opts.errors; err: return bch2_err_class(ret); @@ -1710,6 +1711,35 @@ static void bch2_put_super(struct super_block *sb) __bch2_fs_stop(c); } +/* + * bcachefs doesn't currently integrate intwrite freeze protection but the + * internal write references serve the same purpose. Therefore reuse the + * read-only transition code to perform the quiesce. The caveat is that we don't + * currently have the ability to block tasks that want a write reference while + * the superblock is frozen. This is fine for now, but we should either add + * blocking support or find a way to integrate sb_start_intwrite() and friends. 
+ */ +static int bch2_freeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); + return 0; +} + +static int bch2_unfreeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + down_write(&c->state_lock); + ret = bch2_fs_read_write(c); + up_write(&c->state_lock); + return ret; +} + static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, @@ -1721,10 +1751,8 @@ static const struct super_operations bch_super_operations = { .show_options = bch2_show_options, .remount_fs = bch2_remount, .put_super = bch2_put_super, -#if 0 .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, -#endif }; static int bch2_set_super(struct super_block *s, void *data) @@ -1878,7 +1906,7 @@ got_sb: vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); if (ret) { - bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "mounting: error getting root inode"); goto err_put_super; } diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 6170d21..5edf1d4 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -174,7 +174,8 @@ static inline int bch2_set_projid(struct bch_fs *c, struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct bch_inode_info *, +typedef int (*inode_set_fn)(struct btree_trans *, + struct bch_inode_info *, struct bch_inode_unpacked *, void *); void bch2_inode_update_after_write(struct btree_trans *, @@ -196,7 +197,7 @@ int bch2_vfs_init(void); #else -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 0852dbe..cc90279 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_update.h" #include "buckets.h" #include "darray.h" @@ -11,7 +12,8 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" -#include "subvolume.h" +#include "recovery.h" +#include "snapshot.h" #include "super.h" #include "xattr.h" @@ -79,7 +81,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, if (!ret) *subvol = le32_to_cpu(s.subvol); else if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not fonud", snapshot); + bch_err(trans->c, "snapshot %u not found", snapshot); return ret; } @@ -125,9 +127,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error fetching inode %llu: %s", - inode_nr, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -152,9 +152,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error fetching inode %llu:%u: %s", - inode_nr, *snapshot, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "fetching inode %llu:%u", 
inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -205,17 +203,16 @@ static int __write_inode(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); } -static int write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) { int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, __write_inode(trans, inode, snapshot)); if (ret) - bch_err(trans->c, "error in fsck: error updating inode: %s", - bch2_err_str(ret)); + bch_err_fn(trans->c, ret); return ret; } @@ -240,8 +237,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -276,14 +272,13 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, goto create_lostfound; } - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) return ret; if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return ret; + return -BCH_ERR_ENOENT_not_directory; } /* @@ -299,8 +294,7 @@ create_lostfound: lostfound, &lostfound_str, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating lost+found"); return ret; } @@ -360,15 +354,10 @@ static int reattach_inode(struct btree_trans *trans, u32 inode_snapshot) { int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_lazy_rw| + BCH_TRANS_COMMIT_no_enospc, __reattach_inode(trans, inode, inode_snapshot)); - if (ret) { - bch_err(trans->c, "error reattaching inode %llu: %s", - inode->bi_inum, bch2_err_str(ret)); - return ret; - } - + bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); return ret; } @@ -408,6 +397,28 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) memset(s, 0, sizeof(*s)); } +static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + struct snapshots_seen_entry *i, n = { + .id = id, + .equiv = bch2_snapshot_equiv(c, id), + }; + int ret = 0; + + darray_for_each(s->ids, i) { + if (i->id == id) + return 0; + if (i->id > id) + break; + } + + ret = darray_insert_item(&s->ids, i - s->ids.data, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { @@ -434,9 +445,10 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, if (i->equiv == n.equiv) { bch_err(c, "snapshot deletion did not finish:\n" " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_ids[btree_id], + bch2_btree_id_str(btree_id), pos.inode, pos.offset, i->id, n.id, n.equiv); + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } @@ -452,7 +464,12 @@ static int snapshots_seen_update(struct bch_fs *c, struct 
snapshots_seen *s, * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, * and @ancestor hasn't been overwritten in @seen * - * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + * @c: filesystem handle + * @seen: list of snapshot ids already seen at current position + * @id: descendent snapshot id + * @ancestor: ancestor snapshot id + * + * Returns: whether key in @ancestor snapshot is visible in @id snapshot */ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, u32 id, u32 ancestor) @@ -497,14 +514,16 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see * snapshot id @dst, test whether there is some snapshot in which @dst is * visible. * - * This assumes we're visiting @src keys in natural key order. + * @c: filesystem handle + * @s: list of snapshot IDs already seen at @src + * @src: snapshot ID of src key + * @dst: snapshot ID of dst key + * Returns: true if there is some snapshot in which @dst is visible * - * @s - list of snapshot IDs already seen at @src - * @src - snapshot ID of src key - * @dst - snapshot ID of dst key + * Assumes we're visiting @src keys in natural key order */ -static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, - u32 src, u32 dst) +static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) { return dst <= src ? key_visible_in_snapshot(c, s, dst, src) @@ -595,10 +614,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->first_this_inode = true; - if (trans_was_restarted(trans, restart_count)) - return -BCH_ERR_transaction_restart_nested; - - return 0; + return trans_was_restarted(trans, restart_count); } static struct inode_walker_entry * @@ -705,8 +721,9 @@ static int check_key_has_snapshot(struct btree_trans *trans, int ret = 0; if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + bkey_in_missing_snapshot, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; fsck_err: @@ -740,8 +757,8 @@ static int hash_redo_key(struct btree_trans *trans, BCH_HASH_SET_MUST_CREATE, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw); } static int hash_check_key(struct btree_trans *trans, @@ -775,6 +792,7 @@ static int hash_check_key(struct btree_trans *trans, if (fsck_err_on(k.k->type == desc.key_type && !desc.cmp_bkey(k, hash_k), c, + hash_table_key_duplicate, "duplicate hash table keys:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), @@ -793,13 +811,13 @@ out: printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + if (fsck_err(c, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "hash_redo_key err %s", 
bch2_err_str(ret));
+		bch_err_fn(c, ret);
 		if (ret)
 			return ret;
 
 		ret = -BCH_ERR_transaction_restart_nested;
@@ -830,52 +848,65 @@ static int check_inode(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	/*
-	 * if snapshot id isn't a leaf node, skip it - deletion in
-	 * particular is not atomic, so on the internal snapshot nodes
-	 * we can see inodes marked for deletion after a clean shutdown
-	 */
-	if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot))
-		return 0;
-
 	if (!bkey_is_inode(k.k))
 		return 0;
 
 	BUG_ON(bch2_inode_unpack(k, &u));
 
 	if (!full &&
-	    !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
-			    BCH_INODE_I_SECTORS_DIRTY|
-			    BCH_INODE_UNLINKED)))
+	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
+			    BCH_INODE_i_sectors_dirty|
+			    BCH_INODE_unlinked)))
 		return 0;
 
 	if (prev->bi_inum != u.bi_inum)
 		*prev = u;
 
 	if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
-			inode_d_type(prev) != inode_d_type(&u), c,
+			inode_d_type(prev) != inode_d_type(&u),
+			c, inode_snapshot_mismatch,
 			"inodes in different snapshots don't match")) {
 		bch_err(c, "repair not implemented yet");
 		return -EINVAL;
 	}
 
-	if (u.bi_flags & BCH_INODE_UNLINKED &&
+	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
+	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
+		struct bpos new_min_pos;
+
+		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+		if (ret)
+			goto err;
+
+		u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
+
+		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		bch_err_msg(c, ret, "in fsck updating inode");
+		if (ret)
+			return ret;
+
+		if (!bpos_eq(new_min_pos, POS_MIN))
+			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
+		return 0;
+	}
+
+	if (u.bi_flags & BCH_INODE_unlinked &&
 	    (!c->sb.clean ||
-	     fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+	     fsck_err(c, inode_unlinked_but_clean,
+		      "filesystem marked clean, but inode %llu unlinked",
 		      u.bi_inum))) {
 		bch2_trans_unlock(trans);
 		bch2_fs_lazy_rw(c);
 
 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
-		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch_err(c, "error in fsck: error while deleting inode: %s",
-				bch2_err_str(ret));
+		bch_err_msg(c, ret, "in fsck deleting inode");
 		return ret;
 	}
 
-	if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+	if (u.bi_flags & BCH_INODE_i_size_dirty &&
 	    (!c->sb.clean ||
-	     fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+	     fsck_err(c, inode_i_size_dirty_but_clean,
+		      "filesystem marked clean, but inode %llu has i_size dirty",
 		      u.bi_inum))) {
 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
@@ -891,9 +922,7 @@ static int check_inode(struct btree_trans *trans,
 				iter->pos.snapshot),
 			POS(u.bi_inum, U64_MAX),
 			0, NULL);
-		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch_err(c, "error in fsck: error truncating inode: %s",
-				bch2_err_str(ret));
+		bch_err_msg(c, ret, "in fsck truncating inode");
 		if (ret)
 			return ret;
 
@@ -901,15 +930,16 @@ static int check_inode(struct btree_trans *trans,
 		 * We truncated without our normal sector accounting hook, just
 		 * make sure we recalculate it:
 		 */
-		u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+		u.bi_flags |= BCH_INODE_i_sectors_dirty;
 
-		u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+		u.bi_flags &= ~BCH_INODE_i_size_dirty;
 		do_update = true;
 	}
 
-	if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
 	    (!c->sb.clean ||
-	     fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+	     fsck_err(c, 
inode_i_sectors_dirty_but_clean, + "filesystem marked clean, but inode %llu has i_sectors dirty", u.bi_inum))) { s64 sectors; @@ -918,33 +948,31 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { - bch_err(c, "error in fsck: error recounting inode sectors: %s", - bch2_err_str(sectors)); + bch_err_msg(c, sectors, "in fsck recounting inode sectors"); return sectors; } u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + u.bi_flags &= ~BCH_INODE_i_sectors_dirty; do_update = true; } - if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { + if (u.bi_flags & BCH_INODE_backptr_untrusted) { u.bi_dir = 0; u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; + u.bi_flags &= ~BCH_INODE_backptr_untrusted; do_update = true; } if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) - bch_err(c, "error in fsck: error updating inode: %s", - bch2_err_str(ret)); + return ret; } err: fsck_err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -952,7 +980,7 @@ noinline_for_stack int bch2_check_inodes(struct bch_fs *c) { bool full = c->opts.fsck; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; @@ -960,18 +988,16 @@ int bch2_check_inodes(struct bch_fs *c) int ret; snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_inode(&trans, &iter, k, &prev, &s, full)); + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + check_inode(trans, &iter, k, &prev, &s, full)); - bch2_trans_exit(&trans); snapshots_seen_exit(&s); - if (ret) - bch_err_fn(c, ret); + bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -997,25 +1023,6 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d, : le64_to_cpu(d.v->d_inum) == inode->bi_inum; } -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; - - d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - ret = bkey_err(d); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - - ret = dirent_points_to_inode(d, inode); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1039,22 +1046,20 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) return -BCH_ERR_internal_fsck_err; } - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, - i->inode.bi_sectors, i->count)) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), + c, inode_i_sectors_wrong, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, + i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); + ret = fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } } fsck_err: - if (ret) - bch_err_fn(c, ret); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + bch_err_fn(c, ret); + return ret ?: trans_was_restarted(trans, restart_count); } struct extent_end { @@ -1122,7 +1127,8 @@ static int extent_ends_at(struct bch_fs *c, static int overlapping_extents_found(struct btree_trans *trans, enum btree_id btree, - struct bpos pos1, struct bkey pos2, + struct bpos pos1, struct snapshots_seen *pos1_seen, + struct bkey pos2, bool *fixed, struct extent_end *extent_end) { @@ -1185,7 +1191,8 @@ static int overlapping_extents_found(struct btree_trans *trans, prt_printf(&buf, "\n overwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); - if (fsck_err(c, "overlapping extents%s", buf.buf)) { + if (fsck_err(c, extent_overlapping, + "overlapping extents%s", buf.buf)) { struct btree_iter *old_iter = &iter1; struct disk_reservation res = { 0 }; @@ -1200,7 +1207,7 @@ static int overlapping_extents_found(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, k1, k2) ?: bch2_trans_commit(trans, &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); if (ret) @@ -1208,10 +1215,24 @@ static int overlapping_extents_found(struct btree_trans *trans, *fixed = true; - if (pos1.snapshot == pos2.p.snapshot) + if (pos1.snapshot == pos2.p.snapshot) { + /* + * We overwrote the first extent, and did the overwrite + * in the same snapshot: + */ extent_end->offset = bkey_start_offset(&pos2); - else + } else if (pos1.snapshot > pos2.p.snapshot) { + /* + * We overwrote the first extent in pos2's snapshot: + */ + ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); + } else { + /* + * We overwrote the second extent - restart + * check_extent() from the top: + */ ret = -BCH_ERR_transaction_restart_nested; + } } fsck_err: err: @@ -1253,6 +1274,7 @@ static int check_overlapping_extents(struct btree_trans *trans, SPOS(iter->pos.inode, i->offset, i->snapshot), + &i->seen, *k.k, fixed, i); if (ret) goto err; @@ -1267,6 +1289,28 @@ err: return ret; } +static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc_is_encoded(crc) && + 
crc.uncompressed_size > encoded_extent_max_sectors) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); + printbuf_exit(&buf); + } + + return 0; +} + static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, @@ -1303,7 +1347,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!i, c, + if (fsck_err_on(!i, c, extent_in_missing_inode, "extent in missing inode:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1311,7 +1355,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(i && !S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), c, + !S_ISLNK(i->inode.bi_mode), + c, extent_in_non_reg_inode, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, (printbuf_reset(&buf), @@ -1341,9 +1386,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, continue; if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && - !bkey_extent_is_reservation(k), c, + !bkey_extent_is_reservation(k), + c, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1371,9 +1417,7 @@ out: err: fsck_err: printbuf_exit(&buf); - - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; delete: ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); @@ -1388,7 +1432,7 @@ int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct extent_ends extent_ends; @@ -1397,26 +1441,49 @@ int bch2_check_extents(struct bch_fs *c) snapshots_seen_init(&s); extent_ends_init(&extent_ends); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); - check_extent(&trans, &iter, k, &w, &s, &extent_ends); + check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent_overbig(trans, &iter, k); })) ?: - check_i_sectors(&trans, &w); + check_i_sectors(trans, &w); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); inode_walker_exit(&w); - bch2_trans_exit(&trans); snapshots_seen_exit(&s); + bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_indirect_extents(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct disk_reservation res = { 0 }; + int ret = 0; + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_PREFETCH, k, + &res, NULL, + 
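/*
 * Sketch of the predicate check_extent_overbig() applies above:
 * c->opts.encoded_extent_max is expressed in bytes, while crc sizes
 * are in 512-byte sectors, hence the >> 9 conversion. The helper name
 * is hypothetical, and the reading of crc_is_encoded() (compressed
 * and/or checksummed, so the extent must be decoded whole) is an
 * assumption from context:
 */
static bool extent_crc_overbig(struct bch_extent_crc_unpacked crc,
			       unsigned encoded_extent_max_bytes)
{
	unsigned max_sectors = encoded_extent_max_bytes >> 9;

	return crc_is_encoded(crc) && crc.uncompressed_size > max_sectors;
}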
BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + bch2_trans_put(trans); + + bch_err_fn(c, ret); return ret; } @@ -1444,21 +1511,19 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) continue; } - if (fsck_err_on(i->inode.bi_nlink != i->count, c, + if (fsck_err_on(i->inode.bi_nlink != i->count, + c, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); + ret = fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } } fsck_err: - if (ret) - bch_err_fn(c, ret); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + bch_err_fn(c, ret); + return ret ?: trans_was_restarted(trans, restart_count); } static int check_dirent_target(struct btree_trans *trans, @@ -1469,8 +1534,8 @@ static int check_dirent_target(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_dirent *n; - bool backpointer_exists = true; struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; int ret = 0; if (!target->bi_dir && @@ -1484,34 +1549,47 @@ static int check_dirent_target(struct btree_trans *trans, } if (!inode_points_to_dirent(target, d)) { - ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); - if (ret < 0) + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - backpointer_exists = ret; + bool backpointer_exists = !ret; ret = 0; - if (fsck_err_on(S_ISDIR(target->bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target->bi_inum)) { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, + c, inode_dir_multiple_links, + "directory %llu:%u with multiple links\n%s", + target->bi_inum, target_snapshot, buf.buf)) { ret = __remove_dirent(trans, d.k->p); goto out; } - if (fsck_err_on(backpointer_exists && - !target->bi_nlink, c, - "inode %llu type %s has multiple links but i_nlink 0", - target->bi_inum, bch2_d_types[d.v->d_type])) { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_UNLINKED; + target->bi_flags &= ~BCH_INODE_unlinked; ret = __write_inode(trans, target, target_snapshot); if (ret) goto err; } - if (fsck_err_on(!backpointer_exists, c, + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, "inode %llu:%u has wrong backpointer:\n" "got %llu:%llu\n" "should be %llu:%llu", @@ -1529,7 +1607,8 @@ static int check_dirent_target(struct btree_trans *trans, } } - if (fsck_err_on(d.v->d_type != inode_d_type(target), c, + if (fsck_err_on(d.v->d_type != inode_d_type(target), + c, dirent_d_type_wrong, "incorrect d_type: got %s, 
should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), @@ -1553,7 +1632,8 @@ static int check_dirent_target(struct btree_trans *trans, if (d.v->d_type == DT_SUBVOL && target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && (c->sb.version < bcachefs_metadata_version_subvol_dirent || - fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + fsck_err(c, dirent_d_parent_subvol_wrong, + "dirent has wrong d_parent_subvol field: got %u, should be %u", le32_to_cpu(d.v->d_parent_subvol), target->bi_parent_subvol))) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); @@ -1573,10 +1653,9 @@ static int check_dirent_target(struct btree_trans *trans, out: err: fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); - - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1627,7 +1706,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; - if (fsck_err_on(!i, c, + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1639,7 +1718,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (!i) goto out; - if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), + c, dirent_in_non_dir_inode, "dirent in non directory inode type %s:\n%s", bch2_d_type_str(inode_d_type(&i->inode)), (printbuf_reset(&buf), @@ -1673,7 +1753,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - if (fsck_err_on(ret, c, + if (fsck_err_on(ret, c, dirent_to_missing_subvol, "dirent points to missing subvolume %u", le32_to_cpu(d.v->d_child_subvol))) { ret = __remove_dirent(trans, d.k->p); @@ -1685,7 +1765,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - if (fsck_err_on(ret, c, + if (fsck_err_on(ret, c, subvol_to_missing_root, "subvolume %u points to missing subvolume root %llu", target_subvol, target_inum)) { @@ -1694,7 +1774,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, + c, subvol_root_wrong_bi_subvol, "subvol root %llu has wrong bi_subvol field: got %u, should be %u", target_inum, subvol_root.bi_subvol, target_subvol)) { @@ -1713,7 +1794,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (fsck_err_on(!target->inodes.nr, c, + if (fsck_err_on(!target->inodes.nr, + c, dirent_to_missing_inode, "dirent points to missing inode: (equiv %u)\n%s", equiv.snapshot, (printbuf_reset(&buf), @@ -1740,9 +1822,7 @@ out: err: fsck_err: printbuf_exit(&buf); - - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1756,29 +1836,26 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; snapshots_seen_init(&s); - bch2_trans_init(&trans, 
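/*
 * Sketch of the fsck_err calling convention this patch migrates every
 * check to: an enumerated error identifier now sits between the
 * bch_fs pointer and the format string. Old form:
 *
 *	fsck_err_on(cond, c, "message", ...);
 *
 * New form, with the identifier taken from the dirent checks above
 * (cond, got_str, want_str and do_repair() are placeholders):
 */
if (fsck_err_on(cond, c, dirent_d_type_wrong,
		"incorrect d_type: got %s, should be %s",
		got_str, want_str))
	do_repair();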
c, BTREE_ITER_MAX, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); + BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1804,7 +1881,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); inode->first_this_inode = false; - if (fsck_err_on(!i, c, + if (fsck_err_on(!i, c, xattr_in_missing_inode, "xattr for missing inode %llu", k.k->p.inode)) return bch2_btree_delete_at(trans, iter, 0); @@ -1814,8 +1891,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1826,25 +1902,19 @@ int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_xattr(&trans, &iter, k, &hash_info, &inode)); - - bch2_trans_exit(&trans); - - if (ret) - bch_err_fn(c, ret); + BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + check_xattr(trans, &iter, k, &hash_info, &inode))); + bch_err_fn(c, ret); return ret; } @@ -1860,7 +1930,8 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + if (mustfix_fsck_err_on(ret, c, root_subvol_missing, + "root subvol missing")) { struct bkey_i_subvolume root_subvol; snapshot = U32_MAX; @@ -1872,14 +1943,13 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(trans, BTREE_ID_subvolumes, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, + bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0)); - if (ret) { - bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "writing root subvol"); + if (ret) goto err; - } } @@ -1887,16 +1957,17 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, "root directory missing") || - mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + if (mustfix_fsck_err_on(ret, c, root_dir_missing, + "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), + c, root_inode_not_dir, "root inode not a directory")) { bch2_inode_init(c, 
&root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode.bi_inum = inum; ret = __write_inode(trans, &root_inode, snapshot); - if (ret) - bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "writing root inode"); } err: fsck_err: @@ -1909,12 +1980,10 @@ int bch2_check_root(struct bch_fs *c) int ret; ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - check_root_trans(&trans)); - - if (ret) - bch_err_fn(c, ret); + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, + check_root_trans(trans)); + bch_err_fn(c, ret); return ret; } @@ -1995,7 +2064,8 @@ static int check_path(struct btree_trans *trans, } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + if (fsck_err(c, inode_unreachable, + "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", inode->bi_inum, snapshot, bch2_d_type_str(inode_d_type(inode)), inode->bi_nlink, @@ -2035,12 +2105,13 @@ static int check_path(struct btree_trans *trans, pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); - if (!fsck_err(c, "directory structure loop")) + if (!fsck_err(c, dir_loop, + "directory structure loop")) return 0; ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, remove_backpointer(trans, inode)); if (ret) { bch_err(c, "error removing dirent: %i", ret); @@ -2051,8 +2122,7 @@ static int check_path(struct btree_trans *trans, } } fsck_err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -2063,16 +2133,14 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -2086,24 +2154,20 @@ int bch2_check_directory_structure(struct bch_fs *c) break; } - if (u.bi_flags & BCH_INODE_UNLINKED) + if (u.bi_flags & BCH_INODE_unlinked) continue; - ret = check_path(&trans, &path, &u, iter.pos.snapshot); + ret = check_path(trans, &path, &u, iter.pos.snapshot); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); darray_exit(&path); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } -/* check_nlink pass: */ - struct nlink_table { size_t nr; size_t size; @@ -2150,7 +2214,7 @@ static int nlink_cmp(const void *_l, const void *_r) const struct nlink *l = _l; const struct nlink *r = _r; - return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + return cmp_int(l->inum, r->inum); } static void inc_link(struct bch_fs *c, struct snapshots_seen *s, @@ -2185,15 +2249,13 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| 
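/*
 * Sketch of the error-logging idiom used throughout this patch:
 * bch_err_msg() and bch_err_fn() replace open-coded
 * "if (ret) bch_err(c, ..., bch2_err_str(ret))" blocks. Both are
 * understood to be no-ops when ret == 0 (bch_err_fn() also prints the
 * calling function's name), so the surrounding if (ret) tests go away:
 */

/* before: */
if (ret)
	bch_err(c, "error writing root inode: %s", bch2_err_str(ret));

/* after: */
bch_err_msg(c, ret, "writing root inode");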
BTREE_ITER_PREFETCH| @@ -2222,8 +2284,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, } } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (ret) bch_err(c, "error in fsck: btree error %i while walking inodes", ret); @@ -2235,7 +2297,7 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; struct btree_iter iter; struct bkey_s_c k; @@ -2244,9 +2306,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -2266,12 +2326,12 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); - bch2_trans_exit(&trans); + bch2_trans_put(trans); snapshots_seen_exit(&s); return ret; } @@ -2306,7 +2366,8 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite link = &links->d[++*idx]; } - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, + c, inode_wrong_nlink, "inode %llu type %s has wrong i_nlink (%u, should be %u)", u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { @@ -2322,24 +2383,19 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; size_t idx = 0; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); - - bch2_trans_exit(&trans); - + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); return ret; } @@ -2376,9 +2432,7 @@ int bch2_check_nlinks(struct bch_fs *c) } while (next_iter_range_start != U64_MAX); kvfree(links.d); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -2419,14 +2473,12 @@ int bch2_fix_reflink_p(struct bch_fs *c) return 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter, k))); - - if (ret) - bch_err_fn(c, ret); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + 
fix_reflink_p_key(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h index 90c87b5..da991e8 100644 --- a/libbcachefs/fsck.h +++ b/libbcachefs/fsck.h @@ -4,6 +4,7 @@ int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); +int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index fea21e1..b9d6dbf 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -6,11 +6,13 @@ #include "bkey_methods.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "error.h" #include "extents.h" #include "extent_update.h" #include "inode.h" #include "str_hash.h" +#include "snapshot.h" #include "subvolume.h" #include "varint.h" @@ -18,13 +20,18 @@ #include -const char * const bch2_inode_opts[] = { #define x(name, ...) #name, +const char * const bch2_inode_opts[] = { BCH_INODE_OPTS() -#undef x NULL, }; +static const char * const bch2_inode_flag_strs[] = { + BCH_INODE_FLAGS() + NULL +}; +#undef x + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -119,8 +126,7 @@ static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; - int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), - &unpacked); + ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); @@ -317,7 +323,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -int bch2_inode_peek(struct btree_trans *trans, +static int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) @@ -351,9 +357,20 @@ err: return ret; } -int bch2_inode_write(struct btree_trans *trans, +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); + bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + return ret; +} + +int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + enum btree_update_flags flags) { struct bkey_inode_buf *inode_p; @@ -363,7 +380,7 @@ int bch2_inode_write(struct btree_trans *trans, bch2_inode_pack_inlined(inode_p, inode); inode_p->inode.k.p.snapshot = iter->snapshot; - return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) @@ -387,117 +404,121 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bch_inode_unpacked unpacked; + int ret = 0; - if (k.k->p.inode) { - prt_printf(err, "nonzero k.p.inode"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->p.inode, c, err, + inode_pos_inode_nonzero, 
+ "nonzero k.p.inode"); - if (k.k->p.offset < BLOCKDEV_INODE_MAX) { - prt_printf(err, "fs inode in blockdev range"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, + inode_pos_blockdev_range, + "fs inode in blockdev range"); - if (bch2_inode_unpack(k, &unpacked)) { - prt_printf(err, "invalid variable length fields"); - return -BCH_ERR_invalid_bkey; - } - - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { - prt_printf(err, "invalid data checksum type (%u >= %u", - unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, + inode_unpack_error, + "invalid variable length fields"); - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { - prt_printf(err, "invalid data checksum type (%u >= %u)", - unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, + inode_checksum_type_invalid, + "invalid data checksum type (%u >= %u)", + unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) { - prt_printf(err, "flagged as unlinked but bi_nlink != 0"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(unpacked.bi_compression && + !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, + inode_compression_type_invalid, + "invalid compression opt %u", unpacked.bi_compression - 1); - if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { - prt_printf(err, "subvolume root but not a directory"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && + unpacked.bi_nlink != 0, c, err, + inode_unlinked_but_nlink_nonzero, + "flagged as unlinked but bi_nlink != 0"); - return 0; + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, + inode_subvol_root_but_not_dir, + "subvolume root but not a directory"); +fsck_err: + return ret; } -int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + int ret = 0; - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - prt_printf(err, "invalid str hash type (%llu >= %u)", - INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - return __bch2_inode_invalid(k, err); + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; } -int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + int ret = 0; - if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - prt_printf(err, "invalid str hash type (%llu >= %u)", - INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - return __bch2_inode_invalid(k, err); + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; } -int 
bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + int ret = 0; - if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { - prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", - INODEv3_FIELDS_START(inode.v), - INODEv3_FIELDS_START_INITIAL, - bkey_val_u64s(inode.k)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, + inode_v3_fields_start_bad, + "invalid fields_start (got %llu, min %u max %zu)", + INODEv3_FIELDS_START(inode.v), + INODEv3_FIELDS_START_INITIAL, + bkey_val_u64s(inode.k)); - if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - prt_printf(err, "invalid str hash type (%llu >= %u)", - INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - return __bch2_inode_invalid(k, err); + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; } static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", - inode->bi_mode, inode->bi_flags, + prt_printf(out, "mode=%o ", inode->bi_mode); + + prt_str(out, "flags="); + prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); + prt_printf(out, " (%x)", inode->bi_flags); + + prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", inode->bi_journal_seq, inode->bi_size, inode->bi_sectors, inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name " %llu", (u64) inode->_name); + prt_printf(out, " "#_name "=%llu", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x } @@ -520,23 +541,25 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c __bch2_inode_unpacked_to_text(out, &inode); } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline u64 bkey_inode_flags(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_inode: - return bkey_s_c_to_inode(k).v->bi_flags & - cpu_to_le32(BCH_INODE_UNLINKED); + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); case KEY_TYPE_inode_v2: - return bkey_s_c_to_inode_v2(k).v->bi_flags & - cpu_to_le32(BCH_INODE_UNLINKED); + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); case KEY_TYPE_inode_v3: - return bkey_s_c_to_inode_v3(k).v->bi_flags & - cpu_to_le64(BCH_INODE_UNLINKED); + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); default: - return false; + return 0; } } +static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +{ + return bkey_inode_flags(k) & BCH_INODE_unlinked; +} + int bch2_trans_mark_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, @@ -598,16 +621,17 @@ int bch2_mark_inode(struct btree_trans *trans, return 0; } -int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (k.k->p.inode) { - prt_printf(err, "nonzero 
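/*
 * Shape of the converted validators, sketched with a single check:
 * bkey_fsck_err_on() prints into err, records the error in ret, and
 * jumps to the local fsck_err label, so each *_invalid() function now
 * declares "int ret = 0;" and ends at the label instead of returning
 * -BCH_ERR_invalid_bkey directly. example_key_invalid() is a
 * hypothetical name:
 */
static int example_key_invalid(struct bch_fs *c, struct bkey_s_c k,
			       struct printbuf *err)
{
	int ret = 0;

	bkey_fsck_err_on(k.k->p.inode, c, err,
			 inode_pos_inode_nonzero,
			 "nonzero k.p.inode");
fsck_err:
	return ret;
}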
k.p.inode"); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(k.k->p.inode, c, err, + inode_pos_inode_nonzero, + "nonzero k.p.inode"); +fsck_err: + return ret; } void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, @@ -768,6 +792,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct bkey_i delete; + struct bpos end = POS(inum.inum, U64_MAX); u32 snapshot; int ret = 0; @@ -776,7 +801,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_INTENT); while (1) { bch2_trans_begin(trans); @@ -787,7 +812,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + k = bch2_btree_iter_peek_upto(&iter, end); ret = bkey_err(k); if (ret) goto err; @@ -798,9 +823,14 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; @@ -812,7 +842,7 @@ err: int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; @@ -820,8 +850,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) u32 snapshot; int ret; - bch2_trans_init(&trans, c, 0, 1024); - /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -830,19 +858,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); + ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), BTREE_ITER_INTENT|BTREE_ITER_CACHED); ret = bkey_err(k); @@ -850,7 +878,7 @@ retry: goto err; if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(trans.c, + bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); ret = -EIO; @@ -863,15 +891,28 @@ retry: delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + 
BCH_TRANS_COMMIT_no_enospc); err: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); + return ret; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -892,13 +933,13 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inum, inode)); + bch2_inode_find_by_inum_trans(trans, inum, inode)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; + if (bi->bi_flags & BCH_INODE_unlinked) + bi->bi_flags &= ~BCH_INODE_unlinked; else { if (bi->bi_nlink == U32_MAX) return -EINVAL; @@ -911,13 +952,13 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) { - if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", bi->bi_inum); return; } - if (bi->bi_flags & BCH_INODE_UNLINKED) { + if (bi->bi_flags & BCH_INODE_unlinked) { bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); return; } @@ -925,7 +966,7 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked * if (bi->bi_nlink) bi->bi_nlink--; else - bi->bi_flags |= BCH_INODE_UNLINKED; + bi->bi_flags |= BCH_INODE_unlinked; } struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) @@ -950,6 +991,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; } +int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) +{ + struct bch_inode_unpacked inode; + int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); + + if (ret) + return ret; + + bch2_inode_opts_get(opts, trans->c, &inode); + return 0; +} + int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; @@ -1004,7 +1057,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1013,60 +1066,98 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } -static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) +static int may_delete_deleted_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos, + bool *need_another_pass) { struct bch_fs *c = trans->c; - struct btree_iter iter; + struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; int ret; - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) - return 0; - - if (!fsck_err_on(c->sb.clean, c, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, 
BTREE_ITER_CACHED); + k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) return ret; ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; if (fsck_err_on(!bkey_is_inode(k.k), c, + deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; ret = bch2_inode_unpack(k, &inode); if (ret) - goto err; + goto out; + + if (fsck_err_on(S_ISDIR(inode.bi_mode), c, + deleted_inode_is_dir, + "directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, + deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; - return 1; -err: + if (c->sb.clean && + !fsck_err(c, + deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; + goto out; + } + + if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { + struct bpos new_min_pos; + + ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); + if (ret) + goto out; + + inode.bi_flags &= ~BCH_INODE_unlinked; + + ret = bch2_inode_write_flags(trans, &inode_iter, &inode, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch_err_msg(c, ret, "clearing inode unlinked flag"); + if (ret) + goto out; + + /* + * We'll need another write buffer flush to pick up the new + * unlinked inodes in the snapshot leaves: + */ + *need_another_pass = true; + goto out; + } + + ret = 1; +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + goto out; } int bch2_delete_dead_inodes(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; + bool need_another_pass; int ret; +again: + need_another_pass = false; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush_sync(&trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1076,26 +1167,34 @@ int bch2_delete_dead_inodes(struct bch_fs *c) * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); if (ret < 0) break; if (ret) { if (!test_bit(BCH_FS_RW, &c->flags)) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); } - ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); + bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); + + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); + + if (!ret && need_another_pass) + goto again; err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } diff --git 
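/*
 * Skeleton of the retry loop bch2_delete_dead_inodes() gains above.
 * may_delete_deleted_inode() sets *need_another_pass after it
 * propagates an unlinked inode to snapshot leaves; the write buffer
 * flush at the top of the loop makes those new deleted_inodes entries
 * visible before the btree is walked again (simplified, error paths
 * elided):
 */
again:
	need_another_pass = false;

	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	/* walk BTREE_ID_deleted_inodes, calling bch2_inode_rm_snapshot() */

	if (!ret && need_another_pass)
		goto again;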
a/libbcachefs/inode.h b/libbcachefs/inode.h index 22b2440..88818a3 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -3,16 +3,17 @@ #define _BCACHEFS_INODE_H #include "bkey.h" +#include "bkey_methods.h" #include "opts.h" enum bkey_invalid_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -52,7 +53,7 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -101,8 +102,16 @@ void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *) int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); -int bch2_inode_write(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *); + +int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, enum btree_update_flags); + +static inline int bch2_inode_write(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode) +{ + return bch2_inode_write_flags(trans, iter, inode, 0); +} void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); @@ -118,6 +127,9 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, @@ -174,7 +186,7 @@ static inline unsigned nlink_bias(umode_t mode) static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) { - return bi->bi_flags & BCH_INODE_UNLINKED + return bi->bi_flags & BCH_INODE_unlinked ? 
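/*
 * Usage sketch for the split above: bch2_inode_write() keeps its old
 * behaviour as an inline wrapper passing flags == 0, while callers
 * needing special update semantics use bch2_inode_write_flags()
 * directly, as may_delete_deleted_inode() does earlier in this patch:
 */
ret = bch2_inode_write(trans, &iter, &inode_u);

ret = bch2_inode_write_flags(trans, &iter, &inode_u,
			     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);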
0 : bi->bi_nlink + nlink_bias(bi->bi_mode); } @@ -184,10 +196,10 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, { if (nlink) { bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); - bi->bi_flags &= ~BCH_INODE_UNLINKED; + bi->bi_flags &= ~BCH_INODE_unlinked; } else { bi->bi_nlink = 0; - bi->bi_flags |= BCH_INODE_UNLINKED; + bi->bi_flags |= BCH_INODE_unlinked; } } @@ -197,6 +209,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); +int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/libbcachefs/io.c b/libbcachefs/io.c deleted file mode 100644 index 5bacc6a..0000000 --- a/libbcachefs/io.c +++ /dev/null @@ -1,3059 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Some low level IO code, and hacks for various block layer limitations - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "compress.h" -#include "clock.h" -#include "data_update.h" -#include "debug.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "extent_update.h" -#include "inode.h" -#include "io.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include -#include - -const char *bch2_blk_status_to_str(blk_status_t status) -{ - if (status == BLK_STS_REMOVED) - return "device removed"; - return blk_status_to_str(status); -} - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - rcu_read_lock(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - rcu_read_unlock(); - - return bch2_rand_range(nr * CONGESTED_MAX) < total; -} - -static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, - u64 now, int rw) -{ - u64 latency_capable = - ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; - /* ideally we'd be taking into account the device's variance here: */ - u64 latency_threshold = latency_capable << (rw == READ ? 
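/*
 * Worked example for bch2_inode_nlink_get()/_set() above, assuming
 * the usual bias of 2 for directories and 1 for everything else
 * (nlink_bias() itself is outside this hunk, so treat that as an
 * assumption):
 *
 *	regular file, 3 links:	bi_nlink = 3 - 1 = 2, unlinked clear
 *	regular file, 1 link:	bi_nlink = 0,         unlinked clear
 *	unlinked (0 links):	bi_nlink = 0,         BCH_INODE_unlinked set
 *
 * The get/set round trip holds for any bias value:
 */
bch2_inode_nlink_set(&bi, 3);
BUG_ON(bch2_inode_nlink_get(&bi) != 3);

bch2_inode_nlink_set(&bi, 0);	/* sets BCH_INODE_unlinked */
BUG_ON(bch2_inode_nlink_get(&bi) != 0);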
2 : 3); - s64 latency_over = io_latency - latency_threshold; - - if (latency_threshold && latency_over > 0) { - /* - * bump up congested by approximately latency_over * 4 / - * latency_threshold - we don't need much accuracy here so don't - * bother with the divide: - */ - if (atomic_read(&ca->congested) < CONGESTED_MAX) - atomic_add(latency_over >> - max_t(int, ilog2(latency_threshold) - 2, 0), - &ca->congested); - - ca->congested_last = now; - } else if (atomic_read(&ca->congested) > 0) { - atomic_dec(&ca->congested); - } -} - -void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -{ - atomic64_t *latency = &ca->cur_latency[rw]; - u64 now = local_clock(); - u64 io_latency = time_after64(now, submit_time) - ? now - submit_time - : 0; - u64 old, new, v = atomic64_read(latency); - - do { - old = v; - - /* - * If the io latency was reasonably close to the current - * latency, skip doing the update and atomic operation - most of - * the time: - */ - if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0U << 5)) - break; - - new = ewma_add(old, io_latency, 5); - } while ((v = atomic64_cmpxchg(latency, old, new)) != old); - - bch2_congested_acct(ca, io_latency, now, rw); - - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); -} - -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - -#endif - -/* Allocate, free from mempool: */ - -void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -{ - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); - bio->bi_vcnt = 0; -} - -static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -{ - struct page *page; - - if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) { - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - goto pool_alloc; - - } - } else { -pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); - } - - return page; -} - -void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t size) -{ - bool using_mempool = false; - - while (size) { - struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - BUG_ON(!bio_add_page(bio, page, len, 0)); - size -= len; - } - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); -} - -/* Extent update path: */ - -int bch2_sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *usage_increasing, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); - bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); - int ret = 0; - - *usage_increasing = false; - *i_sectors_delta = 0; - *disk_sectors_delta = 0; - - bch2_trans_copy_iter(&iter, extent_iter); - - for_each_btree_key_upto_continue_norestart(iter, - new->k.p, BTREE_ITER_SLOTS, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k)); - - *i_sectors_delta += sectors * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); - *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot - ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) - : 0; - - if (!*usage_increasing && - (new->k.p.snapshot != old.k->p.snapshot || - new_replicas > bch2_bkey_replicas(c, old) || - (!new_compressed && bch2_bkey_sectors_compressed(old)))) - *usage_increasing = true; - - if (bkey_ge(old.k->p, new->k.p)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - struct btree_iter *extent_iter, - u64 new_i_size, - s64 i_sectors_delta) -{ - struct btree_iter iter; - struct bkey_i *k; - struct bkey_i_inode_v3 *inode; - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; - int ret; - - k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - BTREE_ITER_CACHED); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { - k = bch2_inode_to_v3(trans, k); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - goto err; - } - - inode = bkey_i_to_inode_v3(k); - - if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > le64_to_cpu(inode->v.bi_size)) { - inode->v.bi_size = cpu_to_le64(new_i_size); - inode_update_flags = 0; - } - - if (i_sectors_delta) { - le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); - inode_update_flags = 0; - } - - if (inode->k.p.snapshot != iter.snapshot) { - inode->k.p.snapshot = iter.snapshot; - inode_update_flags = 0; - } - - ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - inode_update_flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_extent_update(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 new_i_size, - s64 *i_sectors_delta_total, - bool check_enospc) -{ - struct bpos next_pos; - bool usage_increasing; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - int ret; - - /* - * This traverses us the iterator without changing iter->path->pos to - * search_key() (which is pos + 1 for extents): we want there to be a - * path already traversed at iter->pos because - * bch2_trans_extent_update() will use it to attempt extent merging - */ - ret = __bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - ret = bch2_extent_trim_atomic(trans, iter, k); - if (ret) - return ret; - - next_pos = k->k.p; - - ret = bch2_sum_sector_overwrites(trans, iter, k, - &usage_increasing, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - return ret; - - if (disk_res && - disk_sectors_delta > (s64) disk_res->sectors) { - ret = bch2_disk_reservation_add(trans->c, disk_res, - disk_sectors_delta - disk_res->sectors, - !check_enospc || !usage_increasing - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - return ret; - } - - /* - * Note: - * We always have to do an inode update - even when i_size/i_sectors - * aren't changing - for fsync to work properly; fsync relies on - * inode->bi_journal_seq which is updated by the trigger code: - */ - ret = bch2_extent_update_i_size_sectors(trans, iter, - min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: - bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL); - if (unlikely(ret)) - return ret; - - if (i_sectors_delta_total) - *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(iter, next_pos); - return 0; -} - -/* Overwrites whatever was present with zeroes: */ -int bch2_extent_fallocate(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - unsigned sectors, - struct bch_io_opts opts, - s64 *i_sectors_delta, - struct write_point_specifier write_point) -{ - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = { 0 }; - struct closure cl; - struct open_buckets open_buckets; - struct bkey_s_c k; - struct bkey_buf old, new; - unsigned sectors_allocated; - bool have_reservation = false; - bool unwritten = opts.nocow && - c->sb.version >= bcachefs_metadata_version_unwritten_extents; - int ret; - - bch2_bkey_buf_init(&old); - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - open_buckets.nr = 0; -retry: - sectors_allocated = 0; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); - - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto out; - - bch2_bkey_buf_reassemble(&old, c, k); - } - - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto out; - - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { - struct bkey_i_reservation *reservation; - - bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); - reservation = bkey_reservation_init(new.k); - reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; - } else { - struct bkey_i_extent *e; - struct bch_devs_list devs_have; - struct write_point *wp; - struct bch_extent_ptr *ptr; - - devs_have.nr = 0; - - bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); - - e = bkey_extent_init(new.k); - e->k.p = iter->pos; - - ret = bch2_alloc_sectors_start_trans(trans, - opts.foreground_target, - false, - write_point, - &devs_have, - opts.data_replicas, - opts.data_replicas, - BCH_WATERMARK_normal, 0, &cl, &wp); - if (ret) { - bch2_trans_unlock(trans); - closure_sync(&cl); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - goto retry; - return ret; - } - - sectors = min(sectors, wp->sectors_free); - sectors_allocated = sectors; - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - } - - have_reservation = true; - - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); -out: - if 
((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto retry; - } - - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); - - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); - bch2_bkey_buf_exit(&new, c); - bch2_bkey_buf_exit(&old, c); - - return ret; -} - -/* - * Returns -BCH_ERR_transaction_restart if we had to drop locks: - */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - subvol_inum inum, u64 end, - s64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bpos end_pos = POS(inum.inum, end); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - u32 snapshot; - - while (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - if (ret) - ret2 = ret; - - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(iter, snapshot); - - /* - * peek_upto() doesn't have ideal semantics for extents: - */ - k = bch2_btree_iter_peek_upto(iter, end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans trans; - struct btree_iter iter; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_INTENT); - - ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - -static int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct bkey_buf sk; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; - struct btree_iter iter; - subvol_inum inum = { - .subvol = op->subvol, - .inum = k->k.p.inode, - }; - int ret; - - BUG_ON(!inum.subvol); - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - do { - bch2_trans_begin(&trans); - - k = bch2_keylist_front(keys); - bch2_bkey_buf_copy(&sk, c, k); - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, - &sk.k->k.p.snapshot); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - bkey_start_pos(&sk.k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - ret = bch2_extent_update(&trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); - bch2_trans_iter_exit(&trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_ge(iter.pos, k->k.p)) - bch2_keylist_pop_front(&op->insert_keys); - else 
- bch2_cut_front(iter.pos, k); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - return ret; -} - -/* Writes */ - -void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - enum bch_data_type type, - const struct bkey_i *k, - bool nocow) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - const struct bch_extent_ptr *ptr; - struct bch_write_bio *n; - struct bch_dev *ca; - - BUG_ON(c->opts.nochanges); - - bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || - !c->devs[ptr->dev]); - - ca = bch_dev_bkey_exists(c, ptr->dev); - - if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOFS, &ca->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; - n->bio.bi_private = wbio->bio.bi_private; - n->parent = wbio; - n->split = true; - n->bounce = false; - n->put_bio = true; - n->bio.bi_opf = wbio->bio.bi_opf; - bio_inc_remaining(&wbio->bio); - } else { - n = wbio; - n->split = false; - } - - n->c = c; - n->dev = ptr->dev; - n->have_ioref = nocow || bch2_dev_get_ioref(ca, - type == BCH_DATA_btree ? READ : WRITE); - n->nocow = nocow; - n->submit_time = local_clock(); - n->inode_offset = bkey_start_offset(&k->k); - n->bio.bi_iter.bi_sector = ptr->offset; - - if (likely(n->have_ioref)) { - this_cpu_add(ca->io_done->sectors[WRITE][type], - bio_sectors(&n->bio)); - - bio_set_dev(&n->bio, ca->disk_sb.bdev); - - if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { - bio_endio(&n->bio); - continue; - } - - submit_bio(&n->bio); - } else { - n->bio.bi_status = BLK_STS_REMOVED; - bio_endio(&n->bio); - } - } -} - -static void __bch2_write(struct bch_write_op *); - -static void bch2_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - - bch2_disk_reservation_put(c, &op->res); - if (!(op->flags & BCH_WRITE_MOVE)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - - EBUG_ON(cl->parent); - closure_debug_destroy(cl); - if (op->end_io) - op->end_io(op); -} - -static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) -{ - struct keylist *keys = &op->insert_keys; - struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n; - - for (src = keys->keys; src != keys->top; src = n) { - n = bkey_next(src); - - if (bkey_extent_is_direct_data(&src->k)) { - bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, - test_bit(ptr->dev, op->failed.d)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; - } - - if (dst != src) - memmove_u64s_down(dst, src, src->k.u64s); - dst = bkey_next(dst); - } - - keys->top = dst; - return 0; -} - -/** - * bch_write_index - after a write, update index to point to new data - */ -static void __bch2_write_index(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k; - unsigned dev; - int ret = 0; - - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; - } - - /* - * probably not the ideal place to hook this in, but I don't - * particularly want to plumb io_opts all the way through the btree - * update stack right now - */ - for_each_keylist_key(keys, k) - bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = 
keylist_sectors(keys); - - ret = !(op->flags & BCH_WRITE_MOVE) - ? bch2_write_index_default(op) - : bch2_data_update_index_update(op); - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - BUG_ON(keylist_sectors(keys) && !ret); - - op->written += sectors_start - keylist_sectors(keys); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); - - bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, - "write error while doing btree update: %s", - bch2_err_str(ret)); - } - - if (ret) - goto err; - } -out: - /* If a bucket wasn't written, we can't erasure code it: */ - for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); - - bch2_open_buckets_put(c, &op->open_buckets); - return; -err: - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_DONE; - goto out; -} - -static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) -{ - if (state != wp->state) { - u64 now = ktime_get_ns(); - - if (wp->last_state_change && - time_after64(now, wp->last_state_change)) - wp->time[wp->state] += now - wp->last_state_change; - wp->state = state; - wp->last_state_change = now; - } -} - -static inline void wp_update_state(struct write_point *wp, bool running) -{ - enum write_point_state state; - - state = running ? WRITE_POINT_running : - !list_empty(&wp->writes) ? WRITE_POINT_waiting_io - : WRITE_POINT_stopped; - - __wp_update_state(wp, state); -} - -static void bch2_write_index(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct write_point *wp = op->wp; - struct workqueue_struct *wq = index_update_wq(op); - unsigned long flags; - - if ((op->flags & BCH_WRITE_DONE) && - (op->flags & BCH_WRITE_MOVE)) - bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - - spin_lock_irqsave(&wp->writes_lock, flags); - if (wp->state == WRITE_POINT_waiting_io) - __wp_update_state(wp, WRITE_POINT_waiting_work); - list_add_tail(&op->wp_list, &wp->writes); - spin_unlock_irqrestore(&wp->writes_lock, flags); - - queue_work(wq, &wp->index_update_work); -} - -static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) -{ - op->wp = wp; - - if (wp->state == WRITE_POINT_stopped) { - spin_lock_irq(&wp->writes_lock); - __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock_irq(&wp->writes_lock); - } -} - -void bch2_write_point_do_index_updates(struct work_struct *work) -{ - struct write_point *wp = - container_of(work, struct write_point, index_update_work); - struct bch_write_op *op; - - while (1) { - spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); - wp_update_state(wp, op != NULL); - spin_unlock_irq(&wp->writes_lock); - - if (!op) - break; - - op->flags |= BCH_WRITE_IN_WORKER; - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_DONE)) - __bch2_write(op); - else - bch2_write_done(&op->cl); - } -} - -static void bch2_write_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; - struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; - } - - if (wbio->nocow) - set_bit(wbio->dev, op->devs_need_flush->d); - - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); - percpu_ref_put(&ca->io_ref); - } - - if (wbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - if (wbio->put_bio) - bio_put(bio); - - if (parent) - bio_endio(&parent->bio); - else - closure_put(cl); -} - -static void init_append_extent(struct bch_write_op *op, - struct write_point *wp, - struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - struct bkey_i_extent *e; - - op->pos.offset += crc.uncompressed_size; - - e = bkey_extent_init(op->insert_keys.top); - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.version = version; - - if (crc.csum_type || - crc.compression_type || - crc.nonce) - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); - - bch2_keylist_push(&op->insert_keys); -} - -static struct bio *bch2_write_bio_alloc(struct bch_fs *c, - struct write_point *wp, - struct bio *src, - bool *page_alloc_failed, - void *buf) -{ - struct bch_write_bio *wbio; - struct bio *bio; - unsigned output_available = - min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available + - (buf - ? ((unsigned long) buf & (PAGE_SIZE - 1)) - : 0), PAGE_SIZE); - - pages = min(pages, BIO_MAX_VECS); - - bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOFS, &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = src->bi_opf; - - if (buf) { - bch2_bio_map(bio, buf, output_available); - return bio; - } - - wbio->bounce = true; - - /* - * We can't use mempool for more than c->sb.encoded_extent_max - * worth of pages, but we'd like to allocate more if we can: - */ - bch2_bio_alloc_pages_pool(c, bio, - min_t(unsigned, output_available, - c->opts.encoded_extent_max)); - - if (bio->bi_iter.bi_size < output_available) - *page_alloc_failed = - bch2_bio_alloc_pages(bio, - output_available - - bio->bi_iter.bi_size, - GFP_NOFS) != 0; - - return bio; -} - -static int bch2_write_rechecksum(struct bch_fs *c, - struct bch_write_op *op, - unsigned new_csum_type) -{ - struct bio *bio = &op->wbio.bio; - struct bch_extent_crc_unpacked new_crc; - int ret; - - /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ - - if (bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)) - new_csum_type = op->crc.csum_type; - - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); - if (ret) - return ret; - - bio_advance(bio, op->crc.offset << 9); - bio->bi_iter.bi_size = op->crc.live_size << 9; - op->crc = new_crc; - return 0; -} - -static int bch2_write_decrypt(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this 
case) after - * it's decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -{ - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) - return PREP_ENCODED_OK; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - - /* Can we just write the entire extent as is? */ - if (op->crc.uncompressed_size == op->crc.live_size && - op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || - op->incompressible)) { - if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - - return PREP_ENCODED_DO_WRITE; - } - - /* - * If the data is compressed and we couldn't write the entire extent as - * is, we have to decompress it: - */ - if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if (bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) - return PREP_ENCODED_ERR; - } - - /* - * No longer have compressed data after this point - data might be - * encrypted: - */ - - /* - * If the data is checksummed and we're only writing a subset, - * rechecksum and adjust bio to point to currently live data: - */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - - /* - * If we want to compress the data, it has to be decrypted: - */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - - return PREP_ENCODED_OK; -} - -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - struct bio **_dst) -{ - struct bch_fs *c = op->c; - struct bio *src = &op->wbio.bio, *dst = src; - struct bvec_iter saved_iter; - void *ec_buf; - unsigned total_output = 0, total_input = 0; - bool bounce = false; - bool page_alloc_failed = false; - int ret, more = 0; - - BUG_ON(!bio_sectors(src)); - - ec_buf = bch2_writepoint_ec_buf(c, wp); - - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; - } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; - } - - if (ec_buf || - op->compression_opt || - (op->csum_type && - 
!(op->flags & BCH_WRITE_PAGES_STABLE)) || - (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } - - saved_iter = dst->bi_iter; - - do { - struct bch_extent_crc_unpacked crc = { 0 }; - struct bversion version = op->version; - size_t dst_len, src_len; - - if (page_alloc_failed && - dst->bi_iter.bi_size < (wp->sectors_free << 9) && - dst->bi_iter.bi_size < c->opts.encoded_extent_max) - break; - - BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - - crc.compression_type = op->incompressible - ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_opt - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_opt) - : 0; - if (!crc_is_compressed(crc)) { - dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); - - if (op->csum_type) - dst_len = min_t(unsigned, dst_len, - c->opts.encoded_extent_max); - - if (bounce) { - swap(dst->bi_iter.bi_size, dst_len); - bio_copy_data(dst, src); - swap(dst->bi_iter.bi_size, dst_len); - } - - src_len = dst_len; - } - - BUG_ON(!src_len || !dst_len); - - if (bch2_csum_type_is_encryption(op->csum_type)) { - if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version); - } else { - crc.nonce = op->nonce; - op->nonce += src_len >> 9; - } - } - - if ((op->flags & BCH_WRITE_DATA_ENCODED) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { - u8 compression_type = crc.compression_type; - u16 nonce = crc.nonce; - /* - * Note: when we're using rechecksum(), we need to be - * checksumming @src because it has all the data our - * existing checksum covers - if we bounced (because we - * were trying to compress), @dst will only have the - * part of the data the new checksum will cover. - * - * But normally we want to be checksumming post bounce, - * because part of the reason for bouncing is so the - * data can't be modified (by userspace) while it's in - * flight. - */ - if (bch2_rechecksum_bio(c, src, version, op->crc, - &crc, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->csum_type)) - goto csum_err; - /* - * rechecksum_bio sets compression_type on crc from op->crc, - * this isn't always correct as sometimes we're changing - * an extent from uncompressed to incompressible. 
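 *
 * (A worked example of the rechecksum split above, with a made-up
 * csum() standing in for the real checksum machinery: if op->crc
 * covers 8 sectors and this iteration consumes the first 2, the old
 * checksum is verified over all 8 sectors of @src, csum(sectors 0-2)
 * goes into the crc for the key being appended, and csum(sectors 2-8)
 * goes back into op->crc to cover the fragments still to be written -
 * so the existing checksum is verified exactly once and no data is
 * ever trusted unverified.)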
- */ - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && - bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->crc.csum_type)) - goto csum_err; - - crc.compressed_size = dst_len >> 9; - crc.uncompressed_size = src_len >> 9; - crc.live_size = src_len >> 9; - - swap(dst->bi_iter.bi_size, dst_len); - ret = bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - if (ret) - goto err; - - crc.csum = bch2_checksum_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - crc.csum_type = op->csum_type; - swap(dst->bi_iter.bi_size, dst_len); - } - - init_append_extent(op, wp, version, crc); - - if (dst != src) - bio_advance(dst, dst_len); - bio_advance(src, src_len); - total_output += dst_len; - total_input += src_len; - } while (dst->bi_iter.bi_size && - src->bi_iter.bi_size && - wp->sectors_free && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); - - more = src->bi_iter.bi_size != 0; - - dst->bi_iter = saved_iter; - - if (dst == src && more) { - BUG_ON(total_output != total_input); - - dst = bio_split(src, total_input >> 9, - GFP_NOFS, &c->bio_write); - wbio_init(dst)->put_bio = true; - /* copy WRITE_SYNC flag */ - dst->bi_opf = src->bi_opf; - } - - dst->bi_iter.bi_size = total_output; -do_write: - *_dst = dst; - return more; -csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); - ret = -EIO; -err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); - if (to_wbio(dst)->put_bio) - bio_put(dst); - - return ret; -} - -static bool bch2_extent_is_writeable(struct bch_write_op *op, - struct bkey_s_c k) -{ - struct bch_fs *c = op->c; - struct bkey_s_c_extent e; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - unsigned replicas = 0; - - if (k.k->type != KEY_TYPE_extent) - return false; - - e = bkey_s_c_to_extent(k); - extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.csum_type || - crc_is_compressed(p.crc) || - p.has_ec) - return false; - - replicas += bch2_extent_ptr_durability(c, &p); - } - - return replicas >= op->opts.data_replicas; -} - -static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - const struct bch_extent_ptr *ptr; - struct bkey_i *k; - - for_each_keylist_key(&op->insert_keys, k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - } -} - -static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *orig, - struct bkey_s_c k, - u64 new_i_size) -{ - struct bkey_i *new; - struct bkey_ptrs ptrs; - struct bch_extent_ptr *ptr; - int ret; - - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { - /* trace this */ - return 0; - } - - new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bch2_cut_front(bkey_start_pos(&orig->k), new); - bch2_cut_back(orig->k.p, new); - - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) - ptr->unwritten = 0; - - /* - * Note that we're not calling bch2_subvol_get_snapshot() in this path - - * that was done when we kicked off the write, and here it's important - * that we update the extent that we wrote to - even if a snapshot has 
- * since been created. The write is still outstanding, so we're ok - * w.r.t. snapshot atomicity: - */ - return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: - bch2_trans_update(trans, iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_i *orig; - struct bkey_s_c k; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, ({ - bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); - })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); - - bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, - "write error while doing btree update: %s", - bch2_err_str(ret)); - } - - if (ret) { - op->error = ret; - break; - } - } - - bch2_trans_exit(&trans); -} - -static void __bch2_nocow_write_done(struct bch_write_op *op) -{ - bch2_nocow_write_unlock(op); - - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) - bch2_nocow_write_convert_unwritten(op); -} - -static void bch2_nocow_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - - __bch2_nocow_write_done(op); - bch2_write_done(cl); -} - -static void bch2_nocow_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; - struct { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; - } buckets[BCH_REPLICAS_MAX]; - unsigned nr_buckets = 0; - u32 snapshot; - int ret, i; - - if (op->flags & BCH_WRITE_MOVE) - return; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); - if (unlikely(ret)) - goto err; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_SLOTS); - while (1) { - struct bio *bio = &op->wbio.bio; - - nr_buckets = 0; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - /* fall back to normal cow write path? 
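 * (i.e. the key's snapshot changed under us, or the extent is
 * checksummed, compressed, erasure coded or under-replicated - see
 * bch2_extent_is_writeable() above - so it can't be overwritten in
 * place)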
*/ - if (unlikely(k.k->p.snapshot != snapshot || - !bch2_extent_is_writeable(op, k))) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - k.k->u64s)) - break; - - /* Get iorefs before dropping btree locks: */ - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); - buckets[nr_buckets].gen = ptr->gen; - buckets[nr_buckets].l = - bucket_nocow_lock(&c->nocow_locks, - bucket_to_u64(buckets[nr_buckets].b)); - - prefetch(buckets[nr_buckets].l); - - if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) - goto err_get_ioref; - - nr_buckets++; - - if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; - } - - /* Unlock before taking nocow locks, doing IO: */ - bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(&trans); - - bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - for (i = 0; i < nr_buckets; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); - struct nocow_lock_bucket *l = buckets[i].l; - bool stale; - - __bch2_bucket_nocow_lock(&c->nocow_locks, l, - bucket_to_u64(buckets[i].b), - BUCKET_NOCOW_LOCK_UPDATE); - - rcu_read_lock(); - stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); - rcu_read_unlock(); - - if (unlikely(stale)) - goto err_bucket_stale; - } - - bio = &op->wbio.bio; - if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { - bio = bio_split(bio, k.k->p.offset - op->pos.offset, - GFP_KERNEL, &c->bio_write); - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { - op->flags |= BCH_WRITE_DONE; - } - - op->pos.offset += bio_sectors(bio); - op->written += bio_sectors(bio); - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - closure_get(&op->cl); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_DONE) - break; - bch2_btree_iter_advance(&iter); - } -out: - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s: btree lookup error %s", - __func__, bch2_err_str(ret)); - op->error = ret; - op->flags |= BCH_WRITE_DONE; - } - - bch2_trans_exit(&trans); - - /* fallback to cow write path? 
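 * (taken when the nocow attempt bailed out before BCH_WRITE_DONE was
 * set - e.g. the extent wasn't nocow-writeable, or we couldn't get
 * device io refs: the keys staged so far are discarded and
 * __bch2_write() falls through to the regular COW path)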
*/ - if (!(op->flags & BCH_WRITE_DONE)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl); - } else { - /* - * XXX - * needs to run out of process context because ei_quota_lock is - * a mutex - */ - continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); - } - return; -err_get_ioref: - for (i = 0; i < nr_buckets; i++) - percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); - - /* Fall back to COW path: */ - goto out; -err_bucket_stale: - while (--i >= 0) - bch2_bucket_nocow_unlock(&c->nocow_locks, - buckets[i].b, - BUCKET_NOCOW_LOCK_UPDATE); - for (i = 0; i < nr_buckets; i++) - percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); - - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; - goto out; -} - -static void __bch2_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct write_point *wp = NULL; - struct bio *bio = NULL; - unsigned nofs_flags; - int ret; - - nofs_flags = memalloc_nofs_save(); - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); - if (op->flags & BCH_WRITE_DONE) - goto out_nofs_restore; - } -again: - memset(&op->failed, 0, sizeof(op->failed)); - - do { - struct bkey_i *key_to_write; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - - /* +1 for possible cache device: */ - if (op->open_buckets.nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets.v)) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)) - break; - - /* - * The copygc thread is now global, which means it's no longer - * freeing up space on specific disks, which means that - * allocations for specific disks may hang arbitrarily long: - */ - ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_alloc_sectors_start_trans(&trans, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->watermark, - op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ? NULL : &op->cl, &wp)); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - - goto err; - } - - EBUG_ON(!wp); - - bch2_open_bucket_get(c, wp, &op->open_buckets); - ret = bch2_write_extent(op, wp, &bio); - - bch2_alloc_sectors_done_inlined(c, wp); -err: - if (ret <= 0) { - op->flags |= BCH_WRITE_DONE; - - if (ret < 0) { - op->error = ret; - break; - } - } - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - - closure_get(bio->bi_private); - - key_to_write = (void *) (op->insert_keys.keys_p + - key_to_write_offset); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write, false); - } while (ret); - - /* - * Sync or no? - * - * If we're running asynchronously, we may still want to block - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. 
- */ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_DONE) && - !(op->flags & BCH_WRITE_IN_WORKER))) { - closure_sync(&op->cl); - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_DONE)) - goto again; - bch2_write_done(&op->cl); - } else { - bch2_write_queue(op, wp); - continue_at(&op->cl, bch2_write_index, NULL); - } -out_nofs_restore: - memalloc_nofs_restore(nofs_flags); -} - -static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -{ - struct bio *bio = &op->wbio.bio; - struct bvec_iter iter; - struct bkey_i_inline_data *id; - unsigned sectors; - int ret; - - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - - ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_U64s + DIV_ROUND_UP(data_len, 8)); - if (ret) { - op->error = ret; - goto err; - } - - sectors = bio_sectors(bio); - op->pos.offset += sectors; - - id = bkey_inline_data_init(op->insert_keys.top); - id->k.p = op->pos; - id->k.version = op->version; - id->k.size = sectors; - - iter = bio->bi_iter; - iter.bi_size = data_len; - memcpy_from_bio(id->v.data, bio, iter); - - while (data_len & 7) - id->v.data[data_len++] = '\0'; - set_bkey_val_bytes(&id->k, data_len); - bch2_keylist_push(&op->insert_keys); - - __bch2_write_index(op); -err: - bch2_write_done(&op->cl); -} - -/** - * bch_write - handle a write to a cache device or flash only volume - * - * This is the starting point for any data to end up in a cache device; it could - * be from a normal write, or a writeback write, or a write to a flash only - * volume - it's also used by the moving garbage collector to compact data in - * mostly empty buckets. - * - * It first writes the data to the cache, creating a list of keys to be inserted - * (if the data won't fit in a single open bucket, there will be multiple keys); - * after the data is written it calls bch_journal, and after the keys have been - * added to the next journal write they're inserted into the btree. - * - * If op->discard is true, instead of inserting the data it invalidates the - * region of the cache represented by op->bio and op->inode. 
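 *
 * (Sizing note: sufficiently small writes may skip allocation
 * entirely - with the inline_data option enabled and a 4 KiB block
 * size, min(block_bytes(c) / 2, 1024U) = 1024, so up to 1 KiB of
 * data is stored inline in the bkey itself via
 * bch2_write_data_inline(); see the threshold check below.)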
- */ -void bch2_write(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; - struct bch_fs *c = op->c; - unsigned data_len; - - EBUG_ON(op->cl.parent); - BUG_ON(!op->nr_replicas); - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - - op->start_time = local_clock(); - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - - if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "misaligned write"); - op->error = -EIO; - goto err; - } - - if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; - goto err; - } - - if (!(op->flags & BCH_WRITE_MOVE) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; - goto err; - } - - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, - op->new_i_size - (op->pos.offset << 9)); - - if (c->opts.inline_data && - data_len <= min(block_bytes(c) / 2, 1024U)) { - bch2_write_data_inline(op, data_len); - return; - } - - __bch2_write(op); - return; -err: - bch2_disk_reservation_put(c, &op->res); - - closure_debug_destroy(&op->cl); - if (op->end_io) - op->end_io(op); -} - -static const char * const bch2_write_flags[] = { -#define x(f) #f, - BCH_WRITE_FLAGS() -#undef x - NULL -}; - -void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) -{ - prt_str(out, "pos: "); - bch2_bpos_to_text(out, op->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_str(out, "started: "); - bch2_pr_time_units(out, local_clock() - op->start_time); - prt_newline(out); - - prt_str(out, "flags: "); - prt_bitflags(out, bch2_write_flags, op->flags); - prt_newline(out); - - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -/* Cache promotion on read */ - -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ -}; - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), -}; - -static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags) -{ - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; - - if (!opts.promote_target) - return false; - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return false; - - if (bkey_extent_is_unwritten(k)) - return false; - - if (bch2_target_congested(c, opts.promote_target)) { - /* XXX trace this */ - return false; - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return false; - - return true; -} - -static void promote_free(struct bch_fs *c, struct promote_op *op) -{ - int ret; - - bch2_data_update_exit(&op->write); - - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; - - 
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); -} - -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -{ - struct bio *bio = &op->write.op.wbio.bio; - - trace_and_count(op->write.op.c, read_promote, &rbio->bio); - - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - - bch2_data_update_read_done(&op->write, rbio->pick.crc); -} - -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio) -{ - struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - int ret; - - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; - - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) - goto err; - - op->start_time = local_clock(); - op->pos = pos; - - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) - goto err; - - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) - goto err; - - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) - goto err; - - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - ret = bch2_data_update_init(trans, NULL, &op->write, - writepoint_hashed((unsigned long) current), - opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, - btree_id, k); - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - goto err; - } - - op->write.op.end_io = promote_done; - - return op; -err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; - kfree(op); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; -} - -noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) -{ - struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; - - if (!should_promote(c, k, pos, opts, flags)) - return NULL; - - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? 
BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); - if (!promote) - return NULL; - - *bounce = true; - *read_full = promote_full; - return promote; -} - -/* Read */ - -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (rbio->kmalloc) - kfree(rbio); - else - bio_put(&rbio->bio); - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); - bio_endio(&rbio->bio); -} - -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); -retry: - rbio->bio.bi_status = 0; - - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) - goto err; - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { - /* extent we wanted to read no longer exists: */ - rbio->hole = true; - goto out; - } - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - trace_and_count(c, read_retry, &rbio->bio); - - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); - - rbio->bio.bi_status = 0; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; - - if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, 
rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - __bch2_read(c, rbio, iter, inum, &failed, flags); - } -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) -{ - rbio->retry = retry; - - if (rbio->flags & BCH_READ_IN_RETRY) - return; - - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); - - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->version, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? */ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, - __bch2_rbio_narrow_crcs(&trans, rbio)); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; - struct bch_csum csum; - int ret; - - nofs_flags = memalloc_nofs_save(); - - /* Reset iterator for checksumming and copying bounced data: */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->bvec_iter; - } - - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) - goto csum_err; - - /* - * XXX - * We need to rework the narrow_crcs path to deliver the read completion - * first, and then punt to a different workqueue, otherwise we're - * holding up reads while doing btree updates which is bad for memory - * reclaim. 
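 *
 * (Narrowing, by example: if the stored checksum covers a 128 sector
 * extent of which only sectors 32-64 are still live,
 * __bch2_rbio_narrow_crcs() above verifies the existing wide checksum
 * once, then commits a crc covering just the 32 live sectors, so
 * future reads of this extent checksum a quarter of the data.)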
- */ - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; - - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - - if (rbio->promote) { - /* - * Re encrypt data we decrypted, so it's consistent with - * rbio->crc: - */ - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; - } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -out: - memalloc_nofs_restore(nofs_flags); - return; -csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); - bch2_io_error(ca); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; -decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -} - -static void bch2_read_endio(struct bio *bio) -{ - struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { - trace_and_count(c, read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, READ_ERR, 
BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -EIO; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); - - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); - printbuf_indent_add(&buf, 2); - prt_newline(&buf); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; - struct promote_op *promote = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - goto out_read_done; - } -retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); - - /* hole or reservation - just zero fill: */ - if (!pick_ret) - goto hole; - - if (pick_ret < 0) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from"); - goto err; - } - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - 
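/*
 * (A minimal sketch of the generation check behind the stale pointer
 * handling below - assumed semantics of gen_after()/ptr_stale(), not
 * the exact implementation: each bucket carries a small wrapping
 * generation counter that is bumped when the bucket is reused, and a
 * pointer whose recorded gen lags the bucket's current gen no longer
 * points at valid data:
 *
 *	static inline bool gen_newer(u8 bucket_gen, u8 ptr_gen)
 *	{
 *		return (s8) (bucket_gen - ptr_gen) > 0;
 *	}
 *
 * so a read racing with bucket reuse sees gen_newer() return true and
 * is retried from a different replica.)
 */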
- /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_IN_RETRY) && - !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); - goto retry_pick; - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(trans); - - if (flags & BCH_READ_NODECODE) { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) - goto hole; - - iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { - read_full = true; - bounce = true; - } - - if (orig->opts.promote_target) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - pick.crc.compressed_size = bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - offset_into_extent = 0; - } -get_bio: - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig->opts); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig->opts); - rbio->bio.bi_iter = iter; - rbio->split = true; - } else { - rbio = orig; - 
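
(In the deleted __bch2_read_extent() above, a read is forced to be a full-extent, bounced read whenever the extent is compressed, or is checksummed and read only partially, or is encrypted and destined for user-mapped pages. A simplified mirror of that predicate, using placeholder field names rather than the real bch_extent_crc_unpacked:)

#include <stdbool.h>
#include <stdio.h>

/* Simplified mirror of the read_full/bounce decision above; field and
 * flag names are placeholders, not the real bcachefs types. */
struct crc_info {
	bool compressed;
	bool has_csum;
	bool encrypted;
};

static bool must_read_full(struct crc_info crc, unsigned want_sectors,
			   unsigned crc_sectors, bool user_mapped,
			   bool must_bounce)
{
	return crc.compressed ||
	       (crc.has_csum &&
		(want_sectors != crc_sectors ||	  /* partial read of a checksummed extent */
		 (crc.encrypted && user_mapped) || /* can't decrypt in place into user pages */
		 must_bounce));
}

int main(void)
{
	struct crc_info crc = { .compressed = false, .has_csum = true };

	printf("%d\n", must_read_full(crc, 8, 16, false, false)); /* 1: partial read */
	return 0;
}
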
rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->c = c; - rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); - rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; - rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->version; - rbio->promote = promote; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); - - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); - } - - if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(c, rbio)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { - return 0; - } else { - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->retry; - rbio = bch2_rbio_free(rbio); - - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; - - return ret; - } - -err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; - - orig->bio.bi_status = BLK_STS_IOERR; - goto out_read_done; - -hole: - /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: - */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; -} - -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - 
struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - u32 snapshot; - int ret; - - BUG_ON(flags & BCH_READ_NODECODE); - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(&trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(&trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min(sectors, k.k->size - offset_into_extent); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags); - if (ret) - break; - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(&trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(&trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == READ_RETRY || - ret == READ_RETRY_AVOID) - goto retry; - - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); - } -} - -void bch2_fs_io_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); - mempool_exit(&c->bio_bounce_pages); - bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); -} - -int bch2_fs_io_init(struct bch_fs *c) -{ - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; - - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_write_init; - - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; - - return 0; -} diff --git a/libbcachefs/io.h b/libbcachefs/io.h deleted file 
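
(bch2_fs_io_init() above sizes the bounce-page mempool so it can always bounce either a whole btree node or the largest encoded extent, whichever is bigger. A small sketch of that sizing arithmetic; the sizes below are example values, not defaults:)

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned max_u(unsigned a, unsigned b) { return a > b ? a : b; }

/* Sketch of the bounce-pool sizing in bch2_fs_io_init() above: the
 * pool must cover a full btree node or the largest encoded extent. */
int main(void)
{
	unsigned btree_node_size    = 256u << 10; /* 256 KiB, example */
	unsigned encoded_extent_max = 128u << 10; /* 128 KiB, example */

	unsigned pool_pages = max_u(btree_node_size, encoded_extent_max) / PAGE_SIZE;

	printf("bio_bounce_pages pool: %u pages\n", pool_pages);
	return 0;
}
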
mode 100644 index 1476380..0000000 --- a/libbcachefs/io.h +++ /dev/null @@ -1,202 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_H -#define _BCACHEFS_IO_H - -#include "checksum.h" -#include "bkey_buf.h" -#include "io_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -#define to_rbio(_bio) \ - container_of((_bio), struct bch_read_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -#define BLK_STS_REMOVED ((__force blk_status_t)128) - -const char *bch2_blk_status_to_str(blk_status_t); - -#define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(DONE) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - unsigned, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -void bch2_write(struct closure *); - -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - 
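
(The BCH_WRITE_FLAGS() x-macro in the deleted header above expands one flag list twice: once into bit indices (__BCH_WRITE_*) and once into masks (BCH_WRITE_*), so the two enums can never drift apart. A self-contained sketch of the same pattern, with made-up flag names:)

#include <stdio.h>

/* Same x-macro shape as BCH_WRITE_FLAGS() above; flag names invented. */
#define MY_FLAGS()	\
	x(SYNC)		\
	x(CACHED)	\
	x(DONE)

enum my_flag_bits {
#define x(f) __MY_##f,
	MY_FLAGS()
#undef x
	__MY_NR
};

enum my_flags {
#define x(f) MY_##f = 1U << __MY_##f,
	MY_FLAGS()
#undef x
};

int main(void)
{
	unsigned flags = MY_SYNC | MY_DONE;

	printf("nr flags: %d, mask: 0x%x\n", __MY_NR, flags);
	return 0;
}
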
struct bkey_buf *); - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) -{ - if (k->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); -} - -enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); -} - -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - struct bch_io_failures failed = { .nr = 0 }; - - BUG_ON(rbio->_state); - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; - return rbio; -} - -void bch2_fs_io_exit(struct bch_fs *); -int bch2_fs_io_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h deleted file mode 100644 index 737f16d..0000000 --- a/libbcachefs/io_types.h +++ /dev/null @@ -1,165 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_TYPES_H -#define _BCACHEFS_IO_TYPES_H - -#include "alloc_types.h" -#include "btree_types.h" -#include "buckets_types.h" -#include "extents_types.h" -#include "keylist_types.h" -#include "opts.h" -#include "super_types.h" - -#include -#include - -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * was checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. 
- * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 bounce:1, - split:1, - kmalloc:1, - have_ioref:1, - narrow_crcs:1, - hole:1, - retry:2, - context:2; - }; - u16 _state; - }; - - struct bch_devs_list devs_have; - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct promote_op *promote; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - -struct bch_write_bio { - struct_group(wbio, - struct bch_fs *c; - struct bch_write_bio *parent; - - u64 submit_time; - u64 inode_offset; - - struct bch_devs_list failed; - u8 dev; - - unsigned split:1, - bounce:1, - put_bio:1, - have_ioref:1, - nocow:1, - used_mempool:1, - first_btree_write:1; - ); - - struct bio bio; -}; - -struct bch_write_op { - struct closure cl; - struct bch_fs *c; - void (*end_io)(struct bch_write_op *); - u64 start_time; - - unsigned written; /* sectors */ - u16 flags; - s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ - - unsigned compression_opt:8; - unsigned csum_type:4; - unsigned nr_replicas:4; - unsigned nr_replicas_required:4; - unsigned watermark:3; - unsigned incompressible:1; - unsigned stripe_waited:1; - - struct bch_devs_list devs_have; - u16 target; - u16 nonce; - struct bch_io_opts opts; - - u32 subvol; - struct bpos pos; - struct bversion version; - - /* For BCH_WRITE_DATA_ENCODED: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; - - struct write_point *wp; - struct list_head wp_list; - - struct disk_reservation res; - - struct open_buckets open_buckets; - - u64 new_i_size; - s64 i_sectors_delta; - - struct bch_devs_mask failed; - - struct keylist insert_keys; - u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; - - /* - * Bitmask of devices that have had nocow writes issued to them since - * last flush: - */ - struct bch_devs_mask *devs_need_flush; - - /* Must be last: */ - struct bch_write_bio wbio; -}; - -#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 80a612c..7d44813 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -63,6 +63,7 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { unsigned i; + for (i = 0; i < ARRAY_SIZE(p->list); i++) INIT_LIST_HEAD(&p->list[i]); INIT_LIST_HEAD(&p->flushed); @@ -131,13 +132,21 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } -/* journal entry close/open: */ - -void __bch2_journal_buf_put(struct journal *j) +/* + * Final processing when the last reference of a journal buffer has been + * dropped. Drop the pin list reference acquired at journal entry open and write + * the buffer, if requested. 
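
(struct bch_read_bio above overlays its one-bit flags on a single u16 _state, so initialization can clear every flag with one store and assertions can test the whole state at once. An illustrative reduction of the trick; bitfield layout is compiler-defined, which mirrors how the kernel source itself uses u16 bitfields here:)

#include <stdint.h>
#include <stdio.h>

/* Same trick as struct bch_read_bio above: per-bit flags overlaid on
 * one u16 so the whole state can be reset or tested in a single store. */
struct rbio_state {
	union {
		struct {
			uint16_t	bounce:1,
					split:1,
					have_ioref:1,
					narrow_crcs:1,
					hole:1,
					retry:2,
					context:2;
		};
		uint16_t		_state;
	};
};

int main(void)
{
	struct rbio_state s = { ._state = 0 }; /* clear every flag at once */

	s.bounce = 1;
	s.retry  = 2;
	printf("raw state: 0x%04x\n", s._state);

	s._state = 0;				/* reset, as rbio_init() does */
	printf("after reset: bounce=%u retry=%u\n", s.bounce, s.retry);
	return 0;
}
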
+ */ +void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + lockdep_assert_held(&j->lock); + + if (__bch2_journal_pin_put(j, seq)) + bch2_journal_reclaim_fast(j); + if (write) + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } /* @@ -203,13 +212,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); - cancel_delayed_work(&j->write_work); bch2_journal_space_available(j); - bch2_journal_buf_put(j, old.idx); + __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) @@ -354,11 +361,6 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - if (j->res_get_blocked_start) - bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - mod_delayed_work(c->io_complete_wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); @@ -458,15 +460,12 @@ retry: __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ret = journal_entry_open(j); - if (ret == JOURNAL_ERR_max_in_flight) + if (ret == JOURNAL_ERR_max_in_flight) { + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, true); trace_and_count(c, journal_entry_full, c); -unlock: - if ((ret && ret != JOURNAL_ERR_insufficient_devices) && - !j->res_get_blocked_start) { - j->res_get_blocked_start = local_clock() ?: 1; - trace_and_count(c, journal_full, c); } - +unlock: can_discard = j->can_discard; spin_unlock(&j->lock); @@ -514,42 +513,11 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != - -BCH_ERR_journal_res_get_blocked|| + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } -/* journal_preres: */ - -static bool journal_preres_available(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - - if (!ret && mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - - return ret; -} - -int __bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - int ret; - - closure_wait_event(&j->preres_wait, - (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s, flags)); - return ret; -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -588,8 +556,13 @@ out: /** * bch2_journal_flush_seq_async - wait for a journal entry to be written + * @j: journal object + * @seq: seq to flush + * @parent: closure object to wait with + * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, + * -EIO if @seq will never be flushed * - * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, @@ -829,12 +802,12 @@ static int 
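
(bch2_journal_res_get_slowpath() above parks the caller with closure_wait_event() until __journal_res_get() stops returning -BCH_ERR_journal_res_get_blocked, or returns immediately when NONBLOCK is set. A userspace analogue of that wait/wake shape using pthreads; everything here is an illustrative stand-in, not the kernel closure API:)

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define RES_GET_BLOCKED (-1)

static pthread_mutex_t lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wakeup = PTHREAD_COND_INITIALIZER;
static int space_available;

static int try_res_get(void)
{
	return space_available ? 0 : RES_GET_BLOCKED;
}

static void *reclaim_thread(void *arg)
{
	(void) arg;
	usleep(10000);			/* pretend reclaim frees journal space */
	pthread_mutex_lock(&lock);
	space_available = 1;
	pthread_cond_broadcast(&wakeup); /* journal_wake() equivalent */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	int ret;

	pthread_create(&t, NULL, reclaim_thread, NULL);

	pthread_mutex_lock(&lock);
	while ((ret = try_res_get()) == RES_GET_BLOCKED)
		pthread_cond_wait(&wakeup, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("reservation acquired: %d\n", ret);
	return 0;
}
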
__bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, ca->mi.bucket_size)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); - bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "marking new journal buckets"); break; } @@ -910,7 +883,7 @@ err_unblock: if (ret && !new_fs) for (i = 0; i < nr_got; i++) bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, bu[i], BCH_DATA_free, 0)); err_free: if (!new_fs) @@ -944,7 +917,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, goto unlock; while (ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0 }; + struct disk_reservation disk_res = { 0, 0, 0 }; /* * note: journal buckets aren't really counted as _sectors_ used yet, so @@ -1008,6 +981,25 @@ err: return ret; } +int bch2_fs_journal_alloc(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) { + if (ca->journal.nr) + continue; + + int ret = bch2_dev_journal_alloc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return 0; +} + /* startup/shutdown: */ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) @@ -1159,9 +1151,9 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(sb); + bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = - bch2_sb_get_journal_v2(sb); + bch2_sb_field_get(sb, journal_v2); unsigned i, nr_bvecs; ja->nr = 0; @@ -1260,6 +1252,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) union journal_res_state s; struct bch_dev *ca; unsigned long now = jiffies; + u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; u64 seq; unsigned i; @@ -1273,21 +1266,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "average write size:\t"); + prt_human_readable_u64(out, nr_writes ? 
div64_u64(j->entry_bytes_written, nr_writes) : 0); + prt_newline(out); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 008a2e2..c85d01c 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -252,9 +252,10 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -void __bch2_journal_buf_put(struct journal *); - -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) +/* + * Drop reference on a buffer index and return true if the count has hit zero. + */ +static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; @@ -264,9 +265,30 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) .buf2_count = idx == 2, .buf3_count = idx == 3, }).v, &j->reservations.counter); + return s; +} + +void bch2_journal_buf_put_final(struct journal *, u64, bool); + +static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); +} + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; - if (!journal_state_count(s, idx) && idx == s.unwritten_idx) - __bch2_journal_buf_put(j); + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) { + spin_lock(&j->lock); + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + spin_unlock(&j->lock); + } } /* @@ -286,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx); + bch2_journal_buf_put(j, res->idx, res->seq); res->ref = 0; } @@ -373,104 +395,6 @@ out: return 0; } -/* journal_preres: */ - -static inline void journal_set_watermark(struct journal *j) -{ - union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = BCH_WATERMARK_stripe; - - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (!s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static inline void bch2_journal_preres_put(struct journal *j, - struct journal_preres *res) -{ - union journal_preres_state s = { .reserved = res->u64s }; - - if (!res->u64s) - return; - - 
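
(The buf_put rework above, journal_state_buf_put() plus bch2_journal_buf_put_final(), hinges on all four per-buffer reference counts living in one 64-bit journal_res_state word: a put is a single atomic subtraction whose returned snapshot tells the caller whether this buffer just hit zero. A reduced sketch assuming an 8-bits-per-buffer layout; the real union packs more state alongside the counts:)

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t state;

static unsigned buf_count(uint64_t s, unsigned idx)
{
	return (s >> (idx * 8)) & 0xff;
}

static void buf_get(unsigned idx)
{
	atomic_fetch_add(&state, 1ULL << (idx * 8));
}

/* Returns true if this put dropped the last reference, i.e. the caller
 * should run "final" processing, as bch2_journal_buf_put_final() does. */
static int buf_put(unsigned idx)
{
	uint64_t s = atomic_fetch_sub(&state, 1ULL << (idx * 8))
		   - (1ULL << (idx * 8));
	return buf_count(s, idx) == 0;
}

int main(void)
{
	buf_get(1);
	buf_get(1);
	printf("put -> last? %d\n", buf_put(1)); /* 0 */
	printf("put -> last? %d\n", buf_put(1)); /* 1: do final put */
	return 0;
}
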
s.v = atomic64_sub_return(s.v, &j->prereserved.counter); - res->u64s = 0; - - if (unlikely(s.waiting)) { - clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), - (unsigned long *) &j->prereserved.v); - closure_wake_up(&j->preres_wait); - } - - if (s.reserved <= s.remaining && j->watermark) - journal_set_watermark(j); -} - -int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned, unsigned); - -static inline int bch2_journal_preres_get_fast(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags, - bool set_waiting) -{ - int d = new_u64s - res->u64s; - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret; - - do { - old.v = new.v = v; - ret = 0; - - if (watermark == BCH_WATERMARK_reclaim || - new.reserved + d < new.remaining) { - new.reserved += d; - ret = 1; - } else if (set_waiting && !new.waiting) - new.waiting = true; - else - return 0; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); - - if (ret) - res->u64s += d; - return ret; -} - -static inline int bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - if (new_u64s <= res->u64s) - return 0; - - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) - return 0; - - if (flags & JOURNAL_RES_GET_NONBLOCK) - return -BCH_ERR_journal_preres_get_blocked; - - return __bch2_journal_preres_get(j, res, new_u64s, flags); -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, @@ -512,6 +436,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index f861ae2..109c115 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -8,12 +8,12 @@ #include "checksum.h" #include "disk_groups.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "replicas.h" +#include "sb-clean.h" #include "trace.h" static struct nonce journal_nonce(const struct jset *jset) @@ -140,7 +140,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, if (!dup->csum_good) goto replace; - fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", + fsck_err(c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); i = dup; goto found; @@ -208,38 +209,47 @@ static void journal_entry_null_range(void *start, void *end) #define JOURNAL_ENTRY_BAD 7 static void journal_entry_err_msg(struct printbuf *out, + u32 version, struct jset *jset, struct jset_entry *entry) { - prt_str(out, "invalid journal entry "); - if (entry) - prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); - - if (!jset) - prt_printf(out, "in superblock"); - else if (!entry) - prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); - else - prt_printf(out, "at offset %zi/%u seq %llu", - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), - le64_to_cpu(jset->seq)); + prt_str(out, "invalid journal entry, version="); + bch2_version_to_text(out, version); + + if 
(entry) { + prt_str(out, " type="); + prt_str(out, bch2_jset_entry_types[entry->type]); + } + + if (!jset) { + prt_printf(out, " in superblock"); + } else { + + prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); + + if (entry) + prt_printf(out, " offset=%zi/%u", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s)); + } + prt_str(out, ": "); } -#define journal_entry_err(c, jset, entry, msg, ...) \ +#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ ({ \ - struct printbuf buf = PRINTBUF; \ + struct printbuf _buf = PRINTBUF; \ \ - journal_entry_err_msg(&buf, jset, entry); \ - prt_printf(&buf, msg, ##__VA_ARGS__); \ + journal_entry_err_msg(&_buf, version, jset, entry); \ + prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (write) { \ + switch (flags & BKEY_INVALID_WRITE) { \ case READ: \ - mustfix_fsck_err(c, "%s", buf.buf); \ + mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ + bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ @@ -247,12 +257,12 @@ static void journal_entry_err_msg(struct printbuf *out, break; \ } \ \ - printbuf_exit(&buf); \ + printbuf_exit(&_buf); \ true; \ }) -#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ - ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, ...) \ + ((cond) ? journal_entry_err(__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 @@ -261,13 +271,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset_entry *entry, unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, int write) + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { + int write = flags & BKEY_INVALID_WRITE; void *next = vstruct_next(entry); struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { + if (journal_entry_err_on(!k->k.u64s, + c, version, jset, entry, + journal_entry_bkey_u64s_0, + "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; @@ -275,7 +290,8 @@ static int journal_validate_key(struct bch_fs *c, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), - c, jset, entry, + c, version, jset, entry, + journal_entry_bkey_past_end, "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); @@ -283,7 +299,8 @@ static int journal_validate_key(struct bch_fs *c, } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, - c, jset, entry, + c, version, jset, entry, + journal_entry_bkey_bad_format, "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -298,11 +315,7 @@ static int journal_validate_key(struct bch_fs *c, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", - bch2_jset_entry_types[entry->type], - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), - le64_to_cpu(jset->seq)); + journal_entry_err_msg(&buf, version, jset, entry); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -311,7 +324,8 @@ 
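
(The rewritten journal_entry_err() macro above behaves differently by direction: on read it is a fixable fsck error and validation continues on the repaired entry; before a write the same condition means we are about to persist corrupt metadata, so the error is counted in the superblock and the write is aborted. A stripped-down sketch of that asymmetry, with invented names:)

#include <stdbool.h>
#include <stdio.h>

enum validate_mode { MODE_READ, MODE_WRITE };

static int check(bool bad, enum validate_mode mode, const char *msg)
{
	if (!bad)
		return 0;

	if (mode == MODE_READ) {
		printf("fixing: %s\n", msg);	/* mustfix_fsck_err() path */
		return 0;			/* entry repaired, keep going */
	}

	printf("corrupt metadata before write: %s\n", msg);
	return -1;				/* abort the write */
}

int main(void)
{
	printf("%d\n", check(true, MODE_READ,  "k->u64s 0"));
	printf("%d\n", check(true, MODE_WRITE, "k->u64s 0"));
	return 0;
}
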
static int journal_validate_key(struct bch_fs *c, bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf); - mustfix_fsck_err(c, "%s", buf.buf); + mustfix_fsck_err(c, journal_entry_bkey_invalid, + "%s", buf.buf); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -330,9 +344,10 @@ fsck_err: } static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct bkey_i *k = entry->start; @@ -341,7 +356,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, entry->level, entry->btree_id, k, version, big_endian, - write|BKEY_INVALID_JOURNAL); + flags|BKEY_INVALID_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; @@ -362,23 +377,25 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs prt_newline(out); prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); } - prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } } static int journal_entry_btree_root_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct bkey_i *k = entry->start; int ret = 0; if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, - c, jset, entry, + c, version, jset, entry, + journal_entry_btree_root_bad_size, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -392,7 +409,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, } return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, write); + version, big_endian, flags); fsck_err: return ret; } @@ -404,9 +421,10 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { /* obsolete, don't care: */ return 0; @@ -418,14 +436,16 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { int ret = 0; if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, - c, jset, entry, + c, version, jset, entry, + journal_entry_blacklist_bad_size, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -443,15 +463,17 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + 
unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, - c, jset, entry, + c, version, jset, entry, + journal_entry_blacklist_v2_bad_size, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -461,7 +483,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > le64_to_cpu(bl_entry->end), - c, jset, entry, + c, version, jset, entry, + journal_entry_blacklist_v2_start_past_end, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -482,9 +505,10 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ } static int journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -492,7 +516,8 @@ static int journal_entry_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u), - c, jset, entry, + c, version, jset, entry, + journal_entry_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -514,9 +539,10 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -525,7 +551,8 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, - c, jset, entry, + c, version, jset, entry, + journal_entry_data_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -546,9 +573,10 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -556,13 +584,17 @@ static int journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, jset, entry, "bad size")) { + c, version, jset, entry, + journal_entry_clock_bad_size, + "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, jset, entry, "bad rw")) { + c, version, jset, entry, + journal_entry_clock_bad_rw, + "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -581,9 +613,10 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct 
jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -593,7 +626,9 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, jset, entry, "bad size (%u < %u)", + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -602,13 +637,17 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, jset, entry, "bad dev")) { + c, version, jset, entry, + journal_entry_dev_usage_bad_dev, + "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, jset, entry, "bad pad")) { + c, version, jset, entry, + journal_entry_dev_usage_bad_pad, + "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -641,9 +680,10 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return 0; } @@ -658,9 +698,10 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -674,7 +715,8 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, int); + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -691,11 +733,12 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - unsigned version, int big_endian, int write) + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return entry->type < BCH_JSET_ENTRY_NR ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, write) + version, big_endian, flags) : 0; } @@ -711,22 +754,23 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - int write) + enum bkey_invalid_flags flags) { struct jset_entry *entry; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, jset, entry, + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), + c, version, jset, entry, + journal_entry_past_jset_end, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } ret = bch2_journal_entry_validate(c, jset, entry, - le32_to_cpu(jset->version), - JSET_BIG_ENDIAN(jset), write); + version, JSET_BIG_ENDIAN(jset), flags); if (ret) break; } @@ -737,7 +781,7 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - int write) + enum bkey_invalid_flags flags) { unsigned version; int ret = 0; @@ -746,7 +790,9 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, + jset_unsupported_version, "%s sector %llu seq %llu: incompatible journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -757,7 +803,8 @@ static int jset_validate(struct bch_fs *c, } if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, jset, NULL, + c, version, jset, NULL, + jset_unknown_csum, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -767,7 +814,8 @@ static int jset_validate(struct bch_fs *c, /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), - c, jset, NULL, + c, version, jset, NULL, + jset_last_seq_newer_than_seq, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { @@ -775,7 +823,7 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_BAD; } - ret = jset_validate_entries(c, jset, write); + ret = jset_validate_entries(c, jset, flags); fsck_err: return ret; } @@ -788,14 +836,16 @@ static int jset_validate_early(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); unsigned version; - int write = READ; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, + jset_unsupported_version, "%s sector %llu seq %llu: unknown journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -810,7 +860,8 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, jset, NULL, + c, version, jset, NULL, + jset_past_bucket_end, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca ? 
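
(bch2_journal_entry_validate() above dispatches through bch2_jset_entry_ops[], an array of per-type validate/to_text function pointers, with a bounds check so entry types this version does not know about are accepted as no-ops. The same shape in miniature, with invented entry types:)

#include <stdio.h>

enum entry_type { ENTRY_BTREE_KEYS, ENTRY_CLOCK, ENTRY_NR };

struct entry_ops {
	int (*validate)(const char *payload);
};

static int validate_btree_keys(const char *p) { printf("keys: %s\n", p);  return 0; }
static int validate_clock(const char *p)      { printf("clock: %s\n", p); return 0; }

static const struct entry_ops ops[] = {
	[ENTRY_BTREE_KEYS]	= { .validate = validate_btree_keys },
	[ENTRY_CLOCK]		= { .validate = validate_clock },
};

/* Unknown (newer) types fall through as valid, like the code above. */
static int entry_validate(unsigned type, const char *payload)
{
	return type < ENTRY_NR ? ops[type].validate(payload) : 0;
}

int main(void)
{
	entry_validate(ENTRY_CLOCK, "rw=0");
	entry_validate(42, "unknown type is ignored");
	return 0;
}
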
ca->name : c->name, sector, le64_to_cpu(jset->seq), bytes)) @@ -879,7 +930,7 @@ reread: ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, + if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, "journal read error: sector %llu", offset) || bch2_meta_read_fault("journal")) { @@ -935,7 +986,8 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); csum_good = jset_csum_good(c, j); - if (!csum_good) + if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, + "journal checksum error")) saw_bad = true; ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), @@ -1027,12 +1079,20 @@ found: if (ja->bucket_seq[ja->cur_idx] && ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to be + * correlated where we stopped in the journal to the zone write + * points: + */ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } +#endif ja->sectors_free = 0; } @@ -1127,7 +1187,7 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - int write = READ; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; i = *_i; @@ -1149,7 +1209,8 @@ int bch2_journal_read(struct bch_fs *c, } if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, &i->j, NULL, + c, le32_to_cpu(i->j.version), &i->j, NULL, + jset_last_seq_newer_than_seq, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(i->j.last_seq), le64_to_cpu(i->j.seq))) @@ -1166,7 +1227,8 @@ int bch2_journal_read(struct bch_fs *c, } if (!*last_seq) { - fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, + "journal read done, but no entries found after dropping non-flushes"); return 0; } @@ -1192,6 +1254,7 @@ int bch2_journal_read(struct bch_fs *c, if (bch2_journal_seq_is_blacklisted(c, seq, true)) { fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); i->ignore = true; } @@ -1232,7 +1295,8 @@ int bch2_journal_read(struct bch_fs *c, bch2_journal_ptrs_to_text(&buf2, c, i); missing_end = seq - 1; - fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" + fsck_err(c, journal_entries_missing, + "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" " prev at %s\n" " next at %s", missing_start, missing_end, @@ -1259,7 +1323,7 @@ int bch2_journal_read(struct bch_fs *c, continue; for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) bch_err_dev_offset(ca, i->ptrs[ptr].sector, @@ -1281,18 +1345,15 @@ int bch2_journal_read(struct bch_fs *c, bch2_replicas_entry_sort(&replicas.e); - /* - * If we're mounting in degraded mode - if we didn't read all - * the devices - this is wrong: - */ - printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, &replicas.e); if (!degraded && - fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, - "superblock not marked as containing replicas %s", - buf.buf)) { + !bch2_replicas_marked(c, &replicas.e) && + (le64_to_cpu(i->j.seq) == *last_seq || + fsck_err(c, journal_entry_replicas_not_marked, + "superblock not marked as containing replicas for journal entry %llu\n %s", + le64_to_cpu(i->j.seq), buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) goto err; @@ -1361,16 +1422,21 @@ static void __journal_write_alloc(struct journal *j, } /** - * journal_next_bucket - move on to the next journal bucket if possible + * journal_write_alloc - decide where to write next journal entry + * + * @j: journal object + * @w: journal buf (entry to be written) + * + * Returns: 0 on success, or -EROFS on failure */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) +static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = @@ -1459,6 +1525,7 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; int err = 0; @@ -1470,7 +1537,13 @@ static void journal_write_done(struct closure *cl) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); err = -EIO; + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; } + if (err) bch2_fatal_error(c); @@ -1519,11 +1592,15 @@ static void journal_write_done(struct closure *cl) bch2_journal_space_available(j); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, false); + closure_wake_up(&w->wait); journal_wake(j); if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + spin_unlock(&j->lock); closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { @@ -1536,10 +1613,11 @@ static void journal_write_done(struct closure *cl) * might want to be written now: */ + spin_unlock(&j->lock); mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } else { + spin_unlock(&j->lock); } - - spin_unlock(&j->lock); } static void 
journal_write_endio(struct bio *bio) @@ -1549,7 +1627,8 @@ static void journal_write_endio(struct bio *bio) struct journal_buf *w = journal_last_unwritten_buf(j); unsigned long flags; - if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { @@ -1607,12 +1686,17 @@ static void do_journal_write(struct closure *cl) } continue_at(cl, journal_write_done, c->io_complete_wq); - return; } -static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) +static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) { - struct jset_entry *i, *next, *prev = NULL; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset *jset = w->data; + unsigned sectors, bytes, u64s; + bool validate_before_checksum = false; + unsigned long btree_roots_have = 0; + int ret; /* * Simple compaction, dropping empty jset_entries (from journal @@ -1629,8 +1713,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset if (!u64s) continue; - if (i->type == BCH_JSET_ENTRY_btree_root) + /* + * New btree roots are set by journalling them; when the journal + * entry gets written we have to propagate them to + * c->btree_roots + * + * But, every journal entry we write has to contain all the + * btree roots (at least for now); so after we copy btree roots + * to c->btree_roots we have to get any missing btree roots and + * add them to this journal entry: + */ + if (i->type == BCH_JSET_ENTRY_btree_root) { bch2_journal_entry_to_btree_root(c, i); + __set_bit(i->btree_id, &btree_roots_have); + } /* Can we merge with previous entry? */ if (prev && @@ -1654,85 +1750,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset prev = prev ? vstruct_next(prev) : jset->start; jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - -void bch2_journal_write(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_replicas_padded replicas; - struct jset_entry *start, *end; - struct jset *jset; - struct bio *bio; - struct printbuf journal_debug_buf = PRINTBUF; - bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s, nr_rw_members = 0; - int ret; - - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - - journal_buf_realloc(j, w); - jset = w->data; - - j->write_start_time = local_clock(); - - spin_lock(&j->lock); - - /* - * If the journal is in an error state - we did an emergency shutdown - - * we prefer to continue doing journal writes. We just mark them as - * noflush so they'll never be used, but they'll still be visible by the - * list_journal tool - this helps in debugging. - * - * There's a caveat: the first journal write after marking the - * superblock dirty must always be a flush write, because on startup - * from a clean shutdown we didn't necessarily read the journal and the - * new journal write might overwrite whatever was in the journal - * previously - we can't leave the journal without any flush writes in - * it. - * - * So if we're in an error state, and we're still starting up, we don't - * write anything at all. 
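Stepping back to the write-prep hunk above: every journal entry has to carry a full set of btree roots, so bch2_journal_write_prep() first records, in the btree_roots_have bitmap, which roots the entry already journals; the missing ones are appended afterwards via bch2_btree_roots_to_journal_entries(). A minimal standalone sketch of that two-pass bitmap pattern (all names below are invented for illustration, not bcachefs API):

#include <stdio.h>

#define MAX_BTREE_IDS 8

/* Invented miniature of a journal entry that names a btree root: */
struct root_entry {
	unsigned btree_id;
};

int main(void)
{
	struct root_entry present[] = { { 0 }, { 3 }, { 5 } };
	unsigned long roots_have = 0;
	unsigned i;

	/* Pass 1: note every root the entry already carries: */
	for (i = 0; i < sizeof(present) / sizeof(present[0]); i++)
		roots_have |= 1UL << present[i].btree_id;

	/* Pass 2: append whichever roots are still missing: */
	for (i = 0; i < MAX_BTREE_IDS; i++)
		if (!(roots_have & (1UL << i)))
			printf("would append root for btree %u\n", i);

	return 0;
}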
- */ - if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && - (bch2_journal_error(j) || - w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { - w->noflush = true; - SET_JSET_NO_FLUSH(jset, true); - jset->last_seq = 0; - w->last_seq = 0; - - j->nr_noflush_writes++; - } else if (!bch2_journal_error(j)) { - j->last_flush_write = jiffies; - j->nr_flush_writes++; - clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); - } else { - spin_unlock(&j->lock); - goto err; - } - spin_unlock(&j->lock); - - /* - * New btree roots are set by journalling them; when the journal entry - * gets written we have to propagate them to c->btree_roots - * - * But, every journal entry we write has to contain all the btree roots - * (at least for now); so after we copy btree roots to c->btree_roots we - * have to get any missing btree roots and add them to this journal - * entry: - */ - - bch2_journal_entries_postprocess(c, jset); start = end = vstruct_last(jset); - end = bch2_btree_roots_to_journal_entries(c, jset->start, end); + end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); bch2_journal_super_entries_add_common(c, &end, le64_to_cpu(jset->seq)); @@ -1748,7 +1769,7 @@ void bch2_journal_write(struct closure *cl) bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", vstruct_bytes(jset), w->sectors << 9, u64s, w->u64s_reserved, j->entry_u64s_reserved); - goto err; + return -EINVAL; } jset->magic = cpu_to_le64(jset_magic(c)); @@ -1767,37 +1788,119 @@ void bch2_journal_write(struct closure *cl) validate_before_checksum = true; if (validate_before_checksum && - jset_validate(c, NULL, jset, 0, WRITE)) - goto err; + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); if (bch2_fs_fatal_err_on(ret, c, "error decrypting journal entry: %i", ret)) - goto err; + return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate(c, NULL, jset, 0, WRITE)) - goto err; + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; memset((void *) jset + bytes, 0, (sectors << 9) - bytes); + return 0; +} + +static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int error = bch2_journal_error(j); + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. + * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. 
+ */ + if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + return -EIO; + + if (error || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + w->noflush = true; + SET_JSET_NO_FLUSH(w->data, true); + w->data->last_seq = 0; + w->last_seq = 0; + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } + + return 0; +} + +void bch2_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; + struct bio *bio; + struct printbuf journal_debug_buf = PRINTBUF; + unsigned i, nr_rw_members = 0; + int ret; + + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + j->write_start_time = local_clock(); -retry_alloc: spin_lock(&j->lock); - ret = journal_write_alloc(j, w, sectors); + ret = bch2_journal_write_pick_flush(j, w); + spin_unlock(&j->lock); + if (ret) + goto err; + + journal_buf_realloc(j, w); + + ret = bch2_journal_write_prep(j, w); + if (ret) + goto err; + + j->entry_bytes_written += vstruct_bytes(w->data); + + while (1) { + spin_lock(&j->lock); + ret = journal_write_alloc(j, w); + if (!ret || !j->can_discard) + break; - if (ret && j->can_discard) { spin_unlock(&j->lock); bch2_journal_do_discards(j); - goto retry_alloc; } - if (ret) + if (ret) { __bch2_journal_debug_to_text(&journal_debug_buf, j); + spin_unlock(&j->lock); + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); + goto err; + } /* * write is allocated, no longer need to account for it in @@ -1812,13 +1915,6 @@ retry_alloc: bch2_journal_space_available(j); spin_unlock(&j->lock); - if (ret) { - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); - goto err; - } - w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); if (c->opts.nochanges) @@ -1840,7 +1936,7 @@ retry_alloc: if (ret) goto err; - if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(ca, c, i) { percpu_ref_get(&ca->io_ref); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 8801e98..a88d097 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -50,7 +50,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, jset_entry_for_each_key(entry, k) int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, int); + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 8de83e1..8fa05be 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -3,13 +3,14 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "buckets.h" #include "errcode.h" #include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" -#include "super.h" +#include "sb-members.h" #include "trace.h" #include @@ -49,16 +50,25 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } 
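Returning to the allocation loop added in bch2_journal_write() above: if journal_write_alloc() fails and discards are still possible, it drops the lock, runs bch2_journal_do_discards(), and retries, giving up only once no further discards can help. The same retry-until-no-progress pattern, reduced to a standalone sketch (the helper names here are hypothetical, not bcachefs functions):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the real allocation/discard hooks: */
static int space_left;

static int try_alloc(void)
{
	return space_left > 0 ? (space_left--, 0) : -ENOSPC;
}

static bool can_discard = true;

static void do_discards(void)
{
	space_left = 4;		/* discarding frees some buckets */
	can_discard = false;	/* ...but only once in this toy model */
}

/*
 * Allocate; on failure reclaim space via discards and retry, giving up
 * once there is nothing left to discard:
 */
static int alloc_with_discard_retry(void)
{
	int ret;

	while (1) {
		ret = try_alloc();
		if (!ret || !can_discard)
			break;
		do_discards();
	}
	return ret;
}

int main(void)
{
	printf("ret = %d\n", alloc_with_discard_retry());	/* prints 0 */
	return 0;
}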
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +static inline void journal_set_watermark(struct journal *j) { - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); - - do { - old.v = new.v = v; - new.remaining = u64s_remaining; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool low_on_space = j->space[journal_space_clean].total * 4 <= + j->space[journal_space_total].total; + bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; + unsigned watermark = low_on_space || low_on_pin + ? BCH_WATERMARK_reclaim + : BCH_WATERMARK_stripe; + + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], + &j->low_on_space_start, low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], + &j->low_on_pin_start, low_on_pin)) + trace_and_count(c, journal_full, c); + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static struct journal_space @@ -161,7 +171,6 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -221,16 +230,10 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - u64s_remaining = (u64) clean << 6; - u64s_remaining -= (u64) total << 3; - u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 4; - u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; - journal_set_remaining(j, u64s_remaining); - journal_set_watermark(j); if (!ret) journal_wake(j); @@ -289,9 +292,8 @@ void bch2_journal_do_discards(struct journal *j) * entry, holding it open to ensure it gets replayed during recovery: */ -static void bch2_journal_reclaim_fast(struct journal *j) +void bch2_journal_reclaim_fast(struct journal *j) { - struct journal_entry_pin_list temp; bool popped = false; lockdep_assert_held(&j->lock); @@ -302,7 +304,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) */ while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { - fifo_pop(&j->pin, temp); + j->pin.front++; popped = true; } @@ -310,19 +312,16 @@ static void bch2_journal_reclaim_fast(struct journal *j) bch2_journal_space_available(j); } -void __bch2_journal_pin_put(struct journal *j, u64 seq) +bool __bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - if (atomic_dec_and_test(&pin_list->count)) - bch2_journal_reclaim_fast(j); + return atomic_dec_and_test(&pin_list->count); } void bch2_journal_pin_put(struct journal *j, u64 seq) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - if (atomic_dec_and_test(&pin_list->count)) { + if (__bch2_journal_pin_put(j, seq)) { spin_lock(&j->lock); bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); @@ -345,7 +344,7 @@ static inline bool __journal_pin_drop(struct journal *j, list_del_init(&pin->list); /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if + * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: */ return atomic_dec_and_test(&pin_list->count) && @@ -372,15 +371,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) return JOURNAL_PIN_other; } -void bch2_journal_pin_set(struct journal *j, u64 seq, +static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) + journal_pin_flush_fn flush_fn, + enum journal_pin_type type) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + /* + * flush_fn is how we identify journal pins in debugfs, so must always + * exist, even if it doesn't do anything: + */ + BUG_ON(!flush_fn); + + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + list_add(&pin->list, &pin_list->list[type]); +} + +void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list; bool reclaim; spin_lock(&j->lock); + u64 seq = READ_ONCE(src->seq); + if (seq < journal_last_seq(j)) { /* * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on @@ -392,18 +412,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, return; } - pin_list = journal_seq_pin(j, seq); + reclaim = __journal_pin_drop(j, dst); - reclaim = __journal_pin_drop(j, pin); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; + if (reclaim) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); - if (flush_fn) - list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); - else - list_add(&pin->list, &pin_list->flushed); + /* + * If the journal is currently full, we might want to call flush_fn + 
* immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + bool reclaim; + + spin_lock(&j->lock); + + BUG_ON(seq < journal_last_seq(j)); + + reclaim = __journal_pin_drop(j, pin); + + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -418,6 +454,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, /** * bch2_journal_pin_flush: ensure journal pin callback is no longer running + * @j: journal object + * @pin: pin to flush */ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) { @@ -556,11 +594,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - /* And include pre-reservations: */ - nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, - (ca->mi.bucket_size << 6) - - journal_entry_overhead(j)); - nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; @@ -578,7 +611,11 @@ static u64 journal_seq_to_flush(struct journal *j) } /** - * bch2_journal_reclaim - free up journal buckets + * __bch2_journal_reclaim - free up journal buckets + * @j: journal object + * @direct: direct or background reclaim? + * @kicked: requested to run since we last ran? + * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -635,10 +672,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; - if (j->prereserved.reserved * 4 > j->prereserved.remaining) - min_nr = 1; - - if (fifo_free(&j->pin) <= 32) + if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) @@ -649,8 +683,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - j->prereserved.reserved, - j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, atomic_long_read(&c->btree_key_cache.nr_dirty), @@ -757,7 +789,7 @@ int bch2_journal_reclaim_start(struct journal *j) "bch-reclaim/%s", c->name); ret = PTR_ERR_OR_ZERO(p); if (ret) { - bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating journal reclaim thread"); return ret; } @@ -802,6 +834,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { + /* time_stats this */ bool did_work = false; if (!test_bit(JOURNAL_STARTED, &j->flags)) diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 0fd1af1..7b15d68 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -31,7 +31,8 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } -void __bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_reclaim_fast(struct journal *); +bool __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); @@ -46,17 +47,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, bch2_journal_pin_set(j, seq, pin, 
flush_fn); } -static inline void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - /* Guard against racing with journal_pin_drop(src): */ - u64 seq = READ_ONCE(src->seq); - - if (seq) - bch2_journal_pin_add(j, seq, dst, flush_fn); -} +void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_update(struct journal *j, u64 seq, struct journal_entry_pin *pin, diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index cc41bff..ae4fb8c 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -21,7 +21,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; @@ -45,15 +45,15 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, goto err; } - if (b[0] < le16_to_cpu(m->first_bucket)) { + if (b[0] < le16_to_cpu(m.first_bucket)) { prt_printf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m->first_bucket)); + b[0], le16_to_cpu(m.first_bucket)); goto err; } - if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m->nbuckets)); + b[nr - 1], le64_to_cpu(m.nbuckets)); goto err; } @@ -104,7 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct printbuf *err) { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; @@ -130,15 +130,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, goto err; } - if (b[0].start < le16_to_cpu(m->first_bucket)) { + if (b[0].start < le16_to_cpu(m.first_bucket)) { prt_printf(err, "journal bucket %llu before first bucket %u", - b[0].start, le16_to_cpu(m->first_bucket)); + b[0].start, le16_to_cpu(m.first_bucket)); goto err; } - if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); goto err; } @@ -194,7 +194,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, if (buckets[i] + 1 != buckets[i + 1]) nr_compacted++; - j = bch2_sb_resize_journal_v2(&ca->disk_sb, + j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) return -BCH_ERR_ENOSPC_sb_journal; diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index d6b9f2c..f9d9aa9 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -58,8 +58,8 @@ blacklist_entry_try_merge(struct bch_fs *c, &bl->start[i + 1], sizeof(bl->start[0]) * (nr - i)); - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, - sb_blacklist_u64s(nr)); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr)); BUG_ON(!bl); } @@ -79,7 +79,7 @@ int 
bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) int ret = 0; mutex_lock(&c->sb_lock); - bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); for (i = 0; i < nr; i++) { @@ -100,8 +100,8 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) } } - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, - sb_blacklist_u64s(nr + 1)); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr + 1)); if (!bl) { ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; goto out; @@ -158,7 +158,7 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); @@ -250,20 +250,18 @@ void bch2_blacklist_entries_gc(struct work_struct *work) struct journal_seq_blacklist_table *t; struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); unsigned i, nr, new_nr; int ret; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < BTREE_ID_NR; i++) { struct btree_iter iter; struct btree *b; - bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); b = bch2_btree_iter_peek_node(&iter); @@ -275,15 +273,15 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return; mutex_lock(&c->sb_lock); - bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) goto out; @@ -308,7 +306,7 @@ retry: bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); if (new_nr != nr) { - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, new_nr ? 
sb_blacklist_u64s(new_nr) : 0); BUG_ON(new_nr && !bl); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 42504e1..2427cce 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -76,14 +76,6 @@ struct journal_res { u64 seq; }; -/* - * For reserving space in the journal prior to getting a reservation on a - * particular journal entry: - */ -struct journal_preres { - unsigned u64s; -}; - union journal_res_state { struct { atomic64_t counter; @@ -104,22 +96,6 @@ union journal_res_state { }; }; -union journal_preres_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 waiting:1, - reserved:31, - remaining:32; - }; -}; - /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -180,8 +156,6 @@ struct journal { union journal_res_state reservations; enum bch_watermark watermark; - union journal_preres_state prereserved; - } __aligned(SMP_CACHE_BYTES); unsigned long flags; @@ -288,15 +262,18 @@ struct journal { unsigned long last_flush_write; - u64 res_get_blocked_start; u64 write_start_time; u64 nr_flush_writes; u64 nr_noflush_writes; + u64 entry_bytes_written; + + u64 low_on_space_start; + u64 low_on_pin_start; + u64 max_in_flight_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *blocked_time; struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index 3e8b8f2..e6d081c 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -10,17 +10,17 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (!lru_pos_time(k.k->p)) { - prt_printf(err, "lru entry at time=0"); - return -BCH_ERR_invalid_bkey; - - } + int ret = 0; - return 0; + bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, + lru_entry_at_time_0, + "lru entry at time=0"); +fsck_err: + return ret; } void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, @@ -95,6 +95,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, int ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, + lru_entry_to_invalid_bucket, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) return bch2_btree_delete_at(trans, lru_iter, 0); @@ -125,7 +126,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, } if (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + fsck_err(c, lru_entry_bad, + "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", bch2_lru_types[type], @@ -151,10 +153,10 @@ int bch2_check_lrus(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index be66bf9..429dca8 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -48,7 +48,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int 
bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 81c8cdb..8e5688d 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -10,7 +10,7 @@ #include "buckets.h" #include "errcode.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "keylist.h" #include "migrate.h" @@ -78,34 +78,32 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; enum btree_id id; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for (id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct closure cl; struct btree *b; @@ -117,16 +115,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); - bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { - bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; - while (bch2_trans_begin(&trans), + while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) @@ -141,15 +139,14 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); + ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } if (ret) { - bch_err(c, "Error updating btree node key: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "updating btree node key"); break; } next: @@ -158,7 +155,7 @@ next: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) goto err; @@ -167,8 +164,8 @@ next: bch2_btree_interior_updates_flush(c); ret = 0; err: - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); + bch2_trans_put(trans); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 0527267..4f7d175 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -14,11 +14,13 @@ #include "errcode.h" #include "error.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "replicas.h" +#include "snapshot.h" #include 
"super-io.h" #include "trace.h" @@ -58,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c } } -static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_add(&stats->list, &c->data_progress_list); - mutex_unlock(&c->data_progress_lock); -} - -static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_del(&stats->list); - mutex_unlock(&c->data_progress_lock); -} - struct moving_io { struct list_head read_list; struct list_head io_list; @@ -155,35 +143,31 @@ static void move_read_endio(struct bio *bio) closure_put(&ctxt->cl); } -void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, - struct btree_trans *trans) +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) { struct moving_io *io; - if (trans) - bch2_trans_unlock(trans); - while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { + bch2_trans_unlock_long(ctxt->trans); list_del(&io->read_list); move_write(io); } } -static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, - struct btree_trans *trans) +void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) { unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - move_ctxt_wait_event(ctxt, trans, + move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->write_sectors) || atomic_read(&ctxt->write_sectors) != sectors_pending); } void bch2_moving_ctxt_exit(struct moving_context *ctxt) { - struct bch_fs *c = ctxt->c; + struct bch_fs *c = ctxt->trans->c; - move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); EBUG_ON(atomic_read(&ctxt->write_sectors)); @@ -191,16 +175,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->read_sectors)); EBUG_ON(atomic_read(&ctxt->read_ios)); - if (ctxt->stats) { - progress_list_del(c, ctxt->stats); - trace_move_data(c, - atomic64_read(&ctxt->stats->sectors_moved), - atomic64_read(&ctxt->stats->keys_moved)); - } - mutex_lock(&c->moving_context_lock); list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); + + bch2_trans_put(ctxt->trans); + memset(ctxt, 0, sizeof(*ctxt)); } void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -212,7 +192,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, { memset(ctxt, 0, sizeof(*ctxt)); - ctxt->c = c; + ctxt->trans = bch2_trans_get(c); ctxt->fn = (void *) _RET_IP_; ctxt->rate = rate; ctxt->stats = stats; @@ -229,16 +209,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, mutex_lock(&c->moving_context_lock); list_add(&ctxt->list, &c->moving_context_list); mutex_unlock(&c->moving_context_lock); +} - if (stats) { - progress_list_add(c, stats); - stats->data_type = BCH_DATA_user; - } +void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) +{ + trace_move_data(c, stats); } void bch2_move_stats_init(struct bch_move_stats *stats, char *name) { memset(stats, 0, sizeof(*stats)); + stats->data_type = BCH_DATA_user; scnprintf(stats->name, sizeof(stats->name), "%s", name); } @@ -282,18 +263,17 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, return bch2_trans_relock(trans) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -static int bch2_move_extent(struct btree_trans *trans, - 
struct btree_iter *iter, - struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bch_io_opts io_opts, - enum btree_id btree_id, - struct bkey_s_c k, - struct data_update_opts data_opts) +int bch2_move_extent(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_io_opts io_opts, + struct data_update_opts data_opts) { + struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct moving_io *io; @@ -302,6 +282,8 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + if (ctxt->stats) + ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); @@ -354,7 +336,7 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, btree_id, k); + io_opts, data_opts, iter->btree_id, k); if (ret && ret != -BCH_ERR_unwritten_extent_update) goto err_free_pages; @@ -366,9 +348,11 @@ static int bch2_move_extent(struct btree_trans *trans, BUG_ON(ret); - io->write.ctxt = ctxt; io->write.op.end_io = move_write_done; + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + if (ctxt->stats) { atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); @@ -398,7 +382,7 @@ static int bch2_move_extent(struct btree_trans *trans, closure_get(&ctxt->cl); bch2_read_extent(trans, &io->rbio, bkey_start_pos(k.k), - btree_id, k, 0, + iter->btree_id, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -412,45 +396,96 @@ err: return ret; } -static int lookup_inode(struct btree_trans *trans, struct bpos pos, - struct bch_inode_unpacked *inode) +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, + struct bkey_s_c extent_k) +{ + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; + int ret = 0; + + if (io_opts->cur_inum != extent_k.k->p.inode) { + struct btree_iter iter; + struct bkey_s_c k; + + io_opts->d.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != extent_k.k->p.inode) + break; + + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; + bch2_inode_opts_get(&e.io_opts, trans->c, &inode); + + ret = darray_push(&io_opts->d, e); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + io_opts->cur_inum = extent_k.k->p.inode; + } + + ret = ret ?: trans_was_restarted(trans, restart_count); + if (ret) + return ERR_PTR(ret); + + if (extent_k.k->p.snapshot) { + struct snapshot_io_opts_entry *i; + darray_for_each(io_opts->d, i) + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) + return &i->io_opts; + } + + return &io_opts->fs_io_opts; +} + +int bch2_move_get_io_opts_one(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c extent_k) { struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(&iter); + /* reflink btree? 
*/ + if (!extent_k.k->p.inode) { + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + return 0; + } + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_CACHED); ret = bkey_err(k); - if (ret) - goto err; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; - if (!k.k || !bkey_eq(k.k->p, pos)) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + if (!ret && bkey_is_inode(k.k)) { + struct bch_inode_unpacked inode; + bch2_inode_unpack(k, &inode); + bch2_inode_opts_get(io_opts, trans->c, &inode); + } else { + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); } - ret = bkey_is_inode(k.k) ? 0 : -EIO; - if (ret) - goto err; - - ret = bch2_inode_unpack(k, inode); - if (ret) - goto err; -err: bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } -static int move_ratelimit(struct btree_trans *trans, - struct moving_context *ctxt) +int bch2_move_ratelimit(struct moving_context *ctxt) { - struct bch_fs *c = trans->c; + struct bch_fs *c = ctxt->trans->c; u64 delay; - if (ctxt->wait_on_copygc) { - bch2_trans_unlock(trans); + if (ctxt->wait_on_copygc && !c->copygc_running) { + bch2_trans_unlock_long(ctxt->trans); wait_event_killable(c->copygc_running_wq, !c->copygc_running || kthread_should_stop()); @@ -459,8 +494,12 @@ static int move_ratelimit(struct btree_trans *trans, do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; + if (delay) { - bch2_trans_unlock(trans); + if (delay > HZ / 10) + bch2_trans_unlock_long(ctxt->trans); + else + bch2_trans_unlock(ctxt->trans); set_current_state(TASK_INTERRUPTIBLE); } @@ -473,7 +512,7 @@ static int move_ratelimit(struct btree_trans *trans, schedule_timeout(delay); if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); @@ -482,7 +521,7 @@ static int move_ratelimit(struct btree_trans *trans, * XXX: these limits really ought to be per device, SSDs and hard drives * will want different limits */ - move_ctxt_wait_event(ctxt, trans, + move_ctxt_wait_event(ctxt, atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && @@ -491,64 +530,39 @@ static int move_ratelimit(struct btree_trans *trans, return 0; } -static int move_get_io_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct bkey_s_c k, u64 *cur_inum) -{ - struct bch_inode_unpacked inode; - int ret; - - if (*cur_inum == k.k->p.inode) - return 0; - - ret = lookup_inode(trans, - SPOS(0, k.k->p.inode, k.k->p.snapshot), - &inode); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret) - bch2_inode_opts_get(io_opts, trans->c, &inode); - else - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - *cur_inum = k.k->p.inode; - return 0; -} - -static int __bch2_move_data(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +static int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id) { - struct bch_fs *c = ctxt->c; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct per_snapshot_io_opts snapshot_io_opts; + struct bch_io_opts 
*io_opts; struct bkey_buf sk; - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct data_update_opts data_opts; - u64 cur_inum = U64_MAX; int ret = 0, ret2; + per_snapshot_io_opts_init(&snapshot_io_opts, c); bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); if (ctxt->stats) { ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->btree_id = btree_id; - ctxt->stats->pos = start; + ctxt->stats->pos = BBPOS(btree_id, start); } - bch2_trans_iter_init(&trans, &iter, btree_id, start, + bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); - while (!move_ratelimit(&trans, ctxt)) { - bch2_trans_begin(&trans); + while (!bch2_move_ratelimit(ctxt)) { + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); if (!k.k) @@ -564,17 +578,18 @@ static int __bch2_move_data(struct moving_context *ctxt, break; if (ctxt->stats) - ctxt->stats->pos = iter.pos; + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, &io_opts, &data_opts)) + if (!pred(c, arg, k, io_opts, &data_opts)) goto next; /* @@ -584,24 +599,20 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, - io_opts, btree_id, k, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, &trans); + bch2_move_ctxt_wait_for_io(ctxt); continue; } /* XXX signal failure */ goto next; } - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); @@ -609,60 +620,69 @@ next_nondata: bch2_btree_iter_advance(&iter); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); + per_snapshot_io_opts_exit(&snapshot_io_opts); return ret; } -int bch2_move_data(struct bch_fs *c, - enum btree_id start_btree_id, struct bpos start_pos, - enum btree_id end_btree_id, struct bpos end_pos, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) +int __bch2_move_data(struct moving_context *ctxt, + struct bbpos start, + struct bbpos end, + move_pred_fn pred, void *arg) { - struct moving_context ctxt; + struct bch_fs *c = ctxt->trans->c; enum btree_id id; - int ret; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + int ret = 0; - for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + for (id = start.btree; + id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); id++) { - stats->btree_id = id; - - if (id != BTREE_ID_extents && - id != BTREE_ID_reflink) - continue; + ctxt->stats->pos = BBPOS(id, POS_MIN); - if (!bch2_btree_id_root(c, id)->b) + if (!btree_type_has_ptrs(id) || + !bch2_btree_id_root(c, id)->b) continue; - ret = __bch2_move_data(&ctxt, - id == start_btree_id ? 
start_pos : POS_MIN, - id == end_btree_id ? end_pos : POS_MAX, + ret = bch2_move_data_btree(ctxt, + id == start.btree ? start.pos : POS_MIN, + id == end.btree ? end.pos : POS_MAX, pred, arg, id); if (ret) break; } + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bbpos start, + struct bbpos end, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int __bch2_evacuate_bucket(struct btree_trans *trans, - struct moving_context *ctxt, +int __bch2_evacuate_bucket(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) { - struct bch_fs *c = ctxt->c; + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter; struct bkey_buf sk; @@ -673,7 +693,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; u64 fragmentation; - u64 cur_inum = U64_MAX; struct bpos bp_pos = POS_MIN; int ret = 0; @@ -708,7 +727,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, goto err; } - while (!(ret = move_ratelimit(trans, ctxt))) { + while (!(ret = bch2_move_ratelimit(ctxt))) { bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, @@ -723,7 +742,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, if (!bp.level) { const struct bch_extent_ptr *ptr; - struct bkey_s_c k; unsigned i = 0; k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); @@ -738,7 +756,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; @@ -759,23 +777,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, i++; } - ret = bch2_move_extent(trans, &iter, ctxt, - bucket_in_flight, - io_opts, bp.btree_id, k, data_opts); + ret = bch2_move_extent(ctxt, bucket_in_flight, + &iter, k, io_opts, data_opts); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, trans); + bch2_move_ctxt_wait_for_io(ctxt); continue; } if (ret) goto err; - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); } else { @@ -826,15 +841,12 @@ int bch2_evacuate_bucket(struct bch_fs *c, struct write_point_specifier wp, bool wait_on_copygc) { - struct btree_trans trans; struct moving_context ctxt; int ret; - bch2_trans_init(&trans, c, 0, 0); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); - bch2_trans_exit(&trans); return ret; } @@ -851,31 +863,34 @@ static int bch2_move_btree(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = 
bch2_opts_to_inode_opts(c->opts); - struct btree_trans trans; + struct moving_context ctxt; + struct btree_trans *trans; struct btree_iter iter; struct btree *b; enum btree_id id; struct data_update_opts data_opts; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - progress_list_add(c, stats); + bch2_moving_ctxt_init(&ctxt, c, NULL, stats, + writepoint_ptr(&c->btree_write_point), + true); + trans = ctxt.trans; stats->data_type = BCH_DATA_btree; for (id = start_btree_id; id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); id++) { - stats->btree_id = id; + stats->pos = BBPOS(id, POS_MIN); if (!bch2_btree_id_root(c, id)->b) continue; - bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; - while (bch2_trans_begin(&trans), + while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) @@ -885,12 +900,12 @@ retry: bpos_cmp(b->key.k.p, end_pos)) > 0) break; - stats->pos = iter.pos; + stats->pos = BBPOS(iter.btree_id, iter.pos); if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -901,20 +916,16 @@ next: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (kthread && kthread_should_stop()) break; } - bch2_trans_exit(&trans); - - if (ret) - bch_err_fn(c, ret); - + bch_err_fn(c, ret); + bch2_moving_ctxt_exit(&ctxt); bch2_btree_interior_updates_flush(c); - progress_list_del(c, stats); return ret; } @@ -1035,8 +1046,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) mutex_unlock(&c->sb_lock); } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1059,14 +1069,16 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + (struct bbpos) { op.start_btree, op.start_pos }, + (struct bbpos) { op.end_btree, op.end_pos }, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; + + bch2_move_stats_exit(stats, c); break; case BCH_DATA_OP_MIGRATE: if (op.migrate.dev >= c->sb.nr_devices) @@ -1083,18 +1095,21 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + (struct bbpos) { op.start_btree, op.start_pos }, + (struct bbpos) { op.end_btree, op.end_pos }, NULL, stats, writepoint_hashed((unsigned long) current), true, migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; + + bch2_move_stats_exit(stats, c); break; case BCH_DATA_OP_REWRITE_OLD_NODES: bch2_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); + bch2_move_stats_exit(stats, c); break; default: ret = -EINVAL; @@ -1103,46 +1118,64 @@ int bch2_data_job(struct bch_fs *c, return ret; } -void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - struct bch_move_stats *stats; - - mutex_lock(&c->data_progress_lock); - list_for_each_entry(stats, &c->data_progress_list, list) { - prt_printf(out, "%s: data type %s 
btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - prt_printf(out, "%s", "\n"); - } - mutex_unlock(&c->data_progress_lock); + prt_printf(out, "%s: data type=%s pos=", + stats->name, + bch2_data_types[stats->data_type]); + bch2_bbpos_to_text(out, stats->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "keys moved: "); + prt_u64(out, atomic64_read(&stats->keys_moved)); + prt_newline(out); + + prt_str(out, "keys raced: "); + prt_u64(out, atomic64_read(&stats->keys_raced)); + prt_newline(out); + + prt_str(out, "bytes seen: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); + prt_newline(out); + + prt_str(out, "bytes moved: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); + prt_newline(out); + + prt_str(out, "bytes raced: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); + prt_newline(out); + + printbuf_indent_sub(out, 2); } -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { struct moving_io *io; - prt_printf(out, "%ps:", ctxt->fn); - prt_newline(out); + bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); - prt_printf(out, "reads: %u sectors %u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u", atomic_read(&ctxt->read_ios), - atomic_read(&ctxt->read_sectors)); + c->opts.move_ios_in_flight, + atomic_read(&ctxt->read_sectors), + c->opts.move_bytes_in_flight >> 9); prt_newline(out); - prt_printf(out, "writes: %u sectors %u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u", atomic_read(&ctxt->write_ios), - atomic_read(&ctxt->write_sectors)); + c->opts.move_ios_in_flight, + atomic_read(&ctxt->write_sectors), + c->opts.move_bytes_in_flight >> 9); prt_newline(out); printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); - list_for_each_entry(io, &ctxt->ios, io_list) { + list_for_each_entry(io, &ctxt->ios, io_list) bch2_write_op_to_text(out, &io->write.op); - } mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); @@ -1154,7 +1187,7 @@ void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->moving_context_lock); list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, ctxt); + bch2_moving_ctxt_to_text(out, c, ctxt); mutex_unlock(&c->moving_context_lock); } @@ -1162,7 +1195,4 @@ void bch2_fs_move_init(struct bch_fs *c) { INIT_LIST_HEAD(&c->moving_context_list); mutex_init(&c->moving_context_lock); - - INIT_LIST_HEAD(&c->data_progress_list); - mutex_init(&c->data_progress_lock); } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 547ee7b..07cf9d4 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "bbpos.h" +#include "bcachefs_ioctl.h" #include "btree_iter.h" #include "buckets.h" #include "data_update.h" @@ -10,7 +12,7 @@ struct bch_read_bio; struct moving_context { - struct bch_fs *c; + struct btree_trans *trans; struct list_head list; void *fn; @@ -36,13 +38,14 @@ struct moving_context { wait_queue_head_t wait; }; -#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +#define move_ctxt_wait_event(_ctxt, _cond) \ do { \ bool cond_finished = false; \ - bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + bch2_moving_ctxt_do_pending_writes(_ctxt); \ \ if (_cond) \ break; \ + 
bch2_trans_unlock_long((_ctxt)->trans); \ __wait_event((_ctxt)->wait, \ bch2_moving_ctxt_next_pending_write(_ctxt) || \ (cond_finished = (_cond))); \ @@ -58,22 +61,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool); struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); -void bch2_moving_ctxt_do_pending_writes(struct moving_context *, - struct btree_trans *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *); +void bch2_move_ctxt_wait_for_io(struct moving_context *); +int bch2_move_ratelimit(struct moving_context *); + +/* Inodes in different snapshots may have different IO options: */ +struct snapshot_io_opts_entry { + u32 snapshot; + struct bch_io_opts io_opts; +}; + +struct per_snapshot_io_opts { + u64 cur_inum; + struct bch_io_opts fs_io_opts; + DARRAY(struct snapshot_io_opts_entry) d; +}; + +static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) +{ + memset(io_opts, 0, sizeof(*io_opts)); + io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); +} + +static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) +{ + darray_exit(&io_opts->d); +} + +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); +int bch2_move_extent(struct moving_context *, + struct move_bucket_in_flight *, + struct btree_iter *, + struct bkey_s_c, + struct bch_io_opts, + struct data_update_opts); + +int __bch2_move_data(struct moving_context *, + struct bbpos, + struct bbpos, + move_pred_fn, void *); int bch2_move_data(struct bch_fs *, - enum btree_id, struct bpos, - enum btree_id, struct bpos, + struct bbpos start, + struct bbpos end, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct btree_trans *, - struct moving_context *, +int __bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); @@ -87,8 +128,10 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); -void bch2_move_stats_init(struct bch_move_stats *stats, char *name); -void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); +void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); +void bch2_move_stats_init(struct bch_move_stats *, char *); + void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_move_init(struct bch_fs *); diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h index baf1f85..e22841e 100644 --- a/libbcachefs/move_types.h +++ b/libbcachefs/move_types.h @@ -2,17 +2,17 @@ #ifndef _BCACHEFS_MOVE_TYPES_H #define _BCACHEFS_MOVE_TYPES_H +#include "bbpos_types.h" + struct bch_move_stats { enum bch_data_type data_type; - enum btree_id btree_id; - struct bpos pos; - struct list_head list; + struct bbpos pos; char name[32]; atomic64_t keys_moved; atomic64_t keys_raced; - atomic64_t sectors_moved; atomic64_t sectors_seen; + atomic64_t sectors_moved; atomic64_t sectors_raced; }; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 5242f20..0a05763 100644 --- 
a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -13,25 +13,17 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" -#include "disk_groups.h" #include "errcode.h" #include "error.h" -#include "extents.h" -#include "eytzinger.h" -#include "io.h" -#include "keylist.h" #include "lru.h" #include "move.h" #include "movinggc.h" -#include "super-io.h" #include "trace.h" -#include #include #include #include #include -#include #include struct buckets_in_flight { @@ -109,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; } -static void move_buckets_wait(struct btree_trans *trans, - struct moving_context *ctxt, +static void move_buckets_wait(struct moving_context *ctxt, struct buckets_in_flight *list, bool flush) { @@ -119,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans, while ((i = list->first)) { if (flush) - move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); + move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); if (atomic_read(&i->count)) break; @@ -137,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans, kfree(i); } - bch2_trans_unlock(trans); + bch2_trans_unlock_long(ctxt->trans); } static bool bucket_in_flight(struct buckets_in_flight *list, @@ -148,19 +139,19 @@ static bool bucket_in_flight(struct buckets_in_flight *list, typedef DARRAY(struct move_bucket) move_buckets; -static int bch2_copygc_get_buckets(struct btree_trans *trans, - struct moving_context *ctxt, +static int bch2_copygc_get_buckets(struct moving_context *ctxt, struct buckets_in_flight *buckets_in_flight, move_buckets *buckets) { + struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; - move_buckets_wait(trans, ctxt, buckets_in_flight, false); + move_buckets_wait(ctxt, buckets_in_flight, false); ret = bch2_btree_write_buffer_flush(trans); if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", @@ -172,7 +163,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret = 0; + int ret2 = 0; saw++; @@ -181,11 +172,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - if (ret >= 0) + ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret2 >= 0) sectors += b.sectors; } - ret; + ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", @@ -196,10 +187,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, } noinline -static int bch2_copygc(struct btree_trans *trans, - struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight) +static int bch2_copygc(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + bool *did_work) { + struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct data_update_opts data_opts = { .btree_insert_flags = BCH_WATERMARK_copygc, @@ -210,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans, u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = 
bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); if (ret) goto err; @@ -220,17 +212,21 @@ static int bch2_copygc(struct btree_trans *trans, f = move_bucket_in_flight_add(buckets_in_flight, *i); ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ + if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ + ret = 0; continue; + } if (ret == -ENOMEM) { /* flush IO, continue later */ ret = 0; break; } - ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, + ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, f->bucket.k.gen, data_opts); if (ret) goto err; + + *did_work = true; } err: darray_exit(&buckets); @@ -240,7 +236,7 @@ err: ret = 0; if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "from bch2_move_data()"); moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; trace_and_count(c, copygc, c, moved, 0, 0, 0); @@ -306,25 +302,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; - struct btree_trans trans; struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight move_buckets; + struct buckets_in_flight *buckets; u64 last, wait; int ret = 0; - memset(&move_buckets, 0, sizeof(move_buckets)); - - ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); + if (!buckets) + return -ENOMEM; + ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); if (ret) { - bch_err(c, "error allocating copygc buckets in flight: %s", - bch2_err_str(ret)); + kfree(buckets); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } set_freezable(); - bch2_trans_init(&trans, c, 0, 0); bch2_move_stats_init(&move_stats, "copygc"); bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, @@ -332,16 +327,18 @@ static int bch2_copygc_thread(void *arg) false); while (!ret && !kthread_should_stop()) { - bch2_trans_unlock(&trans); + bool did_work = false; + + bch2_trans_unlock_long(ctxt.trans); cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(&ctxt, buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(&ctxt, buckets, true); __refrigerator(false); continue; } @@ -352,7 +349,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(&ctxt, buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -362,16 +359,29 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&trans, &ctxt, &move_buckets); + ret = bch2_copygc(&ctxt, buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); + + if (!wait && !did_work) { + u64 min_member_capacity = bch2_min_rw_member_capacity(c); + + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + 
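/*
 * Sketch of the idle-wait heuristic used just below (not part of the
 * patch; the helper name is hypothetical): the write IO clock ticks in
 * 512-byte sectors, so when copygc found no work it rechecks only after
 * roughly 1/64th of the smallest writable member's capacity has been
 * written, falling back to 128 * 2048 sectors (128 MiB) when no RW
 * member reports a capacity:
 *
 *	static u64 copygc_idle_wait_sectors(u64 min_member_capacity)
 *	{
 *		if (min_member_capacity == U64_MAX)
 *			min_member_capacity = 128 * 2048;
 *		return min_member_capacity >> 6;
 *	}
 *
 * e.g. a 1 TiB member (2^31 sectors) waits 2^25 sectors, i.e. 16 GiB of
 * writes between copygc wakeups.
 */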
bch2_trans_unlock_long(ctxt.trans); + bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), + MAX_SCHEDULE_TIMEOUT); + } } - move_buckets_wait(&trans, &ctxt, &move_buckets, true); - rhashtable_destroy(&move_buckets.table); - bch2_trans_exit(&trans); + move_buckets_wait(&ctxt, buckets, true); + + rhashtable_destroy(&buckets->table); + kfree(buckets); bch2_moving_ctxt_exit(&ctxt); + bch2_move_stats_exit(&move_stats, c); return 0; } @@ -402,7 +412,7 @@ int bch2_copygc_start(struct bch_fs *c) t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating copygc thread"); return ret; } diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c index 396357c..3c21981 100644 --- a/libbcachefs/nocow_locking.c +++ b/libbcachefs/nocow_locking.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_methods.h" #include "nocow_locking.h" #include "util.h" @@ -29,9 +30,10 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc for (i = 0; i < ARRAY_SIZE(l->b); i++) if (l->b[i] == dev_bucket) { - BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); + int v = atomic_sub_return(lock_val, &l->l[i]); - if (!atomic_sub_return(lock_val, &l->l[i])) + BUG_ON(v && sign(v) != lock_val); + if (!v) closure_wake_up(&l->wait); return; } @@ -64,6 +66,11 @@ got_entry: if (lock_val > 0 ? v < 0 : v > 0) goto fail; take_lock: + v = atomic_read(&l->l[i]); + /* Overflow? */ + if (v && sign(v + lock_val) != sign(v)) + goto fail; + atomic_add(lock_val, &l->l[i]); spin_unlock(&l->lock); return true; @@ -83,6 +90,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, } void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) + { unsigned i, nr_zero = 0; struct nocow_lock_bucket *l; @@ -102,9 +110,13 @@ void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_tab prt_printf(out, "(%u empty entries)\n", nr_zero); nr_zero = 0; - for (i = 0; i < ARRAY_SIZE(l->l); i++) - if (atomic_read(&l->l[i])) - prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i])); + for (i = 0; i < ARRAY_SIZE(l->l); i++) { + int v = atomic_read(&l->l[i]); + if (v) { + bch2_bpos_to_text(out, u64_to_bucket(l->b[i])); + prt_printf(out, ": %s %u ", v < 0 ? 
"copy" : "update", abs(v)); + } + } prt_newline(out); } @@ -112,12 +124,21 @@ void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_tab prt_printf(out, "(%u empty entries)\n", nr_zero); } +void bch2_fs_nocow_locking_exit(struct bch_fs *c) +{ + struct bucket_nocow_lock_table *t = &c->nocow_locks; + + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++) + BUG_ON(atomic_read(&l->l[j])); +} + int bch2_fs_nocow_locking_init(struct bch_fs *c) { - unsigned i; + struct bucket_nocow_lock_table *t = &c->nocow_locks; - for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) - spin_lock_init(&c->nocow_locks.l[i].lock); + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + spin_lock_init(&l->lock); return 0; } diff --git a/libbcachefs/nocow_locking.h b/libbcachefs/nocow_locking.h index ff8e4af..f9d6a42 100644 --- a/libbcachefs/nocow_locking.h +++ b/libbcachefs/nocow_locking.h @@ -44,6 +44,7 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); +void bch2_fs_nocow_locking_exit(struct bch_fs *); int bch2_fs_nocow_locking_init(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 960bb24..8dd4046 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -37,9 +37,8 @@ const char * const bch2_sb_compat[] = { NULL }; -const char * const bch2_btree_ids[] = { +const char * const __bch2_btree_ids[] = { BCH_BTREE_IDS() - "interior btree node", NULL }; @@ -266,14 +265,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) if (err) prt_printf(err, "%s: too small (min %llu)", opt->attr.name, opt->min); - return -ERANGE; + return -BCH_ERR_ERANGE_option_too_small; } if (opt->max && v >= opt->max) { if (err) prt_printf(err, "%s: too big (max %llu)", opt->attr.name, opt->max); - return -ERANGE; + return -BCH_ERR_ERANGE_option_too_big; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { @@ -290,6 +289,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) return -EINVAL; } + if (opt->fn.validate) + return opt->fn.validate(v, err); + return 0; } @@ -471,8 +473,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, val = "0"; } + /* Unknown options are ignored: */ if (id < 0) - goto bad_opt; + continue; if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8a9db11..8526f17 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -15,7 +15,7 @@ extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; -extern const char * const bch2_btree_ids[]; +extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_types[]; @@ -73,6 +73,7 @@ enum opt_type { struct bch_opt_fn { int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + int (*validate)(u64, struct printbuf *); }; /** @@ -469,7 +470,7 @@ struct bch_opts { #undef x }; -static const struct bch_opts bch2_opts_default = { +static const __maybe_unused struct bch_opts bch2_opts_default = { #define 
x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ ._name = _default, \ diff --git a/libbcachefs/printbuf.c b/libbcachefs/printbuf.c index c41daa1..5e653eb 100644 --- a/libbcachefs/printbuf.c +++ b/libbcachefs/printbuf.c @@ -81,8 +81,10 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) } /** - * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null - * terminated + * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be + * null terminated + * @buf: printbuf to terminate + * Returns: Printbuf contents, as a nul terminated C string */ const char *bch2_printbuf_str(const struct printbuf *buf) { @@ -97,8 +99,9 @@ const char *bch2_printbuf_str(const struct printbuf *buf) } /** - * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it + * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it * against accidental use. + * @buf: printbuf to exit */ void bch2_printbuf_exit(struct printbuf *buf) { @@ -120,7 +123,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf) } /* - * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop + * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop * * @buf: printbuf to control * @spaces: number of spaces from previous tabstop @@ -144,7 +147,7 @@ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) } /** - * printbuf_indent_add - add to the current indent level + * bch2_printbuf_indent_add() - add to the current indent level * * @buf: printbuf to control * @spaces: number of spaces to add to the current indent level @@ -164,7 +167,7 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) } /** - * printbuf_indent_sub - subtract from the current indent level + * bch2_printbuf_indent_sub() - subtract from the current indent level * * @buf: printbuf to control * @spaces: number of spaces to subtract from the current indent level @@ -227,9 +230,8 @@ static void __prt_tab(struct printbuf *out) } /** - * prt_tab - Advance printbuf to the next tabstop - * - * @buf: printbuf to control + * bch2_prt_tab() - Advance printbuf to the next tabstop + * @out: printbuf to control * * Advance output to the next tabstop by printing spaces.
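 *
 * Usage sketch (the tabstop column is an assumption for illustration,
 * not taken from this patch):
 *
 *	printbuf_tabstop_push(out, 20);
 *	prt_str(out, "name:");
 *	prt_tab(out);
 *	prt_str(out, "value");
 *
 * pads "name:" with spaces out to column 20 before "value" is printed.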
*/ @@ -267,7 +269,7 @@ static void __prt_tab_rjust(struct printbuf *buf) } /** - * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying * previous output * * @buf: printbuf to control @@ -284,11 +286,11 @@ void bch2_prt_tab_rjust(struct printbuf *buf) } /** - * prt_bytes_indented - Print an array of chars, handling embedded control characters + * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters * - * @out: printbuf to output to - * @str: string to print - * @count: number of bytes to print + * @out: output printbuf + * @str: string to print + * @count: number of bytes to print * * The following control characters are handled as follows: * \n: prt_newline newline that obeys current indent level @@ -335,32 +337,38 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou } /** - * prt_human_readable_u64 - Print out a u64 in human readable units + * bch2_prt_human_readable_u64() - Print out a u64 in human readable units + * @out: output printbuf + * @v: integer to print * - * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ -void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) +void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { - bch2_printbuf_make_room(buf, 10); - buf->pos += string_get_size(v, 1, !buf->si_units, - buf->buf + buf->pos, - printbuf_remaining_size(buf)); + bch2_printbuf_make_room(out, 10); + out->pos += string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); } /** - * prt_human_readable_s64 - Print out a s64 in human readable units + * bch2_prt_human_readable_s64() - Print out a s64 in human readable units + * @out: output printbuf + * @v: integer to print * - * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ -void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) +void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) { if (v < 0) - prt_char(buf, '-'); - bch2_prt_human_readable_u64(buf, abs(v)); + prt_char(out, '-'); + bch2_prt_human_readable_u64(out, abs(v)); } /** - * prt_units_u64 - Print out a u64 according to printbuf unit options + * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print * * Units are either raw (default), or human readable units (controlled via * @buf->human_readable_units) @@ -374,7 +382,9 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v) } /** - * prt_units_s64 - Print out a s64 according to printbuf unit options + * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print * * Units are either raw (default), or human readable units (controlled via * @buf->human_readable_units) @@ -405,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out, while (list[nr]) nr++; - while (flags && (bit = __ffs(flags)) < nr) { + while (flags && (bit = __ffs64(flags)) < nr) { if (!first) bch2_prt_printf(out, ","); first = false; bch2_prt_printf(out, "%s", list[bit]); - flags ^= 1 << bit; + flags ^= BIT_ULL(bit); } } diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 4f0654f..a54647c 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -5,7 +5,7 @@ #include "error.h" #include "inode.h" #include
"quota.h" -#include "subvolume.h" +#include "snapshot.h" #include "super-io.h" static const char * const bch2_quota_types[] = { @@ -59,17 +59,18 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (k.k->p.inode >= QTYP_NR) { - prt_printf(err, "invalid quota type (%llu >= %u)", - k.k->p.inode, QTYP_NR); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, + quota_type_invalid, + "invalid quota type (%llu >= %u)", + k.k->p.inode, QTYP_NR); +fsck_err: + return ret; } void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, @@ -513,12 +514,12 @@ void bch2_fs_quota_init(struct bch_fs *c) static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) { - struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); + struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota); if (sb_quota) return sb_quota; - sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); + sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64)); if (sb_quota) { unsigned qtype, qc; @@ -536,7 +537,7 @@ static void bch2_sb_quota_read(struct bch_fs *c) struct bch_sb_field_quota *sb_quota; unsigned i, j; - sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota); if (!sb_quota) return; @@ -572,7 +573,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, if (!s_t.master_subvol) goto advance; - ret = bch2_inode_find_by_inum_trans(trans, + ret = bch2_inode_find_by_inum_nowarn_trans(trans, (subvol_inum) { le32_to_cpu(s_t.master_subvol), k.k->p.offset, @@ -599,7 +600,7 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { struct bch_sb_field_quota *sb_quota; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; int ret; @@ -614,16 +615,16 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, POS_MIN, BTREE_ITER_PREFETCH, k, __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + for_each_btree_key2(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - bch2_fs_quota_read_inode(&trans, &iter, k)); + bch2_fs_quota_read_inode(trans, &iter, k)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -786,7 +787,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type, { struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; - struct bch_memquota_type *q; int ret = 0; if (0) { @@ -810,8 +810,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type, ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) return -EINVAL; - q = &c->quotas[type]; - mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { @@ -959,7 +957,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + bch2_set_quota_trans(trans, &new_quota, 
qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); return bch2_err_class(ret); diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h index 2f46387..884f601 100644 --- a/libbcachefs/quota.h +++ b/libbcachefs/quota.h @@ -8,7 +8,7 @@ enum bkey_invalid_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index c3d5772..db2139c 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -1,17 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" #include "compress.h" #include "disk_groups.h" #include "errcode.h" -#include "extents.h" -#include "io.h" +#include "error.h" +#include "inode.h" #include "move.h" #include "rebalance.h" +#include "subvolume.h" #include "super-io.h" #include "trace.h" @@ -19,298 +23,396 @@ #include #include -/* - * Check if an extent should be moved: - * returns -1 if it should not be moved, or - * device of pointer that should be moved, if known, or INT_MAX if unknown - */ -static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + +static const char * const bch2_rebalance_state_strs[] = { +#define x(t) #t, + BCH_REBALANCE_STATES() + NULL +#undef x +}; + +static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i; - - data_opts->rewrite_ptrs = 0; - data_opts->target = io_opts->background_target; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - - if (io_opts->background_compression && - !bch2_bkey_is_incompressible(k)) { - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (!p.ptr.cached && - p.crc.compression_type != - bch2_compression_opt_to_type(io_opts->background_compression)) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - } + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie *cookie; + u64 v; + int ret; - if (io_opts->background_target) { - const struct bch_extent_ptr *ptr; + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + v = k.k->type == KEY_TYPE_cookie + ? 
le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + + cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); + ret = PTR_ERR_OR_ZERO(cookie); + if (ret) + goto err; + + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter.pos; + cookie->v.cookie = cpu_to_le64(v + 1); + + ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} - i = 0; - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && - bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - } +int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) +{ + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + __bch2_set_rebalance_needs_scan(trans, inum)); + rebalance_wakeup(c); + return ret; +} - return data_opts->rewrite_ptrs != 0; +int bch2_set_fs_needs_rebalance(struct bch_fs *c) +{ + return bch2_set_rebalance_needs_scan(c, 0); } -void bch2_rebalance_add_key(struct bch_fs *c, - struct bkey_s_c k, - struct bch_io_opts *io_opts) +static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) { - struct data_update_opts update_opts = { 0 }; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; - unsigned i; - - if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) - return; - - i = 0; - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - if ((1U << i) && update_opts.rewrite_ptrs) - if (atomic64_add_return(k.k->size, - &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == - k.k->size) - rebalance_wakeup(c); - i++; - } + struct btree_iter iter; + struct bkey_s_c k; + u64 v; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + + if (v == cookie) + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } -void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, + struct btree_iter *work_iter) { - if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == - sectors) - rebalance_wakeup(c); + return !kthread_should_stop() + ? 
bch2_btree_iter_peek(work_iter) + : bkey_s_c_null; } -struct rebalance_work { - int dev_most_full_idx; - unsigned dev_most_full_percent; - u64 dev_most_full_work; - u64 dev_most_full_capacity; - u64 total_work; -}; +static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + extent_entry_drop(bkey_i_to_s(n), + (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +} -static void rebalance_work_accumulate(struct rebalance_work *w, - u64 dev_work, u64 unknown_dev, u64 capacity, int idx) +static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + struct bpos work_pos, + struct btree_iter *extent_iter, + struct data_update_opts *data_opts) { - unsigned percent_full; - u64 work = dev_work + unknown_dev; + struct bch_fs *c = trans->c; + struct bkey_s_c k; + + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, + work_pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek_slot(extent_iter); + if (bkey_err(k)) + return k; + + const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; + if (!r) { + /* raced due to btree write buffer, nothing to do */ + return bkey_s_c_null; + } - if (work < dev_work || work < unknown_dev) - work = U64_MAX; - work = min(work, capacity); + memset(data_opts, 0, sizeof(*data_opts)); - percent_full = div64_u64(work * 100, capacity); + data_opts->rewrite_ptrs = + bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); + data_opts->target = r->target; - if (percent_full >= w->dev_most_full_percent) { - w->dev_most_full_idx = idx; - w->dev_most_full_percent = percent_full; - w->dev_most_full_work = work; - w->dev_most_full_capacity = capacity; + if (!data_opts->rewrite_ptrs) { + /* + * device we would want to write to offline? devices in target + * changed? 
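 * Either way the stale rebalance opts get cleared just below, which
 * also drops this extent from the rebalance_work btree.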
+ * + * We'll now need a full scan before this extent is picked up + * again: + */ + int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); + return bkey_s_c_null; } - if (w->total_work + dev_work >= w->total_work && - w->total_work + dev_work >= dev_work) - w->total_work += dev_work; + return k; } -static struct rebalance_work rebalance_work(struct bch_fs *c) +noinline_for_stack +static int do_rebalance_extent(struct moving_context *ctxt, + struct bpos work_pos, + struct btree_iter *extent_iter) { - struct bch_dev *ca; - struct rebalance_work ret = { .dev_most_full_idx = -1 }; - u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); - unsigned i; - - for_each_online_member(ca, c, i) - rebalance_work_accumulate(&ret, - atomic64_read(&ca->rebalance_work), - unknown_dev, - bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket), - i); - - rebalance_work_accumulate(&ret, - unknown_dev, 0, c->capacity, -1); + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &trans->c->rebalance; + struct data_update_opts data_opts; + struct bch_io_opts io_opts; + struct bkey_s_c k; + struct bkey_buf sk; + int ret; + + ctxt->stats = &r->work_stats; + r->state = BCH_REBALANCE_working; + + bch2_bkey_buf_init(&sk); + + ret = bkey_err(k = next_rebalance_extent(trans, work_pos, + extent_iter, &data_opts)); + if (ret || !k.k) + goto out; + + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + if (ret) + goto out; + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); + if (ret) { + if (bch2_err_matches(ret, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + ret = -BCH_ERR_transaction_restart_nested; + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto out; + + /* skip it and continue, XXX signal failure */ + ret = 0; + } +out: + bch2_bkey_buf_exit(&sk, c); return ret; } -static void rebalance_work_reset(struct bch_fs *c) +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { - struct bch_dev *ca; - unsigned i; + unsigned target, compression; - for_each_online_member(ca, c, i) - atomic64_set(&ca->rebalance_work, 0); + if (k.k->p.inode) { + target = io_opts->background_target; + compression = io_opts->background_compression ?: io_opts->compression; + } else { + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - atomic64_set(&c->rebalance.work_unknown_dev, 0); + target = r ? r->target : io_opts->background_target; + compression = r ? 
r->compression : + (io_opts->background_compression ?: io_opts->compression); + } + + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); + data_opts->target = target; + return data_opts->rewrite_ptrs != 0; } -static unsigned long curr_cputime(void) +static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { - u64 utime, stime; + struct btree_trans *trans = ctxt->trans; + struct bch_fs_rebalance *r = &trans->c->rebalance; + int ret; + + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); + ctxt->stats = &r->scan_stats; + + if (!inum) { + r->scan_start = BBPOS_MIN; + r->scan_end = BBPOS_MAX; + } else { + r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); + r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); + } + + r->state = BCH_REBALANCE_scanning; + + ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); - task_cputime_adjusted(current, &utime, &stime); - return nsecs_to_jiffies(utime + stime); + bch2_move_stats_exit(&r->scan_stats, trans->c); + return ret; } -static int bch2_rebalance_thread(void *arg) +static void rebalance_wait(struct bch_fs *c) { - struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct io_clock *clock = &c->io_clock[WRITE]; - struct rebalance_work w, p; - struct bch_move_stats move_stats; - unsigned long start, prev_start; - unsigned long prev_run_time, prev_run_cputime; - unsigned long cputime, prev_cputime; - u64 io_start; - long throttle; + u64 now = atomic64_read(&clock->now); + u64 min_member_capacity = bch2_min_rw_member_capacity(c); - set_freezable(); + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + r->wait_iotime_end = now + (min_member_capacity >> 6); - io_start = atomic64_read(&clock->now); - p = rebalance_work(c); - prev_start = jiffies; - prev_cputime = curr_cputime(); + if (r->state != BCH_REBALANCE_waiting) { + r->wait_iotime_start = now; + r->wait_wallclock_start = ktime_get_real_ns(); + r->state = BCH_REBALANCE_waiting; + } - bch2_move_stats_init(&move_stats, "rebalance"); - while (!kthread_wait_freezable(r->enabled)) { - cond_resched(); + bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); +} - start = jiffies; - cputime = curr_cputime(); +static int do_rebalance(struct moving_context *ctxt) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + struct btree_iter rebalance_work_iter, extent_iter = { NULL }; + struct bkey_s_c k; + int ret = 0; - prev_run_time = start - prev_start; - prev_run_cputime = cputime - prev_cputime; + bch2_move_stats_init(&r->work_stats, "rebalance_work"); + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - w = rebalance_work(c); - BUG_ON(!w.dev_most_full_capacity); + bch2_trans_iter_init(trans, &rebalance_work_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS); - if (!w.total_work) { - r->state = REBALANCE_WAITING; - kthread_wait_freezable(rebalance_work(c).total_work); + while (!bch2_move_ratelimit(ctxt) && + !kthread_wait_freezable(r->enabled)) { + bch2_trans_begin(trans); + + ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; - } + if (ret || !k.k) + break; - /* - * If there isn't much work to do, throttle cpu usage: - */ - throttle = prev_run_cputime * 100 / 
- max(1U, w.dev_most_full_percent) - - prev_run_time; - - if (w.dev_most_full_percent < 20 && throttle > 0) { - r->throttled_until_iotime = io_start + - div_u64(w.dev_most_full_capacity * - (20 - w.dev_most_full_percent), - 50); - - if (atomic64_read(&clock->now) + clock->max_slop < - r->throttled_until_iotime) { - r->throttled_until_cputime = start + throttle; - r->state = REBALANCE_THROTTLED; - - bch2_kthread_io_clock_wait(clock, - r->throttled_until_iotime, - throttle); - continue; - } - } + ret = k.k->type == KEY_TYPE_cookie + ? do_rebalance_scan(ctxt, k.k->p.inode, + le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) + : do_rebalance_extent(ctxt, k.k->p, &extent_iter); - /* minimum 1 mb/sec: */ - r->pd.rate.rate = - max_t(u64, 1 << 11, - r->pd.rate.rate * - max(p.dev_most_full_percent, 1U) / - max(w.dev_most_full_percent, 1U)); - - io_start = atomic64_read(&clock->now); - p = w; - prev_start = start; - prev_cputime = cputime; - - r->state = REBALANCE_RUNNING; - memset(&move_stats, 0, sizeof(move_stats)); - rebalance_work_reset(c); - - bch2_move_data(c, - 0, POS_MIN, - BTREE_ID_NR, POS_MAX, - /* ratelimiting disabled for now */ - NULL, /* &r->pd.rate, */ - &move_stats, - writepoint_ptr(&c->rebalance_write_point), - true, - rebalance_pred, NULL); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_btree_iter_advance(&rebalance_work_iter); } - return 0; + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_work_iter); + bch2_move_stats_exit(&r->scan_stats, c); + + if (!ret && + !kthread_should_stop() && + !atomic64_read(&r->work_stats.sectors_seen) && + !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_trans_unlock_long(trans); + rebalance_wait(c); + } + + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); + return ret; } -void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) +static int bch2_rebalance_thread(void *arg) { + struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; - struct rebalance_work w = rebalance_work(c); + struct moving_context ctxt; + int ret; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); + set_freezable(); - prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); - prt_tab(out); + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, + writepoint_ptr(&c->rebalance_write_point), + true); - prt_human_readable_u64(out, w.dev_most_full_work << 9); - prt_printf(out, "/"); - prt_human_readable_u64(out, w.dev_most_full_capacity << 9); - prt_newline(out); + while (!kthread_should_stop() && + !(ret = do_rebalance(&ctxt))) + ; - prt_printf(out, "total work:"); - prt_tab(out); + bch2_moving_ctxt_exit(&ctxt); - prt_human_readable_u64(out, w.total_work << 9); - prt_printf(out, "/"); - prt_human_readable_u64(out, c->capacity << 9); - prt_newline(out); + return 0; +} + +void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; - prt_printf(out, "rate:"); - prt_tab(out); - prt_printf(out, "%u", r->pd.rate.rate); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); + printbuf_indent_add(out, 2); switch (r->state) { - case REBALANCE_WAITING: - prt_printf(out, "waiting"); + case BCH_REBALANCE_waiting: { + u64 now = atomic64_read(&c->io_clock[WRITE].now); + + prt_str(out, "io wait duration: "); + bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + prt_newline(out); + + prt_str(out, "io wait remaining: "); + bch2_prt_human_readable_s64(out, 
r->wait_iotime_end - now); + prt_newline(out); + + prt_str(out, "duration waited: "); + bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); + prt_newline(out); break; - case REBALANCE_THROTTLED: - prt_printf(out, "throttled for %lu sec or ", - (r->throttled_until_cputime - jiffies) / HZ); - prt_human_readable_u64(out, - (r->throttled_until_iotime - - atomic64_read(&c->io_clock[WRITE].now)) << 9); - prt_printf(out, " io"); + } + case BCH_REBALANCE_working: + bch2_move_stats_to_text(out, &r->work_stats); break; - case REBALANCE_RUNNING: - prt_printf(out, "running"); + case BCH_REBALANCE_scanning: + bch2_move_stats_to_text(out, &r->scan_stats); break; } prt_newline(out); + printbuf_indent_sub(out, 2); } void bch2_rebalance_stop(struct bch_fs *c) @@ -346,7 +448,7 @@ int bch2_rebalance_start(struct bch_fs *c) p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ret = PTR_ERR_OR_ZERO(p); if (ret) { - bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating rebalance thread"); return ret; } @@ -359,6 +461,4 @@ int bch2_rebalance_start(struct bch_fs *c) void bch2_fs_rebalance_init(struct bch_fs *c) { bch2_pd_controller_init(&c->rebalance.pd); - - atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); } diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h index 7ade0bb..28a5263 100644 --- a/libbcachefs/rebalance.h +++ b/libbcachefs/rebalance.h @@ -4,6 +4,9 @@ #include "rebalance_types.h" +int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); +int bch2_set_fs_needs_rebalance(struct bch_fs *); + static inline void rebalance_wakeup(struct bch_fs *c) { struct task_struct *p; @@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c) rcu_read_unlock(); } -void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, - struct bch_io_opts *); -void bch2_rebalance_add_work(struct bch_fs *, u64); - -void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); +void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h index 7462a92..0fffb53 100644 --- a/libbcachefs/rebalance_types.h +++ b/libbcachefs/rebalance_types.h @@ -2,25 +2,36 @@ #ifndef _BCACHEFS_REBALANCE_TYPES_H #define _BCACHEFS_REBALANCE_TYPES_H +#include "bbpos_types.h" #include "move_types.h" -enum rebalance_state { - REBALANCE_WAITING, - REBALANCE_THROTTLED, - REBALANCE_RUNNING, +#define BCH_REBALANCE_STATES() \ + x(waiting) \ + x(working) \ + x(scanning) + +enum bch_rebalance_states { +#define x(t) BCH_REBALANCE_##t, + BCH_REBALANCE_STATES() +#undef x }; struct bch_fs_rebalance { - struct task_struct __rcu *thread; + struct task_struct __rcu *thread; struct bch_pd_controller pd; - atomic64_t work_unknown_dev; + enum bch_rebalance_states state; + u64 wait_iotime_start; + u64 wait_iotime_end; + u64 wait_wallclock_start; + + struct bch_move_stats work_stats; - enum rebalance_state state; - u64 throttled_until_iotime; - unsigned long throttled_until_cputime; + struct bbpos scan_start; + struct bbpos scan_end; + struct bch_move_stats scan_stats; - unsigned enabled:1; + unsigned enabled:1; }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 55a233c..83fc121 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -5,6 +5,7 @@ #include "bkey_buf.h" #include "alloc_background.h" 
#include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -19,10 +20,14 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "lru.h" +#include "logged_ops.h" #include "move.h" #include "quota.h" +#include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "sb-clean.h" +#include "snapshot.h" #include "subvolume.h" #include "super-io.h" @@ -31,13 +36,27 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + /* for -o reconstruct_alloc: */ static void drop_alloc_keys(struct journal_keys *keys) { size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (keys->d[src].btree_id != BTREE_ID_alloc) + if (!btree_id_is_alloc(keys->d[src].btree_id)) keys->d[dst++] = keys->d[src]; keys->nr = dst; @@ -57,524 +76,6 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } -/* iterate over keys read from the journal: */ - -static int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); -} - -static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - -static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -{ - size_t gap_size = keys->size - keys->nr; - - if (idx >= keys->gap) - idx += gap_size; - return idx; -} - -static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -{ - return keys->d + idx_to_pos(keys, idx); -} - -static size_t __bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - size_t l = 0, r = keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); - - BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - - return l; -} - -static size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -} - -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; -search: - if (!*idx) - *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; - - if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && - !k->overwritten) - return k->k; - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - return NULL; -} - -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) -{ - size_t idx = 0; - - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -} - -static void journal_iters_fix(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - /* The key we just inserted is immediately before the gap: */ - size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; - - /* - * If an iterator points one after the key we just inserted, decrement - * the iterator so it points at the key we just inserted - if the - * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will - * handle that: - */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; -} - -static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -{ - struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; - size_t gap_size = keys->size - keys->nr; - - list_for_each_entry(iter, &c->journal_iters, list) { - if (iter->idx > old_gap) - iter->idx -= gap_size; - if (iter->idx >= new_gap) - iter->idx += gap_size; - } -} - -int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct journal_key n = { - .btree_id = id, - .level = level, - .k = k, - .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U32_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); - - if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; - return 0; - } - - if (idx > keys->gap) - idx -= keys->size - keys->nr; - - if (keys->nr == keys->size) { - struct journal_keys new_keys = { - .nr = keys->nr, - .size = max_t(size_t, keys->size, 8) * 2, - }; - - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; - } - - /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - *keys = new_keys; - - /* And now the gap is at the end: */ - keys->gap = keys->nr; - } - - journal_iters_move_gap(c, keys->gap, idx); - - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; - - keys->nr++; - keys->d[keys->gap++] = n; - - journal_iters_fix(c); - - return 0; -} - -/* - * Can only be used from the recovery thread while we're still RO - can't be - * used once we've got RW, as journal_keys is at that point used by multiple - * threads: - */ -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct bkey_i *n; - int ret; - - n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; - - bkey_copy(n, k); - ret = bch2_journal_key_insert_take(c, id, level, 
n); - if (ret) - kfree(n); - return ret; -} - -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) -{ - struct bkey_i whiteout; - - bkey_init(&whiteout.k); - whiteout.k.p = pos; - - return bch2_journal_key_insert(c, id, level, &whiteout); -} - -void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (idx < keys->size && - keys->d[idx].btree_id == btree && - keys->d[idx].level == level && - bpos_eq(keys->d[idx].k->k.p, pos)) - keys->d[idx].overwritten = true; -} - -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->size) { - iter->idx++; - if (iter->idx == iter->keys->gap) - iter->idx += iter->keys->size - iter->keys->nr; - } -} - -static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -{ - struct journal_key *k = iter->keys->d + iter->idx; - - while (k < iter->keys->d + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { - if (!k->overwritten) - return bkey_i_to_s_c(k->k); - - bch2_journal_iter_advance(iter); - k = iter->keys->d + iter->idx; - } - - return bkey_s_c_null; -} - -static void bch2_journal_iter_exit(struct journal_iter *iter) -{ - list_del(&iter->list); -} - -static void bch2_journal_iter_init(struct bch_fs *c, - struct journal_iter *iter, - enum btree_id id, unsigned level, - struct bpos pos) -{ - iter->btree_id = id; - iter->level = level; - iter->keys = &c->journal_keys; - iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); -} - -static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -{ - return bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); -} - -static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -{ - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -} - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -{ - if (bpos_eq(iter->pos, SPOS_MAX)) - iter->at_end = true; - else - iter->pos = bpos_successor(iter->pos); -} - -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -{ - struct bkey_s_c btree_k, journal_k, ret; -again: - if (iter->at_end) - return bkey_s_c_null; - - while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_lt(btree_k.k->p, iter->pos)) - bch2_journal_iter_advance_btree(iter); - - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); - - ret = journal_k.k && - (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) - ? 
journal_k - : btree_k; - - if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) - ret = bkey_s_c_null; - - if (ret.k) { - iter->pos = ret.k->p; - if (bkey_deleted(ret.k)) { - bch2_btree_and_journal_iter_advance(iter); - goto again; - } - } else { - iter->pos = SPOS_MAX; - iter->at_end = true; - } - - return ret; -} - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -{ - bch2_journal_iter_exit(&iter->journal); -} - -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b, - struct btree_node_iter node_iter, - struct bpos pos) -{ - memset(iter, 0, sizeof(*iter)); - - iter->b = b; - iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); - iter->pos = b->data->min_key; - iter->at_end = false; -} - -/* - * this version is used by btree_gc before filesystem has gone RW and - * multithreaded, so uses the journal_iters list: - */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) -{ - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); -} - -/* sort and dedup all keys in the journal: */ - -void bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); - genradix_free(&c->journal_entries); -} - -/* - * When keys compare equal, oldest compares first: - */ -static int journal_sort_key_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = _l; - const struct journal_key *r = _r; - - return journal_key_cmp(l, r) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); -} - -void bch2_journal_keys_free(struct journal_keys *keys) -{ - struct journal_key *i; - - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - for (i = keys->d; i < keys->d + keys->nr; i++) - if (i->allocated) - kfree(i->k); - - kvfree(keys->d); - keys->d = NULL; - keys->nr = keys->gap = keys->size = 0; -} - -static void __journal_keys_sort(struct journal_keys *keys) -{ - struct journal_key *src, *dst; - - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) - src++; - - *dst++ = *src++; - } - - keys->nr = dst - keys->d; -} - -static int journal_keys_sort(struct bch_fs *c) -{ - struct genradix_iter iter; - struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; - struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - 
nr_keys); - - do { - keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); - - if (!keys->d) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - cond_resched(); - - for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->d[keys->nr++] = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - nr_read++; - } - } - - __journal_keys_sort(keys); - keys->gap = keys->nr; - - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); - return 0; -} - /* journal replay: */ static void replay_now_at(struct journal *j, u64 seq) @@ -597,6 +98,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + if (k->overwritten) + return 0; + + trans->journal_res.seq = k->journal_seq; + /* * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing @@ -638,27 +144,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) static int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key **keys_sorted, *k; + DARRAY(struct journal_key *) keys_sorted = { 0 }; + struct journal_key **kp; struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - size_t i; + struct btree_trans *trans = bch2_trans_get(c); int ret; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); - if (!keys_sorted) - return -BCH_ERR_ENOMEM_journal_replay; - - for (i = 0; i < keys->nr; i++) - keys_sorted[i] = &keys->d[i]; - - sort(keys_sorted, keys->nr, - sizeof(keys_sorted[0]), - journal_sort_seq_cmp, NULL); - if (keys->nr) { ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", keys->nr, start_seq, end_seq); @@ -666,27 +159,65 @@ static int bch2_journal_replay(struct bch_fs *c) goto err; } - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; + /* + * First, attempt to replay keys in sorted order. This is more + * efficient - better locality of btree access - but some might fail if + * that would cause a journal deadlock. + */ + for (size_t i = 0; i < keys->nr; i++) { + cond_resched(); + + struct journal_key *k = keys->d + i; + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + (!k->allocated ? 
BCH_TRANS_COMMIT_no_journal_res : 0), + bch2_journal_replay_key(trans, k)); + BUG_ON(!ret && !k->overwritten); + if (ret) { + ret = darray_push(&keys_sorted, k); + if (ret) + goto err; + } + } + + /* + * Now, replay any remaining keys in the order in which they appear in + * the journal, unpinning those journal entries as we go: + */ + sort(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); + + darray_for_each(keys_sorted, kp) { cond_resched(); + struct journal_key *k = *kp; + replay_now_at(j, k->journal_seq); - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - (!k->allocated - ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim - : 0), - bch2_journal_replay_key(&trans, k)); - if (ret) { - bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", - bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + (!k->allocated + ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim + : 0), + bch2_journal_replay_key(trans, k)); + bch_err_msg(c, ret, "while replaying key at btree %s level %u:", + bch2_btree_id_str(k->btree_id), k->level); + if (ret) goto err; - } + + BUG_ON(!k->overwritten); } + /* + * We need to put our btree_trans before calling flush_all_pins(), since + * that will use a btree_trans internally + */ + bch2_trans_put(trans); + trans = NULL; + replay_now_at(j, j->replay_journal_seq_end); j->replay_journal_seq = 0; @@ -697,10 +228,10 @@ static int bch2_journal_replay(struct bch_fs *c) if (keys->nr && !ret) bch2_journal_log_msg(c, "journal replay finished"); err: - kvfree(keys_sorted); - - if (ret) - bch_err_fn(c, ret); + if (trans) + bch2_trans_put(trans); + darray_exit(&keys_sorted); + bch_err_fn(c, ret); return ret; } @@ -725,7 +256,7 @@ static int journal_replay_entry_early(struct bch_fs *c, if (entry->u64s) { r->level = entry->level; - bkey_copy(&r->key, &entry->start[0]); + bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { r->error = -EIO; @@ -846,148 +377,6 @@ static int journal_replay_early(struct bch_fs *c, /* sb clean section: */ -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); - } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; -} - -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; - } - - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; - - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); - - if (!k1 && !k2) - continue; 
- - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - if (k1) - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); - else - prt_printf(&buf1, "(none)"); - - if (k2) - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); - else - prt_printf(&buf2, "(none)"); - - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(&k1->k)) || - l1 != l2, c, - "superblock btree root %u doesn't match journal after clean shutdown\n" - "sb: l=%u %s\n" - "journal: l=%u %s\n", i, - l1, buf1.buf, - l2, buf2.buf); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - - if (fsck_err_on(!sb_clean, c, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - mutex_unlock(&c->sb_lock); - return NULL; - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } - - ret = bch2_sb_clean_validate_late(c, clean, READ); - if (ret) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); - } - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} - -static bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - return true; - default: - return false; - } -} - static int read_btree_roots(struct bch_fs *c) { unsigned i; @@ -1006,23 +395,25 @@ static int read_btree_roots(struct bch_fs *c) } if (r->error) { - __fsck_err(c, btree_id_is_alloc(i) + __fsck_err(c, + btree_id_is_alloc(i) ? FSCK_CAN_IGNORE : 0, + btree_root_bkey_invalid, "invalid btree root %s", - bch2_btree_ids[i]); + bch2_btree_id_str(i)); if (i == BTREE_ID_alloc) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { - __fsck_err(c, - btree_id_is_alloc(i) - ? 
			   FSCK_CAN_IGNORE : 0,
-				   "error reading btree root %s",
-				   bch2_btree_ids[i]);
+			fsck_err(c,
+				 btree_root_read_error,
+				 "error reading btree root %s",
+				 bch2_btree_id_str(i));
 			if (btree_id_is_alloc(i))
 				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+			ret = 0;
 		}
 	}
 
@@ -1065,15 +456,9 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
 	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
 	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);
 
-	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,
-				  &root_tree.k_i,
-				  NULL, NULL, 0) ?:
-		bch2_btree_insert(c, BTREE_ID_snapshots,
-				  &root_snapshot.k_i,
-				  NULL, NULL, 0) ?:
-		bch2_btree_insert(c, BTREE_ID_subvolumes,
-				  &root_volume.k_i,
-				  NULL, NULL, 0);
+	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -1113,16 +498,57 @@ err:
 
 noinline_for_stack
 static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
 {
-	int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-				__bch2_fs_upgrade_for_subvolumes(&trans));
+	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+				__bch2_fs_upgrade_for_subvolumes(trans));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
 }
 
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, _when)	#_fn,
+	BCH_RECOVERY_PASSES()
+#undef x
+	NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+	return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+	struct journal_keys *keys = &c->journal_keys;
+
+	/*
+	 * After we go RW, the journal keys buffer can't be modified (except for
+	 * setting journal_key->overwritten): it will be accessed by multiple
+	 * threads
+	 */
+	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+	keys->gap = keys->nr;
+
+	set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+	if (keys->nr)
+		return bch2_fs_read_write_early(c);
+	return 0;
+}
+
+struct recovery_pass_fn {
+	int		(*fn)(struct bch_fs *);
+	unsigned	when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _when)	{ .fn = bch2_##_fn, .when = _when },
+	BCH_RECOVERY_PASSES()
+#undef x
+};
+
 static void check_version_upgrade(struct bch_fs *c)
 {
-	unsigned latest_compatible = bch2_version_compatible(c->sb.version);
+	unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
 	unsigned latest_version	= bcachefs_metadata_version_current;
 	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
 	unsigned new_version = 0;
@@ -1172,7 +598,12 @@ static void check_version_upgrade(struct bch_fs *c)
 		recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
 
 		if (recovery_passes) {
-			prt_str(&buf, "fsck required");
+			if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+				prt_str(&buf, "fsck required");
+			else {
+				prt_str(&buf, "running recovery passes: ");
+				prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+			}
 
 			c->recovery_passes_explicit |= recovery_passes;
 			c->opts.fix_errors = FSCK_FIX_yes;
@@ -1188,42 +619,19 @@ static void check_version_upgrade(struct bch_fs *c)
 	}
 }
 
-static int bch2_check_allocations(struct bch_fs *c)
-{
-	return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
-	set_bit(BCH_FS_MAY_GO_RW, &c->flags);
-	return 0;
-}
-
-struct recovery_pass_fn {
-	int		(*fn)(struct bch_fs *);
-	const char	*name;
-	unsigned	when;
-}; - -static struct recovery_pass_fn recovery_passes[] = { -#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) - if (recovery_passes[i].when & PASS_FSCK) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) ret |= BIT_ULL(i); return ret; } static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -1245,15 +653,18 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) c->curr_recovery_pass = pass; if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_passes + pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); + printk(KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); ret = p->fn(c); if (ret) return ret; if (!(p->when & PASS_SILENT)) printk(KERN_CONT " done\n"); + + c->recovery_passes_complete |= BIT_ULL(pass); } return 0; @@ -1263,7 +674,7 @@ static int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) continue; @@ -1279,21 +690,21 @@ int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; - u64 last_seq, blacklist_seq, journal_seq; + u64 last_seq = 0, blacklist_seq, journal_seq; bool write_sb = false; int ret = 0; - if (c->sb.clean) - clean = read_superblock_clean(c); - ret = PTR_ERR_OR_ZERO(clean); - if (ret) - goto err; + if (c->sb.clean) { + clean = bch2_read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; - if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - else + } else { bch_info(c, "recovering from unclean shutdown"); + } if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -1308,12 +719,6 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); - ret = -EINVAL; - goto err; - } - if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) check_version_upgrade(c); @@ -1354,6 +759,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (mustfix_fsck_err_on(c->sb.clean && last_journal_entry && !journal_entry_empty(last_journal_entry), c, + clean_but_journal_not_empty, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); @@ -1361,7 +767,9 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!last_journal_entry) { - fsck_err_on(!c->sb.clean, c, "no journal entries found"); + fsck_err_on(!c->sb.clean, c, + dirty_but_no_journal_entries, + "no journal entries found"); if (clean) goto use_clean; @@ 
-1369,16 +777,23 @@ int bch2_fs_recovery(struct bch_fs *c) if (*i) { last_journal_entry = &(*i)->j; (*i)->ignore = false; + /* + * This was probably a NO_FLUSH entry, + * so last_seq was garbage - but we know + * we're only using a single journal + * entry, set it here: + */ + (*i)->j.last_seq = (*i)->j.seq; break; } } - ret = journal_keys_sort(c); + ret = bch2_journal_keys_sort(c); if (ret) goto err; if (c->sb.clean && last_journal_entry) { - ret = verify_superblock_clean(c, &clean, + ret = bch2_verify_superblock_clean(c, &clean, last_journal_entry); if (ret) goto err; @@ -1395,7 +810,7 @@ use_clean: } c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1;; + c->journal_replay_seq_end = blacklist_seq - 1; if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); @@ -1463,6 +878,8 @@ use_clean: test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && !test_bit(BCH_FS_ERROR, &c->flags)) { + bch2_flush_fsck_errs(c); + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); @@ -1513,7 +930,6 @@ use_clean: mutex_unlock(&c->sb_lock); if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; @@ -1543,7 +959,7 @@ out: } kfree(clean); - if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { + if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { bch2_fs_read_write_early(c); bch2_delete_dead_snapshots_async(c); } @@ -1581,23 +997,19 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); + c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - for_each_online_member(ca, c, i) + for_each_member_device(ca, c, i) bch2_dev_usage_init(ca); - for_each_online_member(ca, c, i) { - ret = bch2_dev_journal_alloc(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - goto err; - } - } + ret = bch2_fs_journal_alloc(c); + if (ret) + goto err; /* * journal_res_get() will crash if called before this has @@ -1615,15 +1027,13 @@ int bch2_fs_initialize(struct bch_fs *c) * btree updates */ bch_verbose(c, "marking superblocks"); - for_each_member_device(ca, c, i) { - ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } + ret = bch2_trans_mark_dev_sbs(c); + bch_err_msg(c, ret, "marking superblocks"); + if (ret) + goto err; + for_each_online_member(ca, c, i) ca->new_fs_bucket_idx = 0; - } ret = bch2_fs_freespace_init(c); if (ret) @@ -1645,9 +1055,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_inodes, - &packed_inode.inode.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "creating root directory"); goto err; @@ -1656,7 +1064,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, + bch2_create_trans(trans, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, diff --git 
a/libbcachefs/recovery.h b/libbcachefs/recovery.h index f8e796c..852d305 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -2,55 +2,28 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -struct journal_iter { - struct list_head list; - enum btree_id btree_id; - unsigned level; - size_t idx; - struct journal_keys *keys; -}; +extern const char * const bch2_recovery_passes[]; /* - * Iterate over keys in the btree, with keys from the journal overlaid on top: + * For when we need to rewind recovery passes and run a pass we skipped: */ - -struct btree_and_journal_iter { - struct btree *b; - struct btree_node_iter node_iter; - struct bkey unpacked; - - struct journal_iter journal; - struct bpos pos; - bool at_end; -}; - -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_insert(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_delete(struct bch_fs *, enum btree_id, - unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, - struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); - -void bch2_journal_keys_free(struct journal_keys *); -void bch2_journal_entries_free(struct bch_fs *); +static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} u64 bch2_fsck_recovery_passes(void); diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h index abf1f83..515e3d6 100644 --- a/libbcachefs/recovery_types.h +++ b/libbcachefs/recovery_types.h @@ -14,6 +14,8 @@ x(snapshots_read, PASS_ALWAYS) \ x(check_topology, 0) \ x(check_allocations, PASS_FSCK) \ + x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, PASS_ALWAYS) \ x(check_alloc_info, PASS_FSCK) \ @@ -27,10 +29,12 @@ x(check_snapshot_trees, PASS_FSCK) \ x(check_snapshots, PASS_FSCK) \ x(check_subvols, PASS_FSCK) \ - x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ + x(delete_dead_snapshots, PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 0) \ + x(resume_logged_ops, PASS_ALWAYS) \ x(check_inodes, PASS_FSCK) \ x(check_extents, PASS_FSCK) \ + x(check_indirect_extents, PASS_FSCK) \ x(check_dirents, PASS_FSCK) \ x(check_xattrs, PASS_FSCK) \ x(check_root, PASS_FSCK) \ @@ -38,6 +42,7 @@ x(check_nlinks, PASS_FSCK) \ 
 	x(delete_dead_inodes,		PASS_FSCK|PASS_UNCLEAN)		\
 	x(fix_reflink_p,		0)				\
+	x(set_fs_needs_rebalance,	0)				\
 
 enum bch_recovery_pass {
 #define x(n, when)	BCH_RECOVERY_PASS_##n,
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 39f711d..07ddf3e 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -5,9 +5,12 @@
 #include "buckets.h"
 #include "extents.h"
 #include "inode.h"
-#include "io.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "rebalance.h"
 #include "reflink.h"
 #include "subvolume.h"
+#include "super-io.h"
 
 #include 

@@ -25,7 +28,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
 
 /* reflink pointers */
 
-int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
 			   enum bkey_invalid_flags flags,
 			   struct printbuf *err)
 {
@@ -72,7 +75,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
 /* indirect extents */
 
-int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
 			   enum bkey_invalid_flags flags,
 			   struct printbuf *err)
 {
@@ -89,6 +92,9 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
 	bch2_bkey_ptrs_to_text(out, c, k);
 }
 
+#if 0
+Currently disabled, needs to be debugged:
+
 bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
 {
 	struct bkey_s_reflink_v   l = bkey_s_to_reflink_v(_l);
@@ -96,29 +102,31 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
 	return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
 }
+#endif
+
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+	if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+		new->k.type	= KEY_TYPE_deleted;
+		new->k.size	= 0;
+		set_bkey_val_u64s(&new->k, 0);
+		*flags &= ~BTREE_TRIGGER_INSERT;
+	}
+}
 
 int bch2_trans_mark_reflink_v(struct btree_trans *trans,
 			      enum btree_id btree_id, unsigned level,
 			      struct bkey_s_c old, struct bkey_i *new,
 			      unsigned flags)
 {
-	if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
-		struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
-
-		if (!r->v.refcount) {
-			r->k.type	= KEY_TYPE_deleted;
-			r->k.size	= 0;
-			set_bkey_val_u64s(&r->k, 0);
-			return 0;
-		}
-	}
+	check_indirect_extent_deleting(new, &flags);
 
 	return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
 }
 
 /* indirect inline data */
 
-int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
 				      enum bkey_invalid_flags flags,
 				      struct printbuf *err)
 {
@@ -126,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 void bch2_indirect_inline_data_to_text(struct printbuf *out,
-					struct bch_fs *c, struct bkey_s_c k)
+				       struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
 	unsigned datalen = bkey_inline_data_bytes(k.k);
@@ -141,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
 			      struct bkey_s_c old, struct bkey_i *new,
 			      unsigned flags)
 {
-	if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
-		struct bkey_i_indirect_inline_data *r =
-			bkey_i_to_indirect_inline_data(new);
-
-		if (!r->v.refcount) {
-			r->k.type	= KEY_TYPE_deleted;
-			r->k.size	= 0;
-			set_bkey_val_u64s(&r->k, 0);
-		}
-	}
+	check_indirect_extent_deleting(new, &flags);
 
 	return 0;
 }
 
@@ -247,15 +246,16 @@ s64 
bch2_remap_range(struct bch_fs *c, u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; struct bpos dst_start = POS(dst_inum.inum, dst_offset); struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; + struct bch_io_opts opts; struct bpos src_want; - u64 dst_done; + u64 dst_done = 0; u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; @@ -269,11 +269,15 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_dst); bch2_bkey_buf_init(&new_src); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + trans = bch2_trans_get(c); - bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + ret = bch2_inum_opts_get(trans, src_inum, &opts); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, BTREE_ITER_INTENT); - bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); while ((ret == 0 || @@ -281,21 +285,21 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_lt(dst_iter.pos, dst_end)) { struct disk_reservation disk_res = { 0 }; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); if (fatal_signal_pending(current)) { ret = -EINTR; break; } - ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, &src_snapshot); if (ret) continue; bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); - ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, &dst_snapshot); if (ret) continue; @@ -312,7 +316,7 @@ s64 bch2_remap_range(struct bch_fs *c, continue; if (bkey_lt(src_want, src_iter.pos)) { - ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, min(dst_end.offset, dst_iter.pos.offset + src_iter.pos.offset - src_want.offset), @@ -326,7 +330,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - ret = bch2_make_extent_indirect(&trans, &src_iter, + ret = bch2_make_extent_indirect(trans, &src_iter, new_src.k); if (ret) continue; @@ -354,14 +358,17 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_extent_update(&trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, + opts.background_target, + opts.background_compression) ?: + bch2_extent_update(trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, i_sectors_delta, + true); bch2_disk_reservation_put(c, &disk_res); } - bch2_trans_iter_exit(&trans, &dst_iter); - bch2_trans_iter_exit(&trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + bch2_trans_iter_exit(trans, &src_iter); BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); BUG_ON(bkey_gt(dst_iter.pos, dst_end)); @@ -373,23 +380,23 @@ s64 bch2_remap_range(struct bch_fs *c, struct bch_inode_unpacked inode_u; struct btree_iter inode_iter = { NULL }; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { inode_u.bi_size = 
new_i_size; - ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); } - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); - - bch2_trans_exit(&trans); +err: + bch2_trans_put(trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index fe52538..8ccf3f9 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -4,7 +4,7 @@ enum bkey_invalid_flags; -int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -19,7 +19,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -35,7 +35,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 8, \ }) -int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 5b591c5..1c3ae13 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -429,7 +429,7 @@ out: return ret; err: - bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "adding replicas entry"); goto out; } @@ -462,18 +462,13 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) { lockdep_assert_held(&c->replicas_gc_lock); - if (ret) - goto err; - mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); - ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (ret) - goto err; + ret = ret ?: + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: + replicas_table_update(c, &c->replicas_gc); - ret = replicas_table_update(c, &c->replicas_gc); -err: kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; @@ -579,12 +574,9 @@ retry: bch2_cpu_replicas_sort(&new); - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - if (ret) - goto err; + ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: + replicas_table_update(c, &new); - ret = replicas_table_update(c, &new); -err: kfree(new.entries); percpu_up_write(&c->mark_lock); @@ -700,9 +692,9 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) struct bch_replicas_cpu new_r = { 0, 0, NULL }; int ret = 0; - if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) + if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); - else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) + else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); if (ret) return ret; @@ -732,13 +724,13 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, 
for_each_cpu_replicas_entry(r, src) bytes += replicas_entry_bytes(src) - 1; - sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); - sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - @@ -777,13 +769,13 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, if (!need_v1) return bch2_cpu_replicas_to_sb_replicas_v0(c, r); - sb_r = bch2_sb_resize_replicas(&c->disk_sb, + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); - sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - @@ -805,7 +797,6 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_sb *sb, struct printbuf *err) { - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); unsigned i, j; sort_cmp_size(cpu_r->entries, @@ -837,7 +828,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } for (j = 0; j < e->nr_devs; j++) - if (!bch2_dev_exists(sb, mi, e->devs[j])) { + if (!bch2_dev_exists(sb, e->devs[j])) { prt_printf(err, "invalid device %u in entry ", e->devs[j]); bch2_replicas_entry_to_text(err, e); return -BCH_ERR_invalid_sb_replicas; @@ -999,8 +990,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) struct bch_sb_field_replicas_v0 *replicas_v0; unsigned i, data_has = 0; - replicas = bch2_sb_get_replicas(sb); - replicas_v0 = bch2_sb_get_replicas_v0(sb); + replicas = bch2_sb_field_get(sb, replicas); + replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { struct bch_replicas_entry *r; diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index ae21a8c..89fdb7c 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -15,6 +15,16 @@ #include #include +typedef unsigned __bitwise bch_str_hash_flags_t; + +enum bch_str_hash_flags { + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) + static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - int flags, + bch_str_hash_flags_t str_hash_flags, int update_flags) { struct btree_iter iter, slot = { NULL }; @@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, } if (!slot.path && - !(flags & BCH_HASH_SET_MUST_REPLACE)) + !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -287,16 +297,16 @@ found: found = true; not_found: - if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { if (!found 
&& slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, 0); + ret = bch2_trans_update(trans, &iter, insert, update_flags); } goto out; @@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, - struct bkey_i *insert, int flags) + struct bkey_i *insert, + bch_str_hash_flags_t str_hash_flags) { u32 snapshot; int ret; @@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans, insert->k.p.inode = inum.inum; return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, flags, 0); + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 736afb6..1cbf9e3 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -6,866 +6,13 @@ #include "errcode.h" #include "error.h" #include "fs.h" +#include "snapshot.h" #include "subvolume.h" #include static int bch2_subvolume_delete(struct btree_trans *, u32); -static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -{ - const struct snapshot_t *s = __snapshot_t(t, id); - - if (s->skip[2] <= ancestor) - return s->skip[2]; - if (s->skip[1] <= ancestor) - return s->skip[1]; - if (s->skip[0] <= ancestor) - return s->skip[0]; - return s->parent; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - struct snapshot_table *t; - bool ret; - - EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); - - rcu_read_lock(); - t = rcu_dereference(c->snapshots); - - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); - - ret = id && id < ancestor - ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) - : id == ancestor; - rcu_read_unlock(); - - return ret; -} - -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -{ - struct snapshot_table *t; - - rcu_read_lock(); - t = rcu_dereference(c->snapshots); - - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; - rcu_read_unlock(); - - return id == ancestor; -} - -static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -{ - u32 depth; - - rcu_read_lock(); - depth = parent ? 
snapshot_t(c, parent)->depth + 1 : 0; - rcu_read_unlock(); - - return depth; -} - -static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - size_t new_size; - struct snapshot_table *new, *old; - - new_size = max(16UL, roundup_pow_of_two(idx + 1)); - - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); - if (!new) - return NULL; - - old = rcu_dereference_protected(c->snapshots, true); - if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); - - rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - if (old) - kvfree_rcu(old); - - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -} - -static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - - lockdep_assert_held(&c->snapshot_table_lock); - - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; - - return __snapshot_t_mut(c, id); -} - -/* Snapshot tree: */ - -void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); - - prt_printf(out, "subvol %u root snapshot %u", - le32_to_cpu(t.v->master_subvol), - le32_to_cpu(t.v->root_snapshot)); -} - -int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) -{ - if (bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1))) { - prt_printf(err, "bad pos"); - return -BCH_ERR_invalid_bkey; - } - - return 0; -} - -int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot_tree *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); - - if (bch2_err_matches(ret, ENOENT)) - ret = -BCH_ERR_ENOENT_snapshot_tree; - return ret; -} - -static struct bkey_i_snapshot_tree * -__snapshot_tree_create(struct btree_trans *trans) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, U32_MAX)); - struct bkey_i_snapshot_tree *s_t; - - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_snapshot_tree; - if (ret) - return ERR_PTR(ret); - - s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - bch2_trans_iter_exit(trans, &iter); - return ret ? 
ERR_PTR(ret) : s_t; -} - -static int snapshot_tree_create(struct btree_trans *trans, - u32 root_id, u32 subvol_id, u32 *tree_id) -{ - struct bkey_i_snapshot_tree *n_tree = - __snapshot_tree_create(trans); - - if (IS_ERR(n_tree)) - return PTR_ERR(n_tree); - - n_tree->v.master_subvol = cpu_to_le32(subvol_id); - n_tree->v.root_snapshot = cpu_to_le32(root_id); - *tree_id = n_tree->k.p.offset; - return 0; -} - -/* Snapshot nodes: */ - -void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), - le32_to_cpu(s.v->parent), - le32_to_cpu(s.v->children[0]), - le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol), - le32_to_cpu(s.v->tree)); - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) - prt_printf(out, " depth %u skiplist %u %u %u", - le32_to_cpu(s.v->depth), - le32_to_cpu(s.v->skip[0]), - le32_to_cpu(s.v->skip[1]), - le32_to_cpu(s.v->skip[2])); -} - -int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) -{ - struct bkey_s_c_snapshot s; - u32 i, id; - - if (bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1))) { - prt_printf(err, "bad pos"); - return -BCH_ERR_invalid_bkey; - } - - s = bkey_s_c_to_snapshot(k); - - id = le32_to_cpu(s.v->parent); - if (id && id <= k.k->p.offset) { - prt_printf(err, "bad parent node (%u <= %llu)", - id, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } - - if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { - prt_printf(err, "children not normalized"); - return -BCH_ERR_invalid_bkey; - } - - if (s.v->children[0] && - s.v->children[0] == s.v->children[1]) { - prt_printf(err, "duplicate child nodes"); - return -BCH_ERR_invalid_bkey; - } - - for (i = 0; i < 2; i++) { - id = le32_to_cpu(s.v->children[i]); - - if (id >= k.k->p.offset) { - prt_printf(err, "bad child node (%u >= %llu)", - id, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } - } - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { - if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { - prt_printf(err, "skiplist not normalized"); - return -BCH_ERR_invalid_bkey; - } - - for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { - id = le32_to_cpu(s.v->skip[i]); - - if (!id != !s.v->parent || - (s.v->parent && - id <= k.k->p.offset)) { - prt_printf(err, "bad skiplist node %u)", id); - return -BCH_ERR_invalid_bkey; - } - } - } - - return 0; -} - -int bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct snapshot_t *t; - u32 id = new.k->p.offset; - int ret = 0; - - mutex_lock(&c->snapshot_table_lock); - - t = snapshot_t_mut(c, id); - if (!t) { - ret = -BCH_ERR_ENOMEM_mark_snapshot; - goto err; - } - - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - u32 parent = id; - - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); - t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; - t->tree = le32_to_cpu(s.v->tree); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { - t->depth = le32_to_cpu(s.v->depth); - t->skip[0] = le32_to_cpu(s.v->skip[0]); - t->skip[1] = le32_to_cpu(s.v->skip[1]); - t->skip[2] = le32_to_cpu(s.v->skip[2]); - } else { - t->depth = 0; - t->skip[0] = 0; - t->skip[1] = 0; - t->skip[2] = 0; - } - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); - - if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); - } - } else { - memset(t, 0, sizeof(*t)); - } -err: - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -static int snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s) -{ - return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot, s); -} - -static int snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? 
snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; -} - -/* fsck: */ - -static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -{ - return snapshot_t(c, id)->children[child]; -} - -static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 0); -} - -static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 1); -} - -static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -{ - u32 n, parent; - - n = bch2_snapshot_left_child(c, id); - if (n) - return n; - - while ((parent = bch2_snapshot_parent(c, id))) { - n = bch2_snapshot_right_child(c, parent); - if (n && n != id) - return n; - id = parent; - } - - return 0; -} - -static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -{ - u32 id = snapshot_root; - u32 subvol = 0, s; - - while (id) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; - - id = bch2_snapshot_tree_next(c, id); - } - - return subvol; -} - -static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - u32 snapshot_root, u32 *subvol_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_subvolume s; - bool found = false; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - s = bkey_s_c_to_subvolume(k); - if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) - continue; - if (!BCH_SUBVOLUME_SNAP(s.v)) { - *subvol_id = s.k->p.offset; - found = true; - break; - } - } - - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { - struct bkey_i_subvolume *s; - - *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); - - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - SET_BCH_SUBVOLUME_SNAP(&s->v, false); - } - - return ret; -} - -static int check_snapshot_tree(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot_tree st; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; - u32 root_id; - int ret; - - if (k.k->type != KEY_TYPE_snapshot_tree) - return 0; - - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); - - ret = snapshot_lookup(trans, root_id, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret || - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - c, - "snapshot tree points to missing/incorrect snapshot:\n %s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } - - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, - "snapshot tree points to missing subvolume:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor_early(c, - le32_to_cpu(subvol.snapshot), - root_id), c, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - 
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, - "snapshot tree points to snapshot subvolume:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - struct bkey_i_snapshot_tree *u; - u32 subvol_id; - - ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); - if (ret) - goto err; - - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.master_subvol = cpu_to_le32(subvol_id); - st = snapshot_tree_i_to_s_c(u); - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * For each snapshot_tree, make sure it points to the root of a snapshot tree - * and that snapshot entry points back to it, or delete it. - * - * And, make sure it points to a subvolume within that snapshot tree, or correct - * it to point to the oldest subvolume within that snapshot tree. - */ -int bch2_check_snapshot_trees(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, - BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot_tree(&trans, &iter, k))); - - if (ret) - bch_err(c, "error %i checking snapshot trees", ret); - return ret; -} - -/* - * Look up snapshot tree for @tree_id and find root, - * make sure @snap_id is a descendent: - */ -static int snapshot_tree_ptr_good(struct btree_trans *trans, - u32 snap_id, u32 tree_id) -{ - struct bch_snapshot_tree s_t; - int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - - if (bch2_err_matches(ret, ENOENT)) - return 0; - if (ret) - return ret; - - return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -} - -static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - - if (!id) - return 0; - - rcu_read_lock(); - s = snapshot_t(c, id); - if (s->parent) - id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); - rcu_read_unlock(); - - return id; -} - -static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) -{ - struct bch_snapshot a; - unsigned i; - int ret; - - for (i = 0; i < 3; i++) { - if (!s.parent != !s.skip[i]) - return false; - - if (!s.parent) - continue; - - ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); - if (bch2_err_matches(ret, ENOENT)) - return false; - if (ret) - return ret; - - if (a.tree != s.tree) - return false; - } - - return true; -} - -/* - * snapshot_tree pointer was incorrect: look up root snapshot node, make sure - * its snapshot_tree pointer is correct (allocate new one if necessary), then - * update this node's pointer to root node's pointer: - */ -static int snapshot_tree_ptr_repair(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_snapshot *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter root_iter; - struct bch_snapshot_tree s_t; - struct bkey_s_c_snapshot root; - struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; - int ret; - - root = bch2_bkey_get_iter_typed(trans, &root_iter, - BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_WITH_UPDATES, snapshot); - ret = bkey_err(root); - if (ret) - goto err; - - tree_id = le32_to_cpu(root.v->tree); - - ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { - u = 
bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u) ?: - snapshot_tree_create(trans, root_id, - bch2_snapshot_tree_oldest_subvol(c, root_id), - &tree_id); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - if (k.k->p.offset == root_id) - *s = u->v; - } - - if (k.k->p.offset != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - *s = u->v; - } -err: - bch2_trans_iter_exit(trans, &root_iter); - return ret; -} - -static int cmp_le32(__le32 l, __le32 r) -{ - return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); -} - -static int check_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct bch_snapshot v; - struct bkey_i_snapshot *u; - u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); - u32 real_depth; - struct printbuf buf = PRINTBUF; - bool should_have_subvol; - u32 i, id; - int ret = 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, bkey_val_bytes(k.k)); - - id = le32_to_cpu(s.parent); - if (id) { - ret = snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (le32_to_cpu(v.children[0]) != k.k->p.offset && - le32_to_cpu(v.children[1]) != k.k->p.offset) { - bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - for (i = 0; i < 2 && s.children[i]; i++) { - id = le32_to_cpu(s.children[i]); - - ret = snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot node %llu has nonexistent child %u", - k.k->p.offset, id); - if (ret) - goto err; - - if (le32_to_cpu(v.parent) != k.k->p.offset) { - bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { - bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - k.k->p.offset); - ret = -EINVAL; - goto err; - } - } else { - if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.subvol = 0; - s = u->v; - } - } - - ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, k, &s); - if (ret) - goto err; - } - ret = 0; - - real_depth = bch2_snapshot_depth(c, parent_id); - - if (le32_to_cpu(s.depth) != real_depth && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - 
fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.depth = cpu_to_le32(real_depth); - s = u->v; - } - - ret = snapshot_skiplist_good(trans, s); - if (ret < 0) - goto err; - - if (!ret && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, "snapshot with bad skiplist field:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) - u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); - - bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); - s = u->v; - } - ret = 0; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_snapshots(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - /* - * We iterate backwards as checking/fixing the depth field requires that - * the parent's depth already be correct: - */ - ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(&trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); - return ret; -} - static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -881,7 +28,7 @@ static int check_subvol(struct btree_trans *trans, subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); - ret = snapshot_lookup(trans, snapid, &snapshot); + ret = bch2_snapshot_lookup(trans, snapid, &snapshot); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "subvolume %llu points to nonexistent snapshot %u", @@ -894,8 +41,7 @@ static int check_subvol(struct btree_trans *trans, ret = bch2_subvolume_delete(trans, iter->pos.offset); if (ret) - bch_err(c, "error deleting subvolume %llu: %s", - iter->pos.offset, bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); return ret ?: -BCH_ERR_transaction_restart_nested; } @@ -916,7 +62,8 @@ static int check_subvol(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, + if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, + c, subvol_not_master_and_not_snapshot, "subvolume %llu is not set as snapshot but is not master subvolume", k.k->p.offset)) { struct bkey_i_subvolume *s = @@ -940,485 +87,30 @@ int bch2_check_subvols(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); - return ret; -} - -void bch2_fs_snapshots_exit(struct bch_fs *c) -{ - kfree(rcu_dereference_protected(c->snapshots, true)); -} - -int bch2_snapshots_read(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k))); + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + 
check_subvol(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; } -/* - * Mark a snapshot as deleted, for future cleanup: - */ -static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -{ - struct btree_iter iter; - struct bkey_i_snapshot *s; - int ret = 0; - - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); - return ret; - } - - /* already deleted? */ - if (BCH_SNAPSHOT_DELETED(&s->v)) - goto err; - - SET_BCH_SNAPSHOT_DELETED(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct btree_iter tree_iter = (struct btree_iter) { NULL }; - struct bkey_s_c_snapshot s; - u32 parent_id; - unsigned i; - int ret = 0; - - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); - ret = bkey_err(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - - BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); - parent_id = le32_to_cpu(s.v->parent); - - if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(parent); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); - goto err; - } - - for (i = 0; i < 2; i++) - if (le32_to_cpu(parent->v.children[i]) == id) - break; - - if (i == 2) - bch_err(c, "snapshot %u missing child pointer to %u", - parent_id, id); - else - parent->v.children[i] = 0; - - if (le32_to_cpu(parent->v.children[0]) < - le32_to_cpu(parent->v.children[1])) - swap(parent->v.children[0], - parent->v.children[1]); - } else { - /* - * We're deleting the root of a snapshot tree: update the - * snapshot_tree entry to point to the new root, or delete it if - * this is the last snapshot ID in this tree: - */ - struct bkey_i_snapshot_tree *s_t; - - BUG_ON(s.v->children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; - u32 depth = bch2_snapshot_depth(c, parent); - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || 
!k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; - goto err; - } - - n = bch2_bkey_alloc(trans, &iter, 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.flags = 0; - n->v.parent = cpu_to_le32(parent); - n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.tree = cpu_to_le32(tree); - n->v.depth = cpu_to_le32(depth); - - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) - n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); - - bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); - SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - - ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); - if (ret) - goto err; - - new_snapids[i] = iter.pos.offset; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create new snapshot IDs as children of an existing snapshot ID: - */ -static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); - if (unlikely(ret)) { - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", parent); - return ret; - } - - if (n_parent->v.children[0] || n_parent->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } - - ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - goto err; - - n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); - n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); - n_parent->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create a snapshot node that is the root of a new tree: - */ -static int bch2_snapshot_node_create_tree(struct btree_trans *trans, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bkey_i_snapshot_tree *n_tree; - int ret; - - n_tree = __snapshot_tree_create(trans); - ret = PTR_ERR_OR_ZERO(n_tree) ?: - create_snapids(trans, 0, n_tree->k.p.offset, - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - return ret; - - n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); - n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); - return 0; -} - -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - BUG_ON((parent == 0) != (nr_snapids == 1)); - BUG_ON((parent != 0) != (nr_snapids == 2)); - - return parent - ? 
bch2_snapshot_node_create_children(trans, parent, - new_snapids, snapshot_subvols, nr_snapids) - : bch2_snapshot_node_create_tree(trans, - new_snapids, snapshot_subvols, nr_snapids); - -} - -static int snapshot_delete_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; - *last_pos = k.k->p; - - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} - -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - return 0; - - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); - - ret = snapshot_live(trans, children[0]) ?: - snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - - if (!ret) - return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); - return 0; -} +/* Subvolumes: */ -int bch2_delete_dead_snapshots(struct bch_fs *c) +int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_snapshot snap; - snapshot_id_list deleted = { 0 }; - u32 i, id; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { - ret = bch2_fs_read_write_early(c); - if (ret) { - bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); - return ret; - } - } - - bch2_trans_init(&trans, c, 0, 0); - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(&trans, &iter, k)); - if (ret) { - bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); - goto err; - } - - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(&trans, k)); - if (ret) { - bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); - goto err; - } - - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) { - bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); - goto err; - } - - for (id = 0; id < BTREE_ID_NR; id++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; - - if (!btree_type_has_snapshots(id)) - continue; - - ret = for_each_btree_key_commit(&trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); - - darray_exit(&equiv_seen); - - if (ret) { - bch_err(c, "error 
deleting snapshot keys: %s", bch2_err_str(ret)); - goto err; - } - } - - for (i = 0; i < deleted.nr; i++) { - ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, deleted.data[i])); - if (ret) { - bch_err(c, "error deleting snapshot %u: %s", - deleted.data[i], bch2_err_str(ret)); - goto err; - } - } - - clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -err: - darray_exit(&deleted); - bch2_trans_exit(&trans); - if (ret) - bch_err_fn(c, ret); + bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || + bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, + subvol_pos_bad, + "invalid pos"); +fsck_err: return ret; } -static void bch2_delete_dead_snapshots_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); - - if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) - bch2_delete_dead_snapshots(c); - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_delete_dead_snapshots_async(struct bch_fs *c) -{ - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -} - -static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - struct bch_fs *c = trans->c; - - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) - return 0; - - bch2_delete_dead_snapshots_async(c); - return 0; -} - -/* Subvolumes: */ - -int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) -{ - if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX)) { - prt_printf(err, "invalid pos"); - return -BCH_ERR_invalid_bkey; - } - - return 0; -} - void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -1459,26 +151,27 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, { struct bch_snapshot snap; - return snapshot_lookup(trans, snapshot, &snap) ?: + return bch2_snapshot_lookup(trans, snapshot, &snap) ?: bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); } -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, u32 *snapid) { struct btree_iter iter; - struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; int ret; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -BCH_ERR_ENOENT_subvolume;
+	subvol = bch2_bkey_get_iter_typed(trans, &iter,
+					  BTREE_ID_subvolumes, POS(0, subvolid),
+					  BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+					  subvolume);
+	ret = bkey_err(subvol);
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+				"missing subvolume %u", subvolid);
 	if (likely(!ret))
-		*snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
-	else if (bch2_err_matches(ret, ENOENT))
-		bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+		*snapid = le32_to_cpu(subvol.v->snapshot);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -1508,7 +201,12 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
 }
 
 /*
- * Scan for subvolumes with parent @subvolid_to_delete, reparent:
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
 */
 static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
 {
@@ -1521,7 +219,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
 					BTREE_ITER_CACHED, &s)) ?:
 		for_each_btree_key_commit(trans, iter,
 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
-				NULL, NULL, BTREE_INSERT_NOFAIL,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 			bch2_subvolume_reparent(trans, &iter, k,
 					subvolid_to_delete, le32_to_cpu(s.parent)));
 }
@@ -1534,7 +232,6 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
 	struct btree_iter iter;
 	struct bkey_s_c_subvolume subvol;
-	struct btree_trans_commit_hook *h;
 	u32 snapid;
 	int ret = 0;
 
@@ -1550,22 +247,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 
 	snapid = le32_to_cpu(subvol.v->snapshot);
 
-	ret = bch2_btree_delete_at(trans, &iter, 0);
-	if (ret)
-		goto err;
-
-	ret = bch2_snapshot_node_set_deleted(trans, snapid);
-	if (ret)
-		goto err;
-
-	h = bch2_trans_kmalloc(trans, sizeof(*h));
-	ret = PTR_ERR_OR_ZERO(h);
-	if (ret)
-		goto err;
-
-	h->fn = bch2_delete_dead_snapshots_hook;
-	bch2_trans_commit_hook(trans, h);
-err:
+	ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+		bch2_snapshot_node_set_deleted(trans, snapid);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -1573,7 +256,7 @@ err:
 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
 	return bch2_subvolumes_reparent(trans, subvolid) ?:
-		commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+		commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 			  __bch2_subvolume_delete(trans, subvolid));
 }
 
@@ -1597,9 +280,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
 	bch2_evict_subvolume_inodes(c, &s);
 
 	for (id = s.data; id < s.data + s.nr; id++) {
-		ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
+		ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
 		if (ret) {
-			bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
+			bch_err_msg(c, ret, "deleting subvolume %u", *id);
 			break;
 		}
 	}
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index 6905e91..a1003d3 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -7,227 +7,10 @@
 
 enum bkey_invalid_flags;
 
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c,
-			       enum bkey_invalid_flags, struct printbuf *);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ - .val_to_text = bch2_snapshot_tree_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); - -void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); - -#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ - .val_to_text = bch2_snapshot_to_text, \ - .atomic_trigger = bch2_mark_snapshot, \ - .min_val_size = 24, \ -}) - -static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -{ - return &t->s[U32_MAX - id]; -} - -static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -{ - return __snapshot_t(rcu_dereference(c->snapshots), id); -} - -static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = snapshot_t(c, id)->tree; - rcu_read_unlock(); - - return id; -} - -static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - return snapshot_t(c, id)->parent; -} - -static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_parent_early(c, id); - rcu_read_unlock(); - - return id; -} - -static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; - - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) - panic("id %u depth=%u parent %u depth=%u\n", - id, snapshot_t(c, id)->depth, - parent, snapshot_t(c, parent)->depth); - - return parent; -#else - return snapshot_t(c, id)->parent; -#endif -} - -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; -} - -static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -{ - rcu_read_lock(); - while (n--) - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; -} - -static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -{ - u32 parent; - - rcu_read_lock(); - while ((parent = __bch2_snapshot_parent(c, id))) - id = parent; - rcu_read_unlock(); - - return id; -} - -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - return snapshot_t(c, id)->equiv; -} - -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); - rcu_read_unlock(); - - return id; -} - -static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -{ - return id == bch2_snapshot_equiv(c, id); -} - -static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - bool ret; - - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; - rcu_read_unlock(); - - return ret; -} - -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == 
s->children[1]) - return s->children[0]; - return 0; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); - -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - return id == ancestor - ? true - : __bch2_snapshot_is_ancestor(c, id, ancestor); -} - -static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *t; - bool ret; - - rcu_read_lock(); - t = snapshot_t(c, id); - ret = (t->children[0]|t->children[1]) != 0; - rcu_read_unlock(); - - return ret; -} - -static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -{ - u32 *i; - - darray_for_each(*s, i) - if (*i == id) - return true; - return false; -} - -static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - u32 *i; - - darray_for_each(*s, i) - if (bch2_snapshot_is_ancestor(c, id, *i)) - return true; - return false; -} - -static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - int ret; - - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -int bch2_check_snapshot_trees(struct bch_fs *); -int bch2_check_snapshots(struct bch_fs *); int bch2_check_subvols(struct bch_fs *); -void bch2_fs_snapshots_exit(struct bch_fs *); -int bch2_snapshots_read(struct bch_fs *); - -int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); +int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ @@ -238,14 +21,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); -int bch2_snapshot_get_subvol(struct btree_trans *, u32, - struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -/* only exported for tests: */ -int bch2_snapshot_node_create(struct btree_trans *, u32, - u32 *, u32 *, unsigned); - int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index 8683344..2d2e66a 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -20,7 +20,7 @@ struct snapshot_t { }; struct snapshot_table { - struct snapshot_t s[0]; + DECLARE_FLEX_ARRAY(struct snapshot_t, s); }; typedef struct { diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index c9a5a7c..f4cad90 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1,21 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update_interior.h" -#include "buckets.h" #include "checksum.h" #include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" #include "journal.h" -#include "journal_io.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" #include "recovery.h" #include "replicas.h" #include "quota.h" +#include "sb-clean.h" +#include "sb-errors.h" +#include "sb-members.h" #include "super-io.h" #include "super.h" #include "trace.h" @@ -24,6 +23,9 @@ #include #include +static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { +}; + struct bch2_metadata_version { u16 version; const char *name; @@ -95,7 +97,7 
@@ const char * const bch2_sb_fields[] = { static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, struct printbuf *); -struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, +struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) { struct bch_sb_field *f; @@ -150,7 +152,7 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, void bch2_sb_field_delete(struct bch_sb_handle *sb, enum bch_sb_field_type type) { - struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); if (f) __bch2_sb_field_resize(sb, f, 0); @@ -162,7 +164,8 @@ void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->mode); + blkdev_put(sb->bdev, sb->holder); + kfree(sb->holder); kfree(sb->sb); memset(sb, 0, sizeof(*sb)); @@ -183,7 +186,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (sb->sb && sb->buffer_size >= new_buffer_size) return 0; - if (sb->have_layout) { + if (sb->sb && sb->have_layout) { u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { @@ -199,8 +202,14 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (dynamic_fault("bcachefs:add:super_realloc")) return -BCH_ERR_ENOMEM_sb_realloc_injected; + new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); + if (!new_sb) + return -BCH_ERR_ENOMEM_sb_buf_realloc; + + sb->sb = new_sb; + if (sb->have_bio) { - unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); + unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) @@ -212,21 +221,16 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) sb->bio = bio; } - new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); - if (!new_sb) - return -BCH_ERR_ENOMEM_sb_buf_realloc; - - sb->sb = new_sb; sb->buffer_size = new_buffer_size; return 0; } -struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, +struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, enum bch_sb_field_type type, unsigned u64s) { - struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ssize_t d = -old_u64s + u64s; @@ -243,16 +247,16 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, /* XXX: we're not checking that offline device have enough space */ for_each_online_member(ca, c, i) { - struct bch_sb_handle *sb = &ca->disk_sb; + struct bch_sb_handle *dev_sb = &ca->disk_sb; - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } } } - f = bch2_sb_field_get(sb->sb, type); + f = bch2_sb_field_get_id(sb->sb, type); f = __bch2_sb_field_resize(sb, f, u64s); if (f) f->type = cpu_to_le32(type); @@ -352,7 +356,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; - struct bch_sb_field_members *mi; + struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; u16 block_size; int ret; @@ -381,7 +385,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, } if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { - prt_printf(out, "Bad intenal UUID (got zeroes)"); + prt_printf(out, "Bad internal UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; } @@ -455,7 +459,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, } /* members must be validated first: */ - mi = bch2_sb_get_members(sb); + mi = bch2_sb_field_get(sb, members_v1); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); return -BCH_ERR_invalid_sb_members_missing; @@ -466,7 +470,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return ret; vstruct_for_each(sb, f) { - if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) continue; ret = bch2_sb_field_validate(sb, f, out); @@ -482,7 +486,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb.sb; - struct bch_sb_field_members *mi = bch2_sb_get_members(src); struct bch_dev *ca; unsigned i; @@ -508,8 +511,10 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); - for_each_member_device(ca, c, i) - ca->mi = bch2_mi_to_cpu(mi->members + i); + for_each_member_device(ca, c, i) { + struct bch_member m = bch2_sb_member_get(src, i); + ca->mi = bch2_mi_to_cpu(&m); + } } static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) @@ -542,18 +547,20 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; - src_f = bch2_sb_field_get(src, i); - dst_f = bch2_sb_field_get(dst, i); + src_f = bch2_sb_field_get_id(src, i); + dst_f = bch2_sb_field_get_id(dst, i); d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - (dst_f ? 
le32_to_cpu(dst_f->u64s) : 0); if (d > 0) { - int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); + int ret = bch2_sb_realloc(dst_handle, + le32_to_cpu(dst_handle->sb->u64s) + d); + if (ret) return ret; dst = dst_handle->sb; - dst_f = bch2_sb_field_get(dst, i); + dst_f = bch2_sb_field_get_id(dst, i); } dst_f = __bch2_sb_field_resize(dst_handle, dst_f, @@ -662,27 +669,30 @@ int bch2_read_super(const char *path, struct bch_opts *opts, retry: #endif memset(sb, 0, sizeof(*sb)); - sb->mode = FMODE_READ; + sb->mode = BLK_OPEN_READ; sb->have_bio = true; + sb->holder = kmalloc(1, GFP_KERNEL); + if (!sb->holder) + return -ENOMEM; #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) - sb->mode |= FMODE_BUFFERED; + sb->mode |= BLK_OPEN_BUFFERED; #endif if (!opt_get(*opts, noexcl)) - sb->mode |= FMODE_EXCL; + sb->mode |= BLK_OPEN_EXCL; if (!opt_get(*opts, nochanges)) - sb->mode |= FMODE_WRITE; + sb->mode |= BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); if (IS_ERR(sb->bdev) && PTR_ERR(sb->bdev) == -EACCES && opt_get(*opts, read_only)) { - sb->mode &= ~FMODE_WRITE; + sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); if (!IS_ERR(sb->bdev)) opt_set(*opts, nochanges, true); } @@ -711,7 +721,7 @@ retry: if (opt_defined(*opts, sb)) goto err; - printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); printbuf_reset(&err); @@ -773,7 +783,7 @@ got_super: ret = bch2_sb_validate(sb, &err, READ); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); goto err_no_print; } @@ -781,7 +791,7 @@ out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", path, err.buf); err_no_print: bch2_free_super(sb); @@ -796,7 +806,12 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, + bio_data_dir(bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "superblock %s error: %s", + bio_data_dir(bio) ? 
"write" : "read", bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; @@ -883,6 +898,9 @@ int bch2_write_super(struct bch_fs *c) SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); bch2_sb_counters_from_cpu(c); + bch2_sb_members_from_cpu(c); + bch2_sb_members_cpy_v2_v1(&c->disk_sb); + bch2_sb_errors_from_cpu(c); for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -1005,235 +1023,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_members: */ - -static int bch2_sb_members_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - unsigned i; - - if ((void *) (mi->members + sb->nr_devices) > - vstruct_end(&mi->field)) { - prt_printf(err, "too many devices for section size"); - return -BCH_ERR_invalid_sb_members; - } - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - - if (!bch2_member_exists(m)) - continue; - - if (le64_to_cpu(m->nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m->nbuckets), LONG_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { - prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", - i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } - } - - return 0; -} - -static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); - unsigned i; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m->bucket_size); - u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; - - if (!bch2_member_exists(m)) - continue; - - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); - - printbuf_indent_add(out, 2); - - prt_printf(out, "UUID:"); - prt_tab(out); - pr_uuid(out, m->uuid.b); - prt_newline(out); - - prt_printf(out, "Size:"); - prt_tab(out); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - prt_printf(out, "Bucket size:"); - prt_tab(out); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); - prt_newline(out); - - prt_printf(out, "Last mount:"); - prt_tab(out); - if (m->last_mount) - pr_time(out, le64_to_cpu(m->last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", - BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR - ? 
bch2_member_states[BCH_MEMBER_STATE(m)] - : "unknown"); - prt_newline(out); - - prt_printf(out, "Label:"); - prt_tab(out); - if (BCH_MEMBER_GROUP(m)) { - unsigned idx = BCH_MEMBER_GROUP(m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { - prt_printf(out, "(none)"); - } - prt_newline(out); - - prt_printf(out, "Data allowed:"); - prt_tab(out); - if (BCH_MEMBER_DATA_ALLOWED(m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Has data:"); - prt_tab(out); - if (data_have) - prt_bitflags(out, bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); - prt_newline(out); - - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } -} - -static const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_members_validate, - .to_text = bch2_sb_members_to_text, -}; - -/* BCH_SB_FIELD_crypt: */ - -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - if (BCH_CRYPT_KDF_TYPE(crypt)) { - prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - return 0; -} - -static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - prt_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_crypt_validate, - .to_text = bch2_sb_crypt_to_text, -}; - -/* BCH_SB_FIELD_clean: */ - -int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) -{ - struct jset_entry *entry; - int ret; - - for (entry = clean->start; - entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); - if (ret) - return ret; - } - - return 0; -} - /* Downgrade if superblock is at a higher version than currently supported: */ void bch2_sb_maybe_downgrade(struct bch_fs *c) { @@ -1260,232 +1049,6 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); } -int bch2_fs_mark_dirty(struct bch_fs *c) -{ - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_sb_maybe_downgrade(c); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - ret = 
bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - -void bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry **end, - u64 journal_seq) -{ - struct bch_dev *ca; - unsigned i, dev; - - percpu_down_read(&c->mark_lock); - - if (!journal_seq) { - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - } else { - bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_key_version; - u->v = cpu_to_le64(atomic64_read(&c->key_version)); - } - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_reserved; - u->entry.level = i; - u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - struct jset_entry_data_usage *u = - container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), - struct jset_entry_data_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(c->usage_base->replicas[i]); - unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), - "embedded variable length struct"); - } - - for_each_member_device(ca, c, dev) { - unsigned b = sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; - struct jset_entry_dev_usage *u = - container_of(jset_entry_init(end, b), - struct jset_entry_dev_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(dev); - u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); - - for (i = 0; i < BCH_DATA_NR; i++) { - u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); - u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); - u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); - } - } - - percpu_up_read(&c->mark_lock); - - for (i = 0; i < 2; i++) { - struct jset_entry_clock *clock = - container_of(jset_entry_init(end, sizeof(*clock)), - struct jset_entry_clock, entry); - - clock->entry.type = BCH_JSET_ENTRY_clock; - clock->rw = i; - clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); - } -} - -void bch2_fs_mark_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *sb_clean; - struct jset_entry *entry; - unsigned u64s; - int ret; - - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); - c->disk_sb.sb->features[0] 
&= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); - - u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; - - sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); - if (!sb_clean) { - bch_err(c, "error resizing superblock while setting filesystem clean"); - goto out; - } - - sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); - - /* Trying to catch outstanding bug: */ - BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); - - entry = sb_clean->start; - bch2_journal_super_entries_add_common(c, &entry, 0); - entry = bch2_btree_roots_to_journal_entries(c, entry, entry); - BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); - - memset(entry, 0, - vstruct_end(&sb_clean->field) - (void *) entry); - - /* - * this should be in the write path, and we should be validating every - * superblock section: - */ - ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); - if (ret) { - bch_err(c, "error writing marking filesystem clean: validate error"); - goto out; - } - - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); -} - -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&clean->field), sizeof(*clean)); - return -BCH_ERR_invalid_sb_clean; - } - - return 0; -} - -static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - struct jset_entry *entry; - - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - - bch2_journal_entry_to_text(out, NULL, entry); - prt_newline(out); - } -} - -static const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_clean_validate, - .to_text = bch2_sb_clean_to_text, -}; - static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, @@ -1572,7 +1135,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - struct bch_sb_field_members *mi; struct bch_sb_field *f; u64 fields_have = 0; unsigned nr_devices = 0; @@ -1580,15 +1142,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (!out->nr_tabstops) printbuf_tabstop_push(out, 44); - mi = bch2_sb_get_members(sb); - if (mi) { - struct bch_member *m; - - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) - nr_devices += bch2_member_exists(m); - } + for (int i = 0; i < sb->nr_devices; i++) + nr_devices += bch2_dev_exists(sb, i); prt_printf(out, "External UUID:"); prt_tab(out); @@ -1628,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Created:"); prt_tab(out); if (sb->time_base_lo) - pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else 
prt_printf(out, "(not set)"); prt_newline(out); diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 904adea..f5abd10 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -6,6 +6,7 @@ #include "eytzinger.h" #include "super_types.h" #include "super.h" +#include "sb-members.h" #include @@ -22,31 +23,24 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c, unsigned, unsigned); -struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); +static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) +{ + return le32_to_cpu(f->u64s) * sizeof(u64); +} #define field_to_type(_f, _name) \ container_of_or_null(_f, struct bch_sb_field_##_name, field) -#define x(_name, _nr) \ -static inline struct bch_sb_field_##_name * \ -bch2_sb_get_##_name(struct bch_sb *sb) \ -{ \ - return field_to_type(bch2_sb_field_get(sb, \ - BCH_SB_FIELD_##_name), _name); \ -} \ - \ -static inline struct bch_sb_field_##_name * \ -bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ -{ \ - return field_to_type(bch2_sb_field_resize(sb, \ - BCH_SB_FIELD_##_name, u64s), _name); \ -} +struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type); +#define bch2_sb_field_get(_sb, _name) \ + field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name) + +struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, + enum bch_sb_field_type, unsigned); +#define bch2_sb_field_resize(_sb, _name, _u64s) \ + field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) -BCH_SB_FIELDS() -#undef x +void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; @@ -58,6 +52,7 @@ struct bch_sb_field_ops { static inline __le64 bch2_sb_magic(struct bch_fs *c) { __le64 ret; + memcpy(&ret, &c->sb.uuid, sizeof(ret)); return ret; } @@ -88,52 +83,9 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_members: */ - -static inline bool bch2_member_exists(struct bch_member *m) -{ - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); -} - -static inline bool bch2_dev_exists(struct bch_sb *sb, - struct bch_sb_field_members *mi, - unsigned dev) -{ - return dev < sb->nr_devices && - bch2_member_exists(&mi->members[dev]); -} - -static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -{ - return (struct bch_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .group = BCH_MEMBER_GROUP(mi), - .state = BCH_MEMBER_STATE(mi), - .discard = BCH_MEMBER_DISCARD(mi), - .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), - .durability = BCH_MEMBER_DURABILITY(mi) - ? 
BCH_MEMBER_DURABILITY(mi) - 1 - : 1, - .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = bch2_member_exists(mi), - }; -} - -/* BCH_SB_FIELD_clean: */ - -void bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry **, u64); - -int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); - void bch2_sb_maybe_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); -int bch2_fs_mark_dirty(struct bch_fs *); -void bch2_fs_mark_clean(struct bch_fs *); - void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index eee5696..bb94510 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -13,6 +13,7 @@ #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -30,9 +31,12 @@ #include "error.h" #include "fs.h" #include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" @@ -44,6 +48,10 @@ #include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "sb-clean.h" +#include "sb-errors.h" +#include "sb-members.h" +#include "snapshot.h" #include "subvolume.h" #include "super.h" #include "super-io.h" @@ -63,6 +71,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); +MODULE_DESCRIPTION("bcachefs filesystem"); #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ @@ -392,6 +401,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); + ret = bch2_sb_members_v2_init(c); + if (ret) + goto err; + ret = bch2_fs_mark_dirty(c); if (ret) goto err; @@ -416,6 +429,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return ret; } + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -425,7 +442,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + for (i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } @@ -460,19 +477,23 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { unsigned i; - int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); bch2_free_pending_node_rewrites(c); + bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); + bch2_fs_fs_io_direct_exit(c); + bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); - bch2_fs_io_exit(c); + bch2_fs_nocow_locking_exit(c); + bch2_fs_io_write_exit(c); + bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); @@ -489,12 +510,7 @@ static void __bch2_fs_free(struct bch_fs *c) percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); - if (c->btree_paths_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); - 
darray_exit(&c->btree_roots_extra); - free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -568,13 +584,6 @@ void __bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); - - for (i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); - - if (ca) - bch2_free_super(&ca->disk_sb); - } } void bch2_fs_free(struct bch_fs *c) @@ -588,9 +597,14 @@ void bch2_fs_free(struct bch_fs *c) closure_sync(&c->cl); closure_debug_destroy(&c->cl); - for (i = 0; i < c->sb.nr_devices; i++) - if (c->devs[i]) - bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + for (i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + + if (ca) { + bch2_free_super(&ca->disk_sb); + bch2_dev_free(ca); + } + } bch_verbose(c, "shutdown complete"); @@ -627,7 +641,9 @@ static int bch2_fs_online(struct bch_fs *c) ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: +#endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { @@ -655,7 +671,6 @@ err: static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { - struct bch_sb_field_members *mi; struct bch_fs *c; struct printbuf name = PRINTBUF; unsigned i, iter_size; @@ -702,6 +717,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); bch2_fs_move_init(c); + bch2_fs_sb_errors_init_early(c); INIT_LIST_HEAD(&c->list); @@ -709,6 +725,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); + init_rwsem(&c->snapshot_create_lock); spin_lock_init(&c->btree_write_error_lock); @@ -717,8 +734,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->journal_iters); - INIT_LIST_HEAD(&c->fsck_errors); - mutex_init(&c->fsck_error_lock); + INIT_LIST_HEAD(&c->fsck_error_msgs); + mutex_init(&c->fsck_error_msgs_lock); seqcount_init(&c->gc_pos_lock); @@ -735,7 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); @@ -780,6 +796,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); @@ -817,7 +834,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || 
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || @@ -828,6 +844,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) } ret = bch2_fs_counters_init(c) ?: + bch2_fs_sb_errors_init(c) ?: bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: @@ -839,18 +856,20 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_io_write_init(c) ?: bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c); + bch2_fs_fsio_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_fs_io_direct_init(c); if (ret) goto err; - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb.sb, mi, i) && + if (bch2_dev_exists(c->disk_sb.sb, i) && bch2_dev_alloc(c, i)) { ret = -EEXIST; goto err; @@ -915,7 +934,6 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - struct bch_sb_field_members *mi; struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); unsigned i; @@ -929,12 +947,14 @@ int bch2_fs_start(struct bch_fs *c) mutex_lock(&c->sb_lock); - for_each_online_member(ca, c, i) - bch2_sb_from_fs(c, ca); + ret = bch2_sb_members_v2_init(c); + if (ret) { + mutex_unlock(&c->sb_lock); + goto err; + } - mi = bch2_sb_get_members(c->disk_sb.sb); for_each_online_member(ca, c, i) - mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); + bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); mutex_unlock(&c->sb_lock); @@ -942,12 +962,6 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { - mutex_lock(&c->btree_transaction_stats[i].lock); - bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); - mutex_unlock(&c->btree_transaction_stats[i].lock); - } - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); @@ -981,22 +995,18 @@ out: up_write(&c->state_lock); return ret; err: - bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "starting filesystem"); goto out; } static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { - struct bch_sb_field_members *sb_mi; - - sb_mi = bch2_sb_get_members(sb); - if (!sb_mi) - return -BCH_ERR_member_info_missing; + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) return -BCH_ERR_mismatched_block_size; - if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < + if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) return -BCH_ERR_bucket_size_too_small; @@ -1007,12 +1017,11 @@ static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) { struct bch_sb *newest = le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; - struct bch_sb_field_members *mi = bch2_sb_get_members(newest); if (!uuid_equal(&fs->uuid, &sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(newest, mi, sb->dev_idx)) + if (!bch2_dev_exists(newest, sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->block_size != sb->block_size) @@ -1127,6 +1136,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, struct bch_member *member) { struct bch_dev *ca; + unsigned i; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) @@ -1144,6 +1154,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_time_stats_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); + + for (i = 0; i < ARRAY_SIZE(member->errors); i++) + atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); + ca->uuid = member->uuid; ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, @@ -1182,15 +1196,14 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { - struct bch_member *member = - bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; + struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; - ca = __bch2_dev_alloc(c, member); + ca = __bch2_dev_alloc(c, &member); if (!ca) goto err; @@ -1228,8 +1241,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) /* Commit: */ ca->disk_sb = *sb; - if (sb->mode & FMODE_EXCL) - ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); ca->dev = ca->disk_sb.bdev->bd_dev; @@ -1327,7 +1338,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, static bool bch2_fs_may_start(struct bch_fs *c) { - struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned i, flags = 0; @@ -1340,10 +1350,9 @@ static bool bch2_fs_may_start(struct bch_fs *c) if (!c->opts.degraded && !c->opts.very_degraded) { mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) + if (!bch2_dev_exists(c->disk_sb.sb, i)) continue; ca = bch_dev_locked(c, i); @@ -1383,7 +1392,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { - struct bch_sb_field_members *mi; + struct bch_member *m; int ret = 0; if (ca->mi.state == new_state) @@ -1398,8 +1407,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch_notice(ca, "%s", bch2_member_states[new_state]); mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_STATE(m, new_state); bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1448,14 +1457,14 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) - bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "removing dev alloc info"); return ret; } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { - struct bch_sb_field_members *mi; + struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; int ret; @@ -1477,31 +1486,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = 
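/*
 * Editor's note on the bch_err_msg() conversions in this hunk and the ones
 * that follow: call sites stop hand-formatting "%s", bch2_err_str(ret) and
 * pass the error code once instead. The real macro is defined elsewhere in
 * the tree and is not part of this patch; a minimal sketch of the shape
 * such a helper takes (my_err_msg/my_err are illustrative names, not the
 * tree's):
 *
 *	#define my_err_msg(c, ret, fmt, ...)				\
 *	do {								\
 *		if (ret)						\
 *			my_err(c, fmt ", error %s", ##__VA_ARGS__,	\
 *			       bch2_err_str(ret));			\
 *	} while (0)
 *
 * The if (ret) guard is why converted call sites (see tests.c below) can
 * log unconditionally before testing ret: success prints nothing.
 */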
bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { - bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "dropping data"); goto err; } ret = bch2_dev_remove_alloc(c, ca); if (ret) { - bch_err(ca, "Remove failed, error deleting alloc info"); + bch_err_msg(ca, ret, "deleting alloc info"); goto err; } ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { - bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "flushing journal"); goto err; } ret = bch2_journal_flush(&c->journal); if (ret) { - bch_err(ca, "Remove failed, journal error"); + bch_err(ca, "journal error"); goto err; } ret = bch2_replicas_gc2(c); if (ret) { - bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "in replicas_gc2()"); goto err; } @@ -1543,8 +1552,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * this device must be gone: */ mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); + m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); @@ -1567,7 +1576,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; - struct bch_sb_field_members *mi; + struct bch_sb_field_members_v2 *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; @@ -1576,14 +1585,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_read_super(path, &opts, &sb); if (ret) { - bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "reading super"); goto err; } - dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { - bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); if (label.allocation_failure) { ret = -ENOMEM; goto err; @@ -1592,13 +1601,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_dev_may_add(sb.sb, c); if (ret) { - bch_err(c, "device add error: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); goto err; } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { - bch2_free_super(&sb); ret = -ENOMEM; goto err; } @@ -1606,14 +1614,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_dev_usage_init(ca); ret = __bch2_dev_attach_bdev(ca, &sb); - if (ret) { - bch2_dev_free(ca); + if (ret) goto err; - } ret = bch2_dev_journal_alloc(ca); if (ret) { - bch_err(c, "device add error: journal alloc failed"); + bch_err_msg(c, ret, "allocating journal"); goto err; } @@ -1622,48 +1628,40 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_sb_from_fs(c, ca); if (ret) { - bch_err(c, "device add error: new device superblock too small"); - goto err_unlock; - } - - mi = bch2_sb_get_members(ca->disk_sb.sb); - - if (!bch2_sb_resize_members(&ca->disk_sb, - le32_to_cpu(mi->field.u64s) + - sizeof(dev_mi) / sizeof(u64))) { - bch_err(c, "device add error: new device superblock too small"); - ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; - mi = bch2_sb_get_members(c->disk_sb.sb); for (dev_idx = 0; dev_idx < 
BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) goto have_slot; no_slot: - bch_err(c, "device add error: already have maximum number of devices"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - u64s = (sizeof(struct bch_sb_field_members) + - sizeof(struct bch_member) * nr_devices) / sizeof(u64); - mi = bch2_sb_resize_members(&c->disk_sb, u64s); + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) { - bch_err(c, "device add error: no room in superblock for member info"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); /* success: */ - mi->members[dev_idx] = dev_mi; - mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); + *m = dev_mi; + m->last_mount = cpu_to_le64(ktime_get_real_seconds()); c->disk_sb.sb->nr_devices = nr_devices; ca->disk_sb.sb->dev_idx = dev_idx; @@ -1672,7 +1670,7 @@ have_slot: if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); if (ret) { - bch_err(c, "device add error: error setting label"); + bch_err_msg(c, ret, "creating new label"); goto err_unlock; } } @@ -1684,13 +1682,13 @@ have_slot: ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "marking new superblock"); goto err_late; } ret = bch2_fs_freespace_init(c); if (ret) { - bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "initializing free space"); goto err_late; } @@ -1723,7 +1721,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; - struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned dev_idx; int ret; @@ -1740,7 +1737,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); if (ret) { - bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); + bch_err_msg(c, ret, "bringing %s online", path); goto err; } @@ -1752,27 +1749,33 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", - path, bch2_err_str(ret)); + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); goto err; } if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); - mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); + if (!ca->mi.freespace_initialized) { + ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err; + } - mi->members[ca->dev_idx].last_mount = - cpu_to_le64(ktime_get_real_seconds()); + if (!ca->journal.nr) { + ret = bch2_dev_journal_alloc(ca); + bch_err_msg(ca, ret, "allocating journal"); + if (ret) + goto err; + } + mutex_lock(&c->sb_lock); + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); 
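/*
 * Editor's note: the bch2_sb_member_get() / bch2_members_v2_get_mut() calls
 * introduced throughout this patch replace direct indexing of the old
 * fixed-layout members array. members_v2 stores a per-superblock entry
 * stride (member_bytes), so superblocks written with smaller member structs
 * keep working. A condensed sketch of the access pattern, with hypothetical
 * names (the real layout lives in bcachefs_format.h, outside this hunk):
 *
 *	struct member_table {
 *		u16	member_bytes;	// on-disk stride of one entry
 *		u8	entries[];	// nr_devices entries, member_bytes apart
 *	};
 *
 *	// mutable access: point at the idx'th stride in place
 *	static inline struct bch_member *
 *	table_get_mut(struct member_table *t, unsigned idx)
 *	{
 *		return (void *) (t->entries + (size_t) idx * t->member_bytes);
 *	}
 *
 *	// by-value access: copy out what is on disk, zero-fill newer fields
 *	static inline struct bch_member
 *	table_get(const struct member_table *t, unsigned idx)
 *	{
 *		struct bch_member m = {};
 *
 *		memcpy(&m, t->entries + (size_t) idx * t->member_bytes,
 *		       min_t(size_t, t->member_bytes, sizeof(m)));
 *		return m;
 *	}
 *
 * This is also why bch2_dev_may_add() and bch2_dev_alloc() above now work
 * on a struct bch_member copy rather than a pointer into the superblock.
 */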
mutex_unlock(&c->sb_lock); - ret = bch2_fs_freespace_init(c); - if (ret) - bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); - up_write(&c->state_lock); return 0; err: @@ -1805,10 +1808,12 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bch_member *mi; + struct bch_member *m; + u64 old_nbuckets; int ret = 0; down_write(&c->state_lock); + old_nbuckets = ca->mi.nbuckets; if (nbuckets < ca->mi.nbuckets) { bch_err(ca, "Cannot shrink yet"); @@ -1826,7 +1831,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = bch2_dev_buckets_resize(c, ca, nbuckets); if (ret) { - bch_err(ca, "Resize error: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "resizing buckets"); goto err; } @@ -1835,12 +1840,24 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) goto err; mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; - mi->nbuckets = cpu_to_le64(nbuckets); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(nbuckets); bch2_write_super(c); mutex_unlock(&c->sb_lock); + if (ca->mi.freespace_initialized) { + ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + if (ret) + goto err; + + /* + * XXX: this is all wrong transactionally - we'll be able to do + * this correctly after the disk space accounting rewrite + */ + ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets; + } + bch2_recalc_capacity(c); err: up_write(&c->state_lock); @@ -1869,10 +1886,9 @@ found: struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { - struct bch_sb_handle *sb = NULL; + DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; - struct bch_sb_field_members *mi; - unsigned i, best_sb = 0; + struct bch_sb_handle *sb, *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1884,51 +1900,46 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } - sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) { - ret = -ENOMEM; + ret = darray_make_room(&sbs, nr_devices); + if (ret) goto err; - } - for (i = 0; i < nr_devices; i++) { - ret = bch2_read_super(devices[i], &opts, &sb[i]); + for (unsigned i = 0; i < nr_devices; i++) { + struct bch_sb_handle sb = { NULL }; + + ret = bch2_read_super(devices[i], &opts, &sb); if (ret) goto err; + BUG_ON(darray_push(&sbs, sb)); } - for (i = 1; i < nr_devices; i++) - if (le64_to_cpu(sb[i].sb->seq) > - le64_to_cpu(sb[best_sb].sb->seq)) - best_sb = i; - - mi = bch2_sb_get_members(sb[best_sb].sb); + darray_for_each(sbs, sb) + if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + best = sb; - i = 0; - while (i < nr_devices) { - if (i != best_sb && - !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb[i].bdev); - bch2_free_super(&sb[i]); - array_remove_item(sb, nr_devices, i); + darray_for_each_reverse(sbs, sb) { + if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { + pr_info("%pg has been removed, skipping", sb->bdev); + bch2_free_super(sb); + darray_remove_item(&sbs, sb); + best -= best > sb; continue; } - ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; - i++; } - c = bch2_fs_alloc(sb[best_sb].sb, opts); - if (IS_ERR(c)) { - ret = PTR_ERR(c); + c = bch2_fs_alloc(best->sb, 
opts); + ret = PTR_ERR_OR_ZERO(c); + if (ret) goto err; - } down_write(&c->state_lock); - for (i = 0; i < nr_devices; i++) { - ret = bch2_dev_attach_bdev(c, &sb[i]); + darray_for_each(sbs, sb) { + ret = bch2_dev_attach_bdev(c, sb); if (ret) { up_write(&c->state_lock); goto err; @@ -1947,7 +1958,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } out: - kfree(sb); + darray_for_each(sbs, sb) + bch2_free_super(sb); + darray_exit(&sbs); printbuf_exit(&errbuf); module_put(THIS_MODULE); return c; @@ -1957,9 +1970,6 @@ err_print: err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); - if (sb) - for (i = 0; i < nr_devices; i++) - bch2_free_super(&sb[i]); c = ERR_PTR(ret); goto out; } @@ -2000,6 +2010,7 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM +__maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 36bcb9e..bf762df 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -8,220 +8,6 @@ #include -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return div_u64(s, ca->mi.bucket_size); -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) * ca->mi.bucket_size; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) -{ - return div_u64_rem(s, ca->mi.bucket_size, offset); -} - -static inline bool bch2_dev_is_online(struct bch_dev *ca) -{ - return !percpu_ref_is_zero(&ca->io_ref); -} - -static inline bool bch2_dev_is_readable(struct bch_dev *ca) -{ - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -} - -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - -static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -{ - return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -} - -static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, - unsigned dev) -{ - unsigned i; - - for (i = 0; i < devs.nr; i++) - if (devs.devs[i] == dev) - return true; - - return false; -} - -static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, - unsigned dev) -{ - unsigned i; - - for (i = 0; i < devs->nr; i++) - if (devs->devs[i] == dev) { - array_remove_item(devs->devs, devs->nr, i); - return; - } -} - -static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, - unsigned dev) -{ - if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; - } -} - -static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -{ - return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -} - -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, - const struct bch_devs_mask *mask) -{ - struct bch_dev *ca = NULL; - - while ((*iter = mask - ? 
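/*
 * Editor's note on the bch2_fs_open() conversion above: the kcalloc()'d
 * array of superblock handles becomes a typed dynamic array built from the
 * darray helpers this patch already uses elsewhere. A condensed sketch of
 * how they compose here (all names are from the patch; error handling
 * elided):
 *
 *	DARRAY(struct bch_sb_handle) sbs = { 0 };
 *
 *	ret = darray_make_room(&sbs, nr_devices);	// reserve capacity up front
 *	...
 *	BUG_ON(darray_push(&sbs, sb));			// cannot fail: room reserved
 *
 *	darray_for_each(sbs, sb)			// sb iterates element pointers
 *		...;
 *
 *	darray_for_each_reverse(sbs, sb)
 *		darray_remove_item(&sbs, sb);		// shifts the tail down one slot
 *
 *	darray_exit(&sbs);
 *
 * Because removal shifts elements down, a saved element pointer sitting
 * past the removed slot must be pulled back by one -- which is what the
 * "best -= best > sb;" above is doing.
 */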
find_next_bit(mask->d, c->sb.nr_devices, *iter) - : *iter) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], - lockdep_is_held(&c->state_lock)))) - (*iter)++; - - return ca; -} - -#define for_each_member_device_rcu(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) - -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -{ - struct bch_dev *ca; - - rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter, NULL))) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch2_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) - -static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) -{ - struct bch_dev *ca; - - rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter, NULL)) && - (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; - rcu_read_unlock(); - - return ca; -} - -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) - -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) - -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) - -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) - -/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -{ - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); - - return rcu_dereference_check(c->devs[idx], 1); -} - -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -{ - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); - - return rcu_dereference_protected(c->devs[idx], - lockdep_is_held(&c->sb_lock) || - lockdep_is_held(&c->state_lock)); -} - -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - struct bch_dev *ca; - unsigned i; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(ca, c, i) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} - struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 89419fc..7dda498 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -6,8 +6,9 @@ struct bch_sb_handle { struct bch_sb *sb; struct block_device *bdev; struct bio *bio; + void *holder; size_t buffer_size; - fmode_t mode; + blk_mode_t mode; unsigned have_layout:1; unsigned have_bio:1; unsigned fs_sb:1; @@ -36,16 
+37,4 @@ struct bch_member_cpu { u8 valid; }; -struct bch_disk_group_cpu { - bool deleted; - u16 parent; - struct bch_devs_mask devs; -}; - -struct bch_disk_groups_cpu { - struct rcu_head rcu; - unsigned nr; - struct bch_disk_group_cpu entries[]; -}; - #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 740305e..8df45da 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -113,10 +113,6 @@ do { \ prt_human_readable_s64(out, val); \ } while (0) -#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -#define var_print(_var) sysfs_print(_var, var(_var)) -#define var_hprint(_var) sysfs_hprint(_var, var(_var)) - #define sysfs_strtoul(file, var) \ do { \ if (attr == &sysfs_ ## file) \ @@ -139,30 +135,6 @@ do { \ _v; \ }) -#define strtoul_restrict_or_return(cp, min, max) \ -({ \ - unsigned long __v = 0; \ - int _r = strtoul_safe_restrict(cp, __v, min, max); \ - if (_r) \ - return _r; \ - __v; \ -}) - -#define strtoi_h_or_return(cp) \ -({ \ - u64 _v; \ - int _r = strtoi_h(cp, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -#define sysfs_hatoi(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoi_h(buf, &var) ?: (ssize_t) size; \ -} while (0) - write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); @@ -177,7 +149,9 @@ read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); rw_attribute(durability); -read_attribute(iodone); +read_attribute(io_done); +read_attribute(io_errors); +write_attribute(io_errors_reset); read_attribute(io_latency_read); read_attribute(io_latency_write); @@ -240,7 +214,7 @@ read_attribute(copy_gc_wait); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); -read_attribute(rebalance_work); +read_attribute(rebalance_status); rw_attribute(promote_whole_extents); read_attribute(new_stripes); @@ -248,7 +222,6 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); -read_attribute(data_jobs); read_attribute(moving_ctxts); #ifdef CONFIG_BCACHEFS_TESTS @@ -281,7 +254,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; enum btree_id id; @@ -292,18 +265,18 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; - int ret; + int ret = 0; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); for (id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - for_each_btree_key(&trans, iter, id, POS_MIN, + for_each_btree_key(trans, iter, id, POS_MIN, BTREE_ITER_ALL_SNAPSHOTS, k, ret) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -337,10 +310,10 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c else if (compressed) nr_compressed_extents++; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; @@ -370,7 +343,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "%s: ", 
bch2_btree_ids[c->gc_gens_btree]); + prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); bch2_bpos_to_text(out, c->gc_gens_pos); prt_printf(out, "\n"); } @@ -415,8 +388,8 @@ SHOW(bch2_fs) if (attr == &sysfs_copy_gc_wait) bch2_copygc_wait_to_text(out, c); - if (attr == &sysfs_rebalance_work) - bch2_rebalance_work_to_text(out, c); + if (attr == &sysfs_rebalance_status) + bch2_rebalance_status_to_text(out, c); sysfs_print(promote_whole_extents, c->promote_whole_extents); @@ -458,9 +431,6 @@ SHOW(bch2_fs) if (attr == &sysfs_io_timers_write) bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - if (attr == &sysfs_data_jobs) - bch2_data_jobs_to_text(out, c); - if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); @@ -678,10 +648,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, - &sysfs_rebalance_work, + &sysfs_rebalance_status, sysfs_pd_controller_files(rebalance), - &sysfs_data_jobs, &sysfs_moving_ctxts, &sysfs_internal_uuid, @@ -740,10 +709,8 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_by_id(&c->opts, id, v); if ((id == Opt_background_target || - id == Opt_background_compression) && v) { - bch2_rebalance_add_work(c, S64_MAX); - rebalance_wakeup(c); - } + id == Opt_background_compression) && v) + bch2_set_rebalance_needs_scan(c, 0); ret = size; err: @@ -915,7 +882,7 @@ static const char * const bch2_rw[] = { NULL }; -static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) +static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) { int rw, i; @@ -943,13 +910,8 @@ SHOW(bch2_dev) sysfs_print(discard, ca->mi.discard); if (attr == &sysfs_label) { - if (ca->mi.group) { - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, c->disk_sb.sb, - ca->mi.group - 1); - mutex_unlock(&c->sb_lock); - } - + if (ca->mi.group) + bch2_disk_path_to_text(out, c, ca->mi.group - 1); prt_char(out, '\n'); } @@ -963,8 +925,11 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_iodone) - dev_iodone_to_text(out, ca); + if (attr == &sysfs_io_done) + dev_io_done_to_text(out, ca); + + if (attr == &sysfs_io_errors) + bch2_dev_io_errors_to_text(out, ca); sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); @@ -995,7 +960,7 @@ STORE(bch2_dev) bool v = strtoul_or_return(buf); mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); if (v != BCH_MEMBER_DISCARD(mi)) { SET_BCH_MEMBER_DISCARD(mi, v); @@ -1008,9 +973,9 @@ STORE(bch2_dev) u64 v = strtoul_or_return(buf); mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - if (v != BCH_MEMBER_DURABILITY(mi)) { + if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { SET_BCH_MEMBER_DURABILITY(mi, v + 1); bch2_write_super(c); } @@ -1031,6 +996,9 @@ STORE(bch2_dev) return ret; } + if (attr == &sysfs_io_errors_reset) + bch2_dev_errors_reset(ca); + return size; } SYSFS_OPS(bch2_dev); @@ -1048,7 +1016,9 @@ struct attribute *bch2_dev_files[] = { &sysfs_label, &sysfs_has_data, - &sysfs_iodone, + &sysfs_io_done, + &sysfs_io_errors, + &sysfs_io_errors_reset, &sysfs_io_latency_read, &sysfs_io_latency_write, diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 1d4b0a5..2fc9e60 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -4,7 +4,7 @@ #include "bcachefs.h" #include "btree_update.h" #include 
"journal_reclaim.h" -#include "subvolume.h" +#include "snapshot.h" #include "tests.h" #include "linux/kthread.h" @@ -31,7 +31,7 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -39,44 +39,40 @@ static int test_delete(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &k.k_i, 0)); - if (ret) { - bch_err_msg(c, ret, "update error"); + bch2_trans_update(trans, &iter, &k.k_i, 0)); + bch_err_msg(c, ret, "update error"); + if (ret) goto err; - } pr_info("deleting once"); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); - if (ret) { - bch_err_msg(c, ret, "delete error (first)"); + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error (first)"); + if (ret) goto err; - } pr_info("deleting twice"); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); - if (ret) { - bch_err_msg(c, ret, "delete error (second)"); + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error (second)"); + if (ret) goto err; - } err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_delete_written(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -84,214 +80,193 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &k.k_i, 0)); - if (ret) { - bch_err_msg(c, ret, "update error"); + bch2_trans_update(trans, &iter, &k.k_i, 0)); + bch_err_msg(c, ret, "update error"); + if (ret) goto err; - } - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_journal_flush_all_pins(&c->journal); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); - if (ret) { - bch_err_msg(c, ret, "delete error"); + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error"); + if (ret) goto err; - } err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); for 
(i = 0; i < nr; i++) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i; - k.k.p.snapshot = U32_MAX; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i; + ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, - NULL, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating backwards"); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) goto err; - } BUG_ON(i); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test extents"); for (i = 0; i < nr; i += 8) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i + 8; - k.k.p.snapshot = U32_MAX; - k.k.size = 8; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 8; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating backwards"); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) goto err; - } BUG_ON(i); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); 
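/*
 * Editor's note: every test in this file is converted with the same
 * mechanical pattern, worth stating once. Before, the transaction lived on
 * the caller's stack; now it is an opaque handle owned by the fs (both
 * APIs appear verbatim in this patch):
 *
 *	// old
 *	struct btree_trans trans;
 *	bch2_trans_init(&trans, c, 0, 0);
 *	...use &trans...
 *	bch2_trans_exit(&trans);
 *
 *	// new
 *	struct btree_trans *trans = bch2_trans_get(c);
 *	...use trans...
 *	bch2_trans_put(trans);
 *
 * The same rewrite shows up in sysfs.c above, and the bch2_trans_run() /
 * bch2_trans_do() call sites now name "trans" as a pointer rather than
 * taking its address.
 */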
pr_info("inserting test keys"); for (i = 0; i < nr; i++) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i * 2; - k.k.p.snapshot = U32_MAX; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i * 2; + ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, - NULL, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i += 2; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr * 2); @@ -299,7 +274,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i >= nr * 2) @@ -317,45 +292,41 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) } ret = 0; err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); for (i = 0; i < nr; i += 16) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i + 16; - k.k.p.snapshot = U32_MAX; - k.k.size = 8; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 16; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -363,10 +334,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i += 16; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr); @@ -374,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i == nr) @@ -386,13 +356,12 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = k.k->p.offset; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards by slots"); + bch_err_msg(c, ret, "error iterating forwards by slots"); + if (ret) goto err; - } ret = 0; err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return 0; } @@ -402,43 +371,41 @@ err: */ static int test_peek_end(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = 
bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return 0; } static int test_peek_end_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return 0; } @@ -458,10 +425,8 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); - if (ret) - bch_err_fn(c, ret); + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); + bch_err_fn(c, ret); return ret; } @@ -515,10 +480,9 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, k.k_i.k.size = len; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, + bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -538,7 +502,7 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) /* Test skipping over keys in unrelated snapshots: */ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie cookie; @@ -546,20 +510,19 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); if (ret) return ret; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + trans = bch2_trans_get(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + 
bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -572,13 +535,12 @@ static int test_snapshots(struct bch_fs *c, u64 nr) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); if (ret) return ret; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(&trans, U32_MAX, + bch2_snapshot_node_create(trans, U32_MAX, snapids, snapid_subvols, 2)); @@ -589,12 +551,8 @@ static int test_snapshots(struct bch_fs *c, u64 nr) swap(snapids[0], snapids[1]); ret = test_snapshot_filter(c, snapids[0], snapids[1]); - if (ret) { - bch_err_msg(c, ret, "from test_snapshot_filter"); - return ret; - } - - return 0; + bch_err_msg(c, ret, "from test_snapshot_filter"); + return ret; } /* perf tests */ @@ -609,38 +567,34 @@ static u64 test_rand(void) static int rand_insert(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_cookie k; int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); k.k.p.snapshot = U32_MAX; - ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int rand_insert_multi(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_cookie k[8]; int ret = 0; unsigned j; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { for (j = 0; j < ARRAY_SIZE(k); j++) { bkey_cookie_init(&k[j].k_i); @@ -648,46 +602,45 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) k[j].k.p.snapshot = U32_MAX; } - ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int rand_lookup(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); for 
(i = 0; i < nr; i++) { bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -703,8 +656,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(trans->c, ret, "lookup error"); + bch_err_msg(trans->c, ret, "lookup error"); if (ret) return ret; @@ -719,26 +671,25 @@ static int rand_mixed_trans(struct btree_trans *trans, static int rand_mixed(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie cookie; int ret = 0; u64 i, rand; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { rand = test_rand(); - ret = commit_do(&trans, NULL, NULL, 0, - rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + ret = commit_do(trans, NULL, NULL, 0, + rand_mixed_trans(trans, &iter, &cookie, i, rand)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -766,22 +717,20 @@ err: static int rand_delete(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i++) { struct bpos pos = SPOS(0, test_rand(), U32_MAX); - ret = commit_do(&trans, NULL, NULL, 0, - __do_delete(&trans, pos)); + ret = commit_do(trans, NULL, NULL, 0, + __do_delete(trans, pos)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -794,14 +743,14 @@ static int seq_insert(struct bch_fs *c, u64 nr) bkey_cookie_init(&insert.k_i); return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; insert.k.p = iter.pos; - bch2_trans_update(&trans, &iter, &insert.k_i, 0); + bch2_trans_update(trans, &iter, &insert.k_i, 0); }))); } @@ -811,7 +760,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) struct bkey_s_c k; return bch2_trans_run(c, - for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -823,14 +772,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) struct bkey_s_c k; return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, &iter, &u.k_i, 0); + bch2_trans_update(trans, &iter, &u.k_i, 0); }))); } diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c index d294b3d..dc48b52 100644 --- a/libbcachefs/trace.c +++ b/libbcachefs/trace.c @@ -7,10 +7,11 @@ #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" +#include "move_types.h" #include 
"opts.h" +#include "six.h" #include -#include #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index a743ab4..09a5303 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(btree_node, TP_printk("%d,%d %u %s %llu:%llu:%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->level, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); @@ -137,6 +137,25 @@ DEFINE_EVENT(bio, read_promote, TP_ARGS(bio) ); +TRACE_EVENT(read_nopromote, + TP_PROTO(struct bch_fs *c, int ret), + TP_ARGS(c, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, ret, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + + TP_printk("%d,%d ret %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ret) +); + DEFINE_EVENT(bio, read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) @@ -177,10 +196,9 @@ DEFINE_EVENT(bio, journal_write, TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, - u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), @@ -190,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start, __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) - __field(u64, prereserved ) - __field(u64, prereserved_total ) __field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) @@ -204,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start, __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; - __entry->prereserved = prereserved; - __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, - __entry->prereserved, - __entry->prereserved_total, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, @@ -403,37 +415,55 @@ TRACE_EVENT(btree_path_relock_fail, __field(u8, level ) TRACE_BPOS_entries(pos) __array(char, node, 24 ) + __field(u8, self_read_count ) + __field(u8, self_intent_count) + __field(u8, read_count ) + __field(u8, intent_count ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( struct btree *b = btree_path_node(path, level); + struct six_lock_count c; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - if (IS_ERR(b)) + + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + __entry->self_read_count = c.n[SIX_LOCK_read]; 
+ __entry->self_intent_count = c.n[SIX_LOCK_intent]; + + if (IS_ERR(b)) { strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); - else + } else { + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + } __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? six_lock_seq(&path->l[level].b->c.lock) : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->level, __entry->node, + __entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count, __entry->iter_lock_seq, __entry->node_lock_seq) ); @@ -475,7 +505,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->self_intent_count = c.n[SIX_LOCK_intent]; c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; - __entry->intent_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? six_lock_seq(&path->l[level].b->c.lock) @@ -485,7 +515,7 @@ TRACE_EVENT(btree_path_upgrade_fail, TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -730,25 +760,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, ); TRACE_EVENT(move_data, - TP_PROTO(struct bch_fs *c, u64 sectors_moved, - u64 keys_moved), - TP_ARGS(c, sectors_moved, keys_moved), + TP_PROTO(struct bch_fs *c, + struct bch_move_stats *stats), + TP_ARGS(c, stats), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, sectors_moved ) + __field(dev_t, dev ) __field(u64, keys_moved ) + __field(u64, keys_raced ) + __field(u64, sectors_seen ) + __field(u64, sectors_moved ) + __field(u64, sectors_raced ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->sectors_moved = sectors_moved; - __entry->keys_moved = keys_moved; + __entry->dev = c->dev; + __entry->keys_moved = atomic64_read(&stats->keys_moved); + __entry->keys_raced = atomic64_read(&stats->keys_raced); + __entry->sectors_seen = atomic64_read(&stats->sectors_seen); + __entry->sectors_moved = atomic64_read(&stats->sectors_moved); + __entry->sectors_raced = atomic64_read(&stats->sectors_raced); ), - TP_printk("%d,%d sectors_moved %llu keys_moved %llu", + TP_printk("%d,%d keys moved %llu raced %llu" + "sectors seen %llu moved %llu raced %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->keys_moved) + __entry->keys_moved, + __entry->keys_raced, + __entry->sectors_seen, + __entry->sectors_moved, + __entry->sectors_raced) ); TRACE_EVENT(evacuate_bucket, @@ -975,7 +1016,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_printk("%s %pS btree %s pos %llu:%llu:%u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, 
__entry->pos_offset, __entry->pos_snapshot) @@ -995,13 +1036,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); +struct get_locks_fail; + TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, unsigned old_locks_want, - unsigned new_locks_want), - TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), + unsigned new_locks_want, + struct get_locks_fail *f), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), TP_STRUCT__entry( __array(char, trans_fn, 32 ) @@ -1009,6 +1053,11 @@ TRACE_EVENT(trans_restart_upgrade, __field(u8, btree_id ) __field(u8, old_locks_want ) __field(u8, new_locks_want ) + __field(u8, level ) + __field(u32, path_seq ) + __field(u32, node_seq ) + __field(u32, path_alloc_seq ) + __field(u32, downgrade_seq) TRACE_BPOS_entries(pos) ), @@ -1018,18 +1067,28 @@ TRACE_EVENT(trans_restart_upgrade, __entry->btree_id = path->btree_id; __entry->old_locks_want = old_locks_want; __entry->new_locks_want = new_locks_want; + __entry->level = f->l; + __entry->path_seq = path->l[f->l].lock_seq; + __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; + __entry->path_alloc_seq = path->alloc_seq; + __entry->downgrade_seq = path->downgrade_seq; TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->old_locks_want, - __entry->new_locks_want) + __entry->new_locks_want, + __entry->level, + __entry->path_seq, + __entry->node_seq, + __entry->path_alloc_seq, + __entry->downgrade_seq) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, @@ -1182,7 +1241,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -1190,6 +1249,42 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +TRACE_EVENT(path_downgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned, old_locks_want ) + __field(unsigned, new_locks_want ) + __field(unsigned, btree ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = path->locks_want; + __entry->btree = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->old_locks_want, + __entry->new_locks_want, + bch2_btree_id_str(__entry->btree), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), 
diff --git a/libbcachefs/util.c b/libbcachefs/util.c index ae4f6de..c3303b0 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -112,10 +112,10 @@ got_unit: #define parse_or_ret(cp, _f) \ do { \ - int ret = _f; \ - if (ret < 0) \ - return ret; \ - cp += ret; \ + int _ret = _f; \ + if (_ret < 0) \ + return _ret; \ + cp += _ret; \ } while (0) static int __bch2_strtou64_h(const char *cp, u64 *res) @@ -216,6 +216,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[]) while ((p = strsep(&s, ","))) { int flag = match_string(list, -1, p); + if (flag < 0) { ret = -1; break; @@ -268,6 +269,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) { +#ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; int ret = 0; @@ -288,6 +290,9 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) up_read(&task->signal->exec_update_lock); return ret; +#else + return 0; +#endif } void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) @@ -310,6 +315,58 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) return ret; } +#ifndef __KERNEL__ +#include +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + strim(buf); + prt_str(out, buf); +} +#else +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_str(out, buf); +} +#endif + +static const struct time_unit { + const char *name; + u64 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -354,6 +411,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; bch2_quantiles_update(&stats->quantiles, duration); } @@ -367,20 +425,24 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, } } +static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + for (struct bch2_time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, struct bch2_time_stat_buffer *b) { - struct bch2_time_stat_buffer_entry *i; unsigned long flags; spin_lock_irqsave(&stats->lock, flags); - for (i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); + __bch2_time_stats_clear_buffer(stats, b); spin_unlock_irqrestore(&stats->lock, flags); - - b->nr = 0; } void __bch2_time_stats_update(struct bch2_time_stats *stats,
u64 start, u64 end) @@ -418,40 +480,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) preempt_enable(); } } -#endif - -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = pick_time_units(ns); - - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -} static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { @@ -462,8 +490,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) prt_printf(out, "%s", u->name); } -#define TABSTOP_SIZE 12 - static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { prt_str(out, name); @@ -472,12 +498,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 prt_newline(out); } +#define TABSTOP_SIZE 12 + void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; s64 f_mean = 0, d_mean = 0; u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + /* * avoid divide by zero */ @@ -523,6 +561,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); + pr_name_and_units(out, "total:", stats->total_duration); prt_printf(out, "mean:"); prt_tab(out); @@ -580,6 +619,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats last_q = q; } } +#else +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} +#endif void bch2_time_stats_exit(struct bch2_time_stats *stats) { @@ -600,11 +642,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) /** * bch2_ratelimit_delay() - return how long to delay until the next time to do - * some work - * - * @d - the struct bch_ratelimit to update - * - * Returns the amount of time to delay by, in jiffies + * some work + * @d: the struct bch_ratelimit to update + * Returns: the amount of time to delay by, in jiffies */ u64 bch2_ratelimit_delay(struct bch_ratelimit *d) { @@ -617,9 +657,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d) /** * bch2_ratelimit_increment() - increment @d by the amount of work done - * - * @d - the struct bch_ratelimit to update - * @done - the amount of work done, in arbitrary units + * @d: the struct bch_ratelimit to update + * @done: the amount of work done, in arbitrary units */ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) { @@ -756,10 +795,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size) } } -int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) +int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) { while (size) { - struct page *page = alloc_pages_noprof(gfp_mask, 0); + struct page *page = alloc_pages(gfp_mask, 0); unsigned 
len = min_t(size_t, PAGE_SIZE, size); if (!page) @@ -797,9 +836,10 @@ void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) struct bvec_iter iter; __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_atomic(bv.bv_page); + void *dstp = kmap_local_page(bv.bv_page); + memcpy(dstp + bv.bv_offset, src, bv.bv_len); - kunmap_atomic(dstp); + kunmap_local(dstp); src += bv.bv_len; } @@ -811,9 +851,10 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) struct bvec_iter iter; __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_atomic(bv.bv_page); + void *srcp = kmap_local_page(bv.bv_page); + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); - kunmap_atomic(srcp); + kunmap_local(srcp); dst += bv.bv_len; } diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 5fa29da..54e309d 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -60,13 +60,12 @@ static inline void vpfree(void *p, size_t size) free_pages((unsigned long) p, get_order(size)); } -static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) +static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { - return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc_noprof(size, gfp_mask); + return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + __vmalloc(size, gfp_mask); } -#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) static inline void kvpfree(void *p, size_t size) { @@ -76,13 +75,12 @@ static inline void kvpfree(void *p, size_t size) vpfree(p, size); } -static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) +static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) { return size < PAGE_SIZE - ? kmalloc_noprof(size, gfp_mask) - : vpmalloc_noprof(size, gfp_mask); + ? kmalloc(size, gfp_mask) + : vpmalloc(size, gfp_mask); } -#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp)) int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); @@ -246,26 +244,7 @@ do { \ #define prt_bitflags(...) 
bch2_prt_bitflags(__VA_ARGS__) void bch2_pr_time_units(struct printbuf *, u64); - -#ifdef __KERNEL__ -static inline void pr_time(struct printbuf *out, u64 time) -{ - prt_printf(out, "%llu", time); -} -#else -#include -static inline void pr_time(struct printbuf *out, u64 _time) -{ - char time_str[64]; - time_t time = _time; - struct tm *tm = localtime(&time); - size_t err = strftime(time_str, sizeof(time_str), "%c", tm); - if (!err) - prt_printf(out, "(formatting error)"); - else - prt_printf(out, "%s", time_str); -} -#endif +void bch2_prt_datetime(struct printbuf *, time64_t); #ifdef __KERNEL__ static inline void uuid_unparse_lower(u8 *uuid, char *out) @@ -393,8 +372,9 @@ struct bch2_time_stat_buffer { struct bch2_time_stats { spinlock_t lock; /* all fields are in nanoseconds */ - u64 max_duration; u64 min_duration; + u64 max_duration; + u64 total_duration; u64 max_freq; u64 min_freq; u64 last_event; @@ -409,15 +389,39 @@ struct bch2_time_stats { #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -#endif static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { __bch2_time_stats_update(stats, start, local_clock()); } +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + if (v != !!*start) { + if (!v) { + bch2_time_stats_update(stats, *start); + *start = 0; + } else { + *start = local_clock() ?: 1; + return true; + } + } + + return false; +} +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + bool ret = v && !*start; + *start = v; + return ret; +} +#endif + void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); @@ -468,8 +472,10 @@ struct bch_pd_controller { s64 last_change; s64 last_target; - /* If true, the rate will not increase if bch2_ratelimit_delay() - * is not being called often enough. */ + /* + * If true, the rate will not increase if bch2_ratelimit_delay() + * is not being called often enough. 
+ */ bool backpressure; }; @@ -532,9 +538,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch2_bio_map(struct bio *bio, void *base, size_t); -int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t); -#define bch2_bio_alloc_pages(_bio, _size, _gfp) \ - alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp)) +int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); static inline sector_t bdev_sectors(struct block_device *bdev) { @@ -607,6 +611,7 @@ static inline void __memcpy_u64s(void *dst, const void *src, { #ifdef CONFIG_X86_64 long d0, d1, d2; + asm volatile("rep ; movsq" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (u64s), "1" (dst), "2" (src) @@ -683,6 +688,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, #ifdef CONFIG_X86_64 long d0, d1, d2; + asm volatile("std ;\n" "rep ; movsq\n" "cld ;\n" @@ -775,12 +781,12 @@ static inline void __move_gap(void *array, size_t element_size, #define bubble_sort(_base, _nr, _cmp) \ do { \ - ssize_t _i, _end; \ + ssize_t _i, _last; \ bool _swapped = true; \ \ - for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ + for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\ _swapped = false; \ - for (_i = 0; _i < _end; _i++) \ + for (_i = 0; _i < _last; _i++) \ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ swap((_base)[_i], (_base)[_i + 1]); \ _swapped = true; \ @@ -841,6 +847,11 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } +static inline int cmp_le32(__le32 l, __le32 r) +{ + return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); +} + #include #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index ef030fc..cb4f33e 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -13,10 +13,9 @@ /** * bch2_varint_encode - encode a variable length integer - * @out - destination to encode to - * @v - unsigned integer to encode - * - * Returns the size in bytes of the encoded integer - at most 9 bytes + * @out: destination to encode to + * @v: unsigned integer to encode + * Returns: size in bytes of the encoded integer - at most 9 bytes */ int bch2_varint_encode(u8 *out, u64 v) { @@ -40,11 +39,10 @@ int bch2_varint_encode(u8 *out, u64 v) /** * bch2_varint_decode - encode a variable length integer - * @in - varint to decode - * @end - end of buffer to decode from - * @out - on success, decoded integer - * - * Returns the size in bytes of the decoded integer - or -1 on failure (would + * @in: varint to decode + * @end: end of buffer to decode from + * @out: on success, decoded integer + * Returns: size in bytes of the decoded integer - or -1 on failure (would * have read past the end of the buffer) */ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) @@ -59,6 +57,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) if (likely(bytes < 9)) { __le64 v_le = 0; + memcpy(&v_le, in, bytes); v = le64_to_cpu(v_le); v >>= bytes; @@ -72,6 +71,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) /** * bch2_varint_encode_fast - fast version of bch2_varint_encode + * @out: destination to encode to + * @v: unsigned integer to encode + * Returns: size in bytes of the encoded integer - at most 9 bytes * * This version assumes it's always safe to write 8 bytes to @out, even if the * encoded integer would be smaller. 
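As a round-trip illustration of the varint helpers whose kernel-doc is updated above: a minimal sketch, assuming only the bch2_varint_encode()/bch2_varint_decode() signatures shown in this file (the wrapper function name is hypothetical):

static void varint_roundtrip_check(void)
{
	u8 buf[9];		/* per the doc above, encodings are at most 9 bytes */
	u64 v = 123456789, decoded;
	int enc = bch2_varint_encode(buf, v);	/* returns encoded size in bytes */
	int dec = bch2_varint_decode(buf, buf + enc, &decoded);

	/* decode must consume exactly what encode produced, and round-trip: */
	BUG_ON(enc < 1 || dec != enc || decoded != v);
}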
@@ -95,6 +97,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v) /** * bch2_varint_decode_fast - fast version of bch2_varint_decode + * @in: varint to decode + * @end: end of buffer to decode from + * @out: on success, decoded integer + * Returns: size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) * * This version assumes that it is safe to read at most 8 bytes past the end of * @end (we still return an error if the varint extends past @end). diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h index 53a694d..a6561b4 100644 --- a/libbcachefs/vstructs.h +++ b/libbcachefs/vstructs.h @@ -41,11 +41,11 @@ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) #define vstruct_next(_s) \ - ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) + ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) + ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_end(_s) \ - ((void *) ((_s)->_data + __vstruct_u64s(_s))) + ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_for_each(_s, _i) \ for (_i = (_s)->start; \ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 70f7800..79d9826 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "acl.h" #include "bkey_methods.h" #include "btree_update.h" #include "extents.h" @@ -69,46 +70,38 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len)); + int ret = 0; - if (bkey_val_u64s(k.k) < - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))) { - prt_printf(err, "value too small (%zu < %u)", - bkey_val_u64s(k.k), - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, + xattr_val_size_too_small, + "value too small (%zu < %u)", + bkey_val_u64s(k.k), val_u64s); /* XXX why +4 ? 
*/ - if (bkey_val_u64s(k.k) > - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)) { - prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)); - return -BCH_ERR_invalid_bkey; - } - - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (!handler) { - prt_printf(err, "invalid type (%u)", xattr.v->x_type); - return -BCH_ERR_invalid_bkey; - } - - if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { - prt_printf(err, "xattr name has invalid characters"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + val_u64s = xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4); + + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, + xattr_val_size_too_big, + "value too big (%zu > %u)", + bkey_val_u64s(k.k), val_u64s); + + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, + xattr_invalid_type, + "invalid type (%u)", xattr.v->x_type); + + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, + xattr_name_invalid_chars, + "xattr name has invalid characters"); +fsck_err: + return ret; } void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, @@ -130,6 +123,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, xattr.v->x_name, le16_to_cpu(xattr.v->x_val_len), (char *) xattr_val(xattr.v)); + + if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || + xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { + prt_char(out, ' '); + bch2_acl_to_text(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + } } static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, @@ -299,24 +299,22 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; u32 snapshot; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, SPOS(inum, offset, snapshot), POS(inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_xattr) @@ -328,12 +326,12 @@ retry: } offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) goto out; @@ -358,7 +356,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); + bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); return bch2_err_class(ret); } @@ -373,18 +371,14 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = 
inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct bch_inode_unpacked inode_u; - struct btree_trans trans; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = commit_do(&trans, NULL, NULL, 0, - bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + ret = bch2_trans_run(c, + commit_do(trans, NULL, NULL, 0, + bch2_xattr_set(trans, inode_inum(inode), &inode_u, &hash, name, value, size, - handler->flags, flags)); - if (!ret) - bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); - bch2_trans_exit(&trans); + handler->flags, flags)) ?: + (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); return bch2_err_class(ret); } @@ -494,7 +488,8 @@ struct inode_opt_set { bool defined; }; -static int inode_opt_set_fn(struct bch_inode_info *inode, +static int inode_opt_set_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -557,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, s.v = v + 1; s.defined = true; } else { + /* + * Check if this option was set on the parent - if so, switch + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -565,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } + spin_unlock(&dentry->d_lock); s.defined = false; } @@ -587,7 +591,7 @@ err: if (value && (opt_id == Opt_background_compression || opt_id == Opt_background_target)) - bch2_rebalance_add_work(c, inode->v.i_blocks); + bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); } diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index f5a52e3..1337f31 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/linux/blkdev.c b/linux/blkdev.c index ea901a4..5cac6eb 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -162,7 +162,7 @@ sector_t get_capacity(struct gendisk *disk) return bytes >> 9; } -void blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, void *holder) { fdatasync(bdev->bd_fd); close(bdev->bd_sync_fd); @@ -170,25 +170,25 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) free(bdev); } -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder) +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hop) { struct block_device *bdev; int fd, sync_fd, buffered_fd, flags = 0; - if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE)) + if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE)) flags = O_RDWR; - else if (mode & FMODE_READ) + else if (mode & BLK_OPEN_READ) flags = O_RDONLY; - else if (mode & FMODE_WRITE) + else if (mode & BLK_OPEN_WRITE) flags = O_WRONLY; - if (!(mode & FMODE_BUFFERED)) + if (!(mode & BLK_OPEN_BUFFERED)) flags |= O_DIRECT; #if 0 /* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */ - if
(mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) flags |= O_EXCL; #endif buffered_fd = open(path, flags & ~O_DIRECT); @@ -289,6 +289,8 @@ static void sync_write(struct bio *bio, struct iovec * iov, unsigned i) sync_check(bio, ret); } +static DECLARE_WAIT_QUEUE_HEAD(aio_events_completed); + static int aio_completion_thread(void *arg) { struct io_event events[8], *ev; @@ -303,6 +305,8 @@ static int aio_completion_thread(void *arg) continue; if (ret < 0) die("io_getevents() error: %s", strerror(-ret)); + if (ret) + wake_up(&aio_events_completed); for (ev = events; ev < events + ret; ev++) { struct bio *bio = (struct bio *) ev->data; @@ -394,7 +398,10 @@ static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode) }, *iocbp = &iocb; atomic_inc(&running_requests); - ret = io_submit(aio_ctx, 1, &iocbp); + + wait_event(aio_events_completed, + (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN); + if (ret != 1) die("io_submit err: %s", strerror(-ret)); } diff --git a/linux/closure.c b/linux/closure.c index 0855e69..f86c9ee 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -21,6 +21,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); if (!r) { + smp_acquire__after_ctrl_dep(); + + cl->closure_get_happened = false; + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); @@ -43,7 +47,7 @@ /* For clearing flags with the same atomic op as a put */ void closure_sub(struct closure *cl, int v) { - closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); + closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining)); } EXPORT_SYMBOL(closure_sub); @@ -52,7 +56,7 @@ EXPORT_SYMBOL(closure_sub); */ void closure_put(struct closure *cl) { - closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); + closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining)); } EXPORT_SYMBOL(closure_put); @@ -90,6 +94,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) if (atomic_read(&cl->remaining) & CLOSURE_WAITING) return false; + cl->closure_get_happened = true; closure_set_waiting(cl, _RET_IP_); atomic_add(CLOSURE_WAITING + 1, &cl->remaining); llist_add(&cl->list, &waitlist->list); diff --git a/linux/shrinker.c b/linux/shrinker.c index 0b5715b..dfefad4 100644 --- a/linux/shrinker.c +++ b/linux/shrinker.c @@ -131,7 +131,21 @@ void run_shrinkers(gfp_t gfp_mask, bool allocation_failed) static int shrinker_thread(void *arg) { while (!kthread_should_stop()) { - sleep(1); + struct timespec to; + int v; + + clock_gettime(CLOCK_MONOTONIC, &to); + to.tv_sec += 1; + __set_current_state(TASK_INTERRUPTIBLE); + errno = 0; + while ((v = READ_ONCE(current->state)) != TASK_RUNNING && + errno != ETIMEDOUT) + futex(&current->state, FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG, + v, &to, NULL, (uint32_t)~0); + if (kthread_should_stop()) + break; + if (v != TASK_RUNNING) + __set_current_state(TASK_RUNNING); run_shrinkers(GFP_KERNEL, false); } diff --git a/linux/six.c b/linux/six.c deleted file mode 100644 index 0b9c4bb..0000000 --- a/linux/six.c +++ /dev/null @@ -1,893 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) do {} while (0) -#endif - -#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r,
1, NULL, ip) -#define six_release(l, ip) lock_release(l, ip) - -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); - -#define SIX_LOCK_HELD_read_OFFSET 0 -#define SIX_LOCK_HELD_read ~(~0U << 26) -#define SIX_LOCK_HELD_intent (1U << 26) -#define SIX_LOCK_HELD_write (1U << 27) -#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) -#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -#define SIX_LOCK_NOSPIN (1U << 31) - -struct six_lock_vals { - /* Value we add to the lock in order to take the lock: */ - u32 lock_val; - - /* If the lock has this value (used as a mask), taking the lock fails: */ - u32 lock_fail; - - /* Mask that indicates lock is held for this type: */ - u32 held_mask; - - /* Waitlist we wakeup when releasing the lock: */ - enum six_lock_type unlock_wakeup; -}; - -static const struct six_lock_vals l[] = { - [SIX_LOCK_read] = { - .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, - .lock_fail = SIX_LOCK_HELD_write, - .held_mask = SIX_LOCK_HELD_read, - .unlock_wakeup = SIX_LOCK_write, - }, - [SIX_LOCK_intent] = { - .lock_val = SIX_LOCK_HELD_intent, - .lock_fail = SIX_LOCK_HELD_intent, - .held_mask = SIX_LOCK_HELD_intent, - .unlock_wakeup = SIX_LOCK_intent, - }, - [SIX_LOCK_write] = { - .lock_val = SIX_LOCK_HELD_write, - .lock_fail = SIX_LOCK_HELD_read, - .held_mask = SIX_LOCK_HELD_write, - .unlock_wakeup = SIX_LOCK_read, - }, -}; - -static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -{ - if ((atomic_read(&lock->state) & mask) != mask) - atomic_or(mask, &lock->state); -} - -static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -{ - if (atomic_read(&lock->state) & mask) - atomic_and(~mask, &lock->state); -} - -static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - u32 old, struct task_struct *owner) -{ - if (type != SIX_LOCK_intent) - return; - - if (!(old & SIX_LOCK_HELD_intent)) { - EBUG_ON(lock->owner); - lock->owner = owner; - } else { - EBUG_ON(lock->owner != current); - } -} - -static inline unsigned pcpu_read_count(struct six_lock *lock) -{ - unsigned read_count = 0; - int cpu; - - for_each_possible_cpu(cpu) - read_count += *per_cpu_ptr(lock->readers, cpu); - return read_count; -} - -/* - * __do_six_trylock() - main trylock routine - * - * Returns 1 on success, 0 on failure - * - * In percpu reader mode, a failed trylock may cause a spurious trylock failure - * for anoter thread taking the competing lock type, and we may havve to do a - * wakeup: when a wakeup is required, we return -1 - wakeup_type. - */ -static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, - struct task_struct *task, bool try) -{ - int ret; - u32 old; - - EBUG_ON(type == SIX_LOCK_write && lock->owner != task); - EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); - - /* - * Percpu reader mode: - * - * The basic idea behind this algorithm is that you can implement a lock - * between two threads without any atomics, just memory barriers: - * - * For two threads you'll need two variables, one variable for "thread a - * has the lock" and another for "thread b has the lock". - * - * To take the lock, a thread sets its variable indicating that it holds - * the lock, then issues a full memory barrier, then reads from the - * other thread's variable to check if the other thread thinks it has - * the lock. If we raced, we backoff and retry/sleep. 
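The handshake described in the comment above, reduced to a standalone two-thread sketch with C11 atomics; the names here are hypothetical, and six.c itself implements the same idea with percpu read counters plus the atomic state word rather than per-thread flags:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic bool holds_lock[2];	/* holds_lock[i]: thread i thinks it has the lock */

static bool handshake_trylock(int me)
{
	atomic_store_explicit(&holds_lock[me], true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the "full memory barrier" */

	if (atomic_load_explicit(&holds_lock[!me], memory_order_relaxed)) {
		/* raced with the other thread: back off, caller retries or sleeps */
		atomic_store_explicit(&holds_lock[me], false, memory_order_relaxed);
		return false;
	}
	return true;	/* sole holder until handshake_unlock(me) */
}

static void handshake_unlock(int me)
{
	atomic_store_explicit(&holds_lock[me], false, memory_order_release);
}

Note that, exactly as the comment says, both threads can fail the same attempt simultaneously, which is why failure feeds into retry/sleep rather than being treated as an error.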
- * - * Failure to take the lock may cause a spurious trylock failure in - * another thread, because we temporarily set the lock to indicate that - * we held it. This would be a problem for a thread in six_lock(), when - * they are calling trylock after adding themself to the waitlist and - * prior to sleeping. - * - * Therefore, if we fail to get the lock, and there were waiters of the - * type we conflict with, we will have to issue a wakeup. - * - * Since we may be called under wait_lock (and by the wakeup code - * itself), we return that the wakeup has to be done instead of doing it - * here. - */ - if (type == SIX_LOCK_read && lock->readers) { - preempt_disable(); - this_cpu_inc(*lock->readers); /* signal that we own lock */ - - smp_mb(); - - old = atomic_read(&lock->state); - ret = !(old & l[type].lock_fail); - - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); - - if (!ret && (old & SIX_LOCK_WAITING_write)) - ret = -1 - SIX_LOCK_write; - } else if (type == SIX_LOCK_write && lock->readers) { - if (try) { - atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } - - ret = !pcpu_read_count(lock); - - if (try && !ret) { - old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); - if (old & SIX_LOCK_WAITING_read) - ret = -1 - SIX_LOCK_read; - } - } else { - old = atomic_read(&lock->state); - do { - ret = !(old & l[type].lock_fail); - if (!ret || (type == SIX_LOCK_write && !try)) { - smp_mb(); - break; - } - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); - - EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); - } - - if (ret > 0) - six_set_owner(lock, type, old, task); - - EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && - (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); - - return ret; -} - -static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -{ - struct six_lock_waiter *w, *next; - struct task_struct *task; - bool saw_one; - int ret; -again: - ret = 0; - saw_one = false; - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, &lock->wait_list, list) { - if (w->lock_want != lock_type) - continue; - - if (saw_one && lock_type != SIX_LOCK_read) - goto unlock; - saw_one = true; - - ret = __do_six_trylock(lock, lock_type, w->task, false); - if (ret <= 0) - goto unlock; - - __list_del(w->list.prev, w->list.next); - task = w->task; - /* - * Do no writes to @w besides setting lock_acquired - otherwise - * we would need a memory barrier: - */ - barrier(); - w->lock_acquired = true; - wake_up_process(task); - } - - six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -unlock: - raw_spin_unlock(&lock->wait_lock); - - if (ret < 0) { - lock_type = -ret - 1; - goto again; - } -} - -__always_inline -static void six_lock_wakeup(struct six_lock *lock, u32 state, - enum six_lock_type lock_type) -{ - if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) - return; - - if (!(state & (SIX_LOCK_WAITING_read << lock_type))) - return; - - __six_lock_wakeup(lock, lock_type); -} - -__always_inline -static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -{ - int ret; - - ret = __do_six_trylock(lock, type, current, try); - if (ret < 0) - __six_lock_wakeup(lock, -ret - 1); - - return ret > 0; -} - -/** - * six_trylock_ip - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. 
_THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - if (!do_six_trylock(lock, type, true)) - return false; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - return true; -} -EXPORT_SYMBOL_GPL(six_trylock_ip); - -/** - * six_relock_ip - attempt to re-take a lock that was held previously - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @seq: lock sequence number obtained from six_lock_seq() while lock was - * held previously - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip) -{ - if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) - return false; - - if (six_lock_seq(lock) != seq) { - six_unlock_ip(lock, type, ip); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(six_relock_ip); - -#ifdef CONFIG_LOCK_SPIN_ON_OWNER - -static inline bool six_can_spin_on_owner(struct six_lock *lock) -{ - struct task_struct *owner; - bool ret; - - if (need_resched()) - return false; - - rcu_read_lock(); - owner = READ_ONCE(lock->owner); - ret = !owner || owner_on_cpu(owner); - rcu_read_unlock(); - - return ret; -} - -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner, - u64 end_time) -{ - bool ret = true; - unsigned loop = 0; - - rcu_read_lock(); - while (lock->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - if (!owner_on_cpu(owner) || need_resched()) { - ret = false; - break; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; - u64 end_time; - - if (type == SIX_LOCK_write) - return false; - - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; - - if (!osq_lock(&lock->osq)) - goto fail; - - end_time = sched_clock() + 10 * NSEC_PER_USEC; - - while (1) { - struct task_struct *owner; - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner, end_time)) - break; - - if (do_six_trylock(lock, type, false)) { - osq_unlock(&lock->osq); - preempt_enable(); - return true; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax(); - } - - osq_unlock(&lock->osq); -fail: - preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. 
This avoids getting - * scheduled out right after we obtained the lock. - */ - if (need_resched()) - schedule(); - - return false; -} - -#else /* CONFIG_LOCK_SPIN_ON_OWNER */ - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - return false; -} - -#endif - -noinline -static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret = 0; - - if (type == SIX_LOCK_write) { - EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); - atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } - - trace_contention_begin(lock, 0); - lock_contended(&lock->dep_map, ip); - - if (six_optimistic_spin(lock, type)) - goto out; - - wait->task = current; - wait->lock_want = type; - wait->lock_acquired = false; - - raw_spin_lock(&lock->wait_lock); - six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); - /* - * Retry taking the lock after taking waitlist lock, in case we raced - * with an unlock: - */ - ret = __do_six_trylock(lock, type, current, false); - if (ret <= 0) { - wait->start_time = local_clock(); - - if (!list_empty(&lock->wait_list)) { - struct six_lock_waiter *last = - list_last_entry(&lock->wait_list, - struct six_lock_waiter, list); - - if (time_before_eq64(wait->start_time, last->start_time)) - wait->start_time = last->start_time + 1; - } - - list_add_tail(&wait->list, &lock->wait_list); - } - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(ret > 0)) { - ret = 0; - goto out; - } - - if (unlikely(ret < 0)) { - __six_lock_wakeup(lock, -ret - 1); - ret = 0; - } - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - if (wait->lock_acquired) - break; - - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (unlikely(ret)) { - raw_spin_lock(&lock->wait_lock); - if (!wait->lock_acquired) - list_del(&wait->list); - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(wait->lock_acquired)) - do_six_unlock_type(lock, type); - break; - } - - schedule(); - } - - __set_current_state(TASK_RUNNING); -out: - if (ret && type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); - } - trace_contention_end(lock, 0); - - return ret; -} - -/** - * six_lock_ip_waiter - take a lock, with full waitlist interface - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @wait: pointer to wait object, which will be added to lock's waitlist - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * This is the most general six_lock() variant, with parameters to support full - * cycle detection for deadlock avoidance. - * - * The code calling this function must implement tracking of held locks, and the - * @wait object should be embedded into the struct that tracks held locks - - * which must also be accessible in a thread-safe way. - * - * @should_sleep_fn should invoke the cycle detector; it should walk each - * lock's waiters, and for each waiter recursively walk their held locks. - * - * When this function must block, @wait will be added to @lock's waitlist before - * calling trylock, and before calling @should_sleep_fn, and @wait will not be - * removed from the lock waitlist until the lock has been successfully acquired, - * or we abort. 
- * - * @wait.start_time will be monotonically increasing for any given waitlist, and - * thus may be used as a loop cursor. - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret; - - wait->start_time = 0; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); - - ret = do_six_trylock(lock, type, true) ? 0 - : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); - - if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - if (!ret) - lock_acquired(&lock->dep_map, ip); - - return ret; -} -EXPORT_SYMBOL_GPL(six_lock_ip_waiter); - -__always_inline -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - u32 state; - - if (type == SIX_LOCK_intent) - lock->owner = NULL; - - if (type == SIX_LOCK_read && - lock->readers) { - smp_mb(); /* unlock barrier */ - this_cpu_dec(*lock->readers); - smp_mb(); /* between unlocking and checking for waiters */ - state = atomic_read(&lock->state); - } else { - u32 v = l[type].lock_val; - - if (type != SIX_LOCK_read) - v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; - - EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); - state = atomic_sub_return_release(v, &lock->state); - } - - six_lock_wakeup(lock, state, l[type].unlock_wakeup); -} - -/** - * six_unlock_ip - drop a six lock - * @lock: lock to unlock - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * When a lock is held multiple times (because six_lock_incement()) was used), - * this decrements the 'lock held' counter by one. 
- * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 - */ -void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - EBUG_ON(type == SIX_LOCK_write && - !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - EBUG_ON((type == SIX_LOCK_write || - type == SIX_LOCK_intent) && - lock->owner != current); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - else - lock->seq++; - - if (type == SIX_LOCK_intent && - lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - - do_six_unlock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_unlock_ip); - -/** - * six_lock_downgrade - convert an intent lock to a read lock - * @lock: lock to dowgrade - * - * @lock will have read count incremented and intent count decremented - */ -void six_lock_downgrade(struct six_lock *lock) -{ - six_lock_increment(lock, SIX_LOCK_read); - six_unlock_intent(lock); -} -EXPORT_SYMBOL_GPL(six_lock_downgrade); - -/** - * six_lock_tryupgrade - attempt to convert read lock to an intent lock - * @lock: lock to upgrade - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_lock_tryupgrade(struct six_lock *lock) -{ - u32 old = atomic_read(&lock->state), new; - - do { - new = old; - - if (new & SIX_LOCK_HELD_intent) - return false; - - if (!lock->readers) { - EBUG_ON(!(new & SIX_LOCK_HELD_read)); - new -= l[SIX_LOCK_read].lock_val; - } - - new |= SIX_LOCK_HELD_intent; - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); - - if (lock->readers) - this_cpu_dec(*lock->readers); - - six_set_owner(lock, SIX_LOCK_intent, old, current); - - return true; -} -EXPORT_SYMBOL_GPL(six_lock_tryupgrade); - -/** - * six_trylock_convert - attempt to convert a held lock from one type to another - * @lock: lock to upgrade - * @from: SIX_LOCK_read or SIX_LOCK_intent - * @to: SIX_LOCK_read or SIX_LOCK_intent - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_trylock_convert(struct six_lock *lock, - enum six_lock_type from, - enum six_lock_type to) -{ - EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); - - if (to == from) - return true; - - if (to == SIX_LOCK_read) { - six_lock_downgrade(lock); - return true; - } else { - return six_lock_tryupgrade(lock); - } -} -EXPORT_SYMBOL_GPL(six_trylock_convert); - -/** - * six_lock_increment - increase held lock count on a lock that is already held - * @lock: lock to increment - * @type: SIX_LOCK_read or SIX_LOCK_intent - * - * @lock must already be held, with a lock type that is greater than or equal to - * @type - * - * A corresponding six_unlock_type() call will be required for @lock to be fully - * unlocked. 
- */ -void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -{ - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); - - /* XXX: assert already locked, and that we don't overflow: */ - - switch (type) { - case SIX_LOCK_read: - if (lock->readers) { - this_cpu_inc(*lock->readers); - } else { - EBUG_ON(!(atomic_read(&lock->state) & - (SIX_LOCK_HELD_read| - SIX_LOCK_HELD_intent))); - atomic_add(l[type].lock_val, &lock->state); - } - break; - case SIX_LOCK_intent: - EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - lock->intent_lock_recurse++; - break; - case SIX_LOCK_write: - BUG(); - break; - } -} -EXPORT_SYMBOL_GPL(six_lock_increment); - -/** - * six_lock_wakeup_all - wake up all waiters on @lock - * @lock: lock to wake up waiters for - * - * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then - * abort the lock operation. - * - * This function is never needed in a bug-free program; it's only useful in - * debug code, e.g. to determine if a cycle detector is at fault. - */ -void six_lock_wakeup_all(struct six_lock *lock) -{ - u32 state = atomic_read(&lock->state); - struct six_lock_waiter *w; - - six_lock_wakeup(lock, state, SIX_LOCK_read); - six_lock_wakeup(lock, state, SIX_LOCK_intent); - six_lock_wakeup(lock, state, SIX_LOCK_write); - - raw_spin_lock(&lock->wait_lock); - list_for_each_entry(w, &lock->wait_list, list) - wake_up_process(w->task); - raw_spin_unlock(&lock->wait_lock); -} -EXPORT_SYMBOL_GPL(six_lock_wakeup_all); - -/** - * six_lock_counts - return held lock counts, for each lock type - * @lock: lock to return counters for - * - * Return: the number of times a lock is held for read, intent and write. - */ -struct six_lock_count six_lock_counts(struct six_lock *lock) -{ - struct six_lock_count ret; - - ret.n[SIX_LOCK_read] = !lock->readers - ? atomic_read(&lock->state) & SIX_LOCK_HELD_read - : pcpu_read_count(lock); - ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + - lock->intent_lock_recurse; - ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); - - return ret; -} -EXPORT_SYMBOL_GPL(six_lock_counts); - -/** - * six_lock_readers_add - directly manipulate reader count of a lock - * @lock: lock to add/subtract readers for - * @nr: reader count to add/subtract - * - * When an upper layer is implementing lock reentrency, we may have both read - * and intent locks on the same lock. - * - * When we need to take a write lock, the read locks will cause self-deadlock, - * because six locks themselves do not track which read locks are held by the - * current thread and which are held by a different thread - it does no - * per-thread tracking of held locks. - * - * The upper layer that is tracking held locks may however, if trylock() has - * failed, count up its own read locks, subtract them, take the write lock, and - * then re-add them. - * - * As in any other situation when taking a write lock, @lock must be held for - * intent one (or more) times, so @lock will never be left unlocked. 
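A sketch of the read-count dance described above, under the assumption that the caller (e.g. the btree locking layer) knows it holds nr read locks plus an intent lock on the six lock; six_lock_write()/six_unlock_write() are taken to be the usual six.h wrappers, and the function name is hypothetical:

static void update_under_write_lock(struct six_lock *lock, unsigned nr)
{
	six_lock_readers_add(lock, -(int) nr);	/* hide our own read locks */
	six_lock_write(lock, NULL, NULL);	/* safe: no conflicting readers remain */

	/* ... modify the structure protected by @lock ... */

	six_unlock_write(lock);
	six_lock_readers_add(lock, (int) nr);	/* restore our read counts */
}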
- */ -void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (lock->readers) { - this_cpu_add(*lock->readers, nr); - } else { - EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); - /* reader count starts at bit 0 */ - atomic_add(nr, &lock->state); - } -} -EXPORT_SYMBOL_GPL(six_lock_readers_add); - -/** - * six_lock_exit - release resources held by a lock prior to freeing - * @lock: lock to exit - * - * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is - * required to free the percpu read counts. - */ -void six_lock_exit(struct six_lock *lock) -{ - WARN_ON(lock->readers && pcpu_read_count(lock)); - WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); - - free_percpu(lock->readers); - lock->readers = NULL; -} -EXPORT_SYMBOL_GPL(six_lock_exit); - -void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags) -{ - atomic_set(&lock->state, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - debug_check_no_locks_freed((void *) lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - - /* - * Don't assume that we have real percpu variables available in - * userspace: - */ -#ifdef __KERNEL__ - if (flags & SIX_LOCK_INIT_PCPU) { - /* - * We don't return an error here on memory allocation failure - * since percpu is an optimization, and locks will work with the - * same semantics in non-percpu mode: callers can check for - * failure if they wish by checking lock->readers, but generally - * will not want to treat it as an error. - */ - lock->readers = alloc_percpu(unsigned); - } -#endif -} -EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/mkfs.bcachefs b/mkfs.bcachefs deleted file mode 100755 index b3631ba..0000000 --- a/mkfs.bcachefs +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -SDIR="$(readlink -f "$0")" -exec "${SDIR%/*}/bcachefs" format "$@" diff --git a/mount.bcachefs b/mount.bcachefs deleted file mode 100755 index 5900232..0000000 --- a/mount.bcachefs +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -SDIR="$(readlink -f "$0")" -exec "${SDIR%/*}/bcachefs" mount "$@" diff --git a/packaging/bcachefs-tools.spec b/packaging/bcachefs-tools.spec index a8f023a..24d93de 100644 --- a/packaging/bcachefs-tools.spec +++ b/packaging/bcachefs-tools.spec @@ -45,10 +45,13 @@ rm -f %{buildroot}/%{_datadir}/initramfs-tools/scripts/local-premount/bcachefs rm -f %{buildroot}/usr/lib/libbcachefs.so %files -%{_sbindir}/mount.bcachefs %{_sbindir}/bcachefs +%{_sbindir}/mount.bcachefs %{_sbindir}/fsck.bcachefs %{_sbindir}/mkfs.bcachefs +%{_sbindir}/mount.fuse.bcachefs +%{_sbindir}/fsck.fuse.bcachefs +%{_sbindir}/mkfs.fuse.bcachefs %{_mandir}/man8/bcachefs.8.gz %changelog diff --git a/qcow2.c b/qcow2.c index d01fa94..3b75fc1 100644 --- a/qcow2.c +++ b/qcow2.c @@ -72,7 +72,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset) void qcow2_write_image(int infd, int outfd, ranges *data, unsigned block_size) { - u64 image_size = get_size(NULL, infd); + u64 image_size = get_size(infd); unsigned l2_size = block_size / sizeof(u64); unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size); struct qcow2_hdr hdr = { 0 }; diff --git a/rust-src/Cargo.lock b/rust-src/Cargo.lock index c4dd7f5..d270cb4 100644 --- a/rust-src/Cargo.lock +++ b/rust-src/Cargo.lock @@ -4,27 +4,67 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.1.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] [[package]] -name = "android_system_properties" -version = "0.1.5" +name = "anstream" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" dependencies = [ - "libc", + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" + +[[package]] +name = "anstyle-parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c" +dependencies = [ + "anstyle", + "windows-sys", ] [[package]] name = "anyhow" -version = "1.0.68" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "atty" @@ -53,9 +93,10 @@ dependencies = [ "byteorder", "chrono", "clap", + "clap_complete", "colored", "either", - "errno", + "errno 0.2.8", "gag", "getset", "itertools", @@ -74,7 +115,7 @@ dependencies = [ "anyhow", "bindgen", "bitfield", - "bitflags", + "bitflags 1.3.2", "byteorder", "chrono", "colored", @@ -92,7 +133,7 @@ name = "bindgen" version = "0.64.0" source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cexpr", "clang-sys", "lazy_static", @@ -103,7 +144,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn", + "syn 1.0.109", ] [[package]] @@ -119,22 +160,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "bumpalo" -version = "3.12.0" +name = "bitflags" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] [[package]] name = "cexpr" @@ -153,24 +197,18 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" dependencies = [ - "iana-time-zone", - "js-sys", - "num-integer", "num-traits", - "time", - "wasm-bindgen", - "winapi", ] [[package]] name = "clang-sys" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" dependencies = [ "glob", "libc", @@ -178,118 +216,77 @@ dependencies = [ [[package]] name = "clap" -version = "4.1.4" +version = "4.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" +checksum = "fb690e81c7840c0d7aade59f242ea3b41b9bc27bcd5997890e7702ae4b32e487" dependencies = [ - "bitflags", + "clap_builder", "clap_derive", - "clap_lex", - "is-terminal", "once_cell", - "strsim", - "termcolor", - "terminal_size", ] [[package]] -name = "clap_derive" -version = "4.1.0" +name = "clap_builder" +version = "4.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" +checksum = "5ed2e96bc16d8d740f6f48d663eddf4b8a0983e79210fd55479b7bcd0a69860e" dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "clap_lex" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" -dependencies = [ - "os_str_bytes", -] - -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", + "anstream", + "anstyle", + "clap_lex", + "strsim", + "terminal_size", ] [[package]] -name = "colored" -version = "2.0.0" +name = "clap_complete" +version = "4.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" +checksum = "bffe91f06a11b4b9420f62103854e90867812cd5d01557f853c5ee8e791b12ae" dependencies = [ - "atty", - "lazy_static", - "winapi", + "clap", ] [[package]] -name = "core-foundation-sys" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" - -[[package]] -name = "cxx" -version = "1.0.89" +name = "clap_derive" +version = "4.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" +checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050" dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", + "heck", + "proc-macro2", + "quote", + "syn 2.0.38", ] 
[[package]] -name = "cxx-build" -version = "1.0.89" +name = "clap_lex" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn", -] +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" [[package]] -name = "cxxbridge-flags" -version = "1.0.89" +name = "colorchoice" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] -name = "cxxbridge-macro" -version = "1.0.89" +name = "colored" +version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" +checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" dependencies = [ - "proc-macro2", - "quote", - "syn", + "is-terminal", + "lazy_static", + "windows-sys", ] [[package]] name = "either" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "errno" @@ -302,6 +299,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "errno" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "errno-dragonfly" version = "0.1.2" @@ -314,12 +321,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "1.8.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" -dependencies = [ - "instant", -] +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "filedescriptor" @@ -351,7 +355,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -377,65 +381,29 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] - -[[package]] -name = "iana-time-zone" -version = "0.1.53" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "wasm-bindgen", - "winapi", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" -dependencies = [ - "cxx", - "cxx-build", -] - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] +checksum = 
"d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] name = "io-lifetimes" -version = "1.0.4" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ + "hermit-abi 0.3.3", "libc", "windows-sys", ] [[package]] name = "is-terminal" -version = "0.4.2" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ - "hermit-abi 0.2.6", - "io-lifetimes", - "rustix", + "hermit-abi 0.3.3", + "rustix 0.38.21", "windows-sys", ] @@ -448,15 +416,6 @@ dependencies = [ "either", ] -[[package]] -name = "js-sys" -version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" -dependencies = [ - "wasm-bindgen", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -471,9 +430,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libudev-sys" @@ -486,34 +445,28 @@ dependencies = [ ] [[package]] -name = "link-cplusplus" -version = "1.0.8" +name = "linux-raw-sys" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" -dependencies = [ - "cc", -] +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "log" -version = "0.4.17" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "memoffset" @@ -540,36 +493,20 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "num-integer" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" -dependencies = [ - "autocfg", - "num-traits", -] - [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" 
dependencies = [ "autocfg", ] [[package]] name = "once_cell" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" - -[[package]] -name = "os_str_bytes" -version = "6.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "parse-display" @@ -592,15 +529,15 @@ dependencies = [ "proc-macro2", "quote", "regex", - "regex-syntax", - "syn", + "regex-syntax 0.6.29", + "syn 1.0.109", ] [[package]] name = "paste" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "peeking_take_while" @@ -610,9 +547,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "proc-macro-error" @@ -623,7 +560,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "version_check", ] @@ -640,56 +577,65 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.50" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "redox_syscall" -version = "0.2.16" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.7.1" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.2", ] [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = 
"f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] -name = "remove_dir_all" -version = "0.5.3" +name = "regex-syntax" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi", -] +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rpassword" @@ -709,29 +655,36 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.36.7" +version = "0.37.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" +checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2" dependencies = [ - "bitflags", - "errno", + "bitflags 1.3.2", + "errno 0.3.5", "io-lifetimes", "libc", - "linux-raw-sys", + "linux-raw-sys 0.3.8", "windows-sys", ] [[package]] -name = "scratch" -version = "1.0.3" +name = "rustix" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" +dependencies = [ + "bitflags 2.4.1", + "errno 0.3.5", + "libc", + "linux-raw-sys 0.4.10", + "windows-sys", +] [[package]] name = "shlex" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" [[package]] name = "strsim" @@ -741,9 +694,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.107" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", @@ -751,67 +704,57 @@ dependencies = [ ] [[package]] -name = "tempfile" -version = "3.3.0" +name = "syn" +version = "2.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ - "cfg-if", - "fastrand", - "libc", - "redox_syscall", - "remove_dir_all", - "winapi", + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] -name = "termcolor" -version = "1.2.0" +name = "tempfile" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" dependencies = [ - "winapi-util", + "cfg-if", + "fastrand", + "redox_syscall", + "rustix 0.38.21", + "windows-sys", ] [[package]] name = "terminal_size" -version = "0.2.3" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb20089a8ba2b69debd491f8d2d023761cbf196e999218c591fa1e7e15a21907" +checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" dependencies = [ - "rustix", + "rustix 0.37.27", "windows-sys", ] [[package]] name = "thiserror" -version = "1.0.38" +version = "1.0.50" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" dependencies = [ "proc-macro2", "quote", - "syn", -] - -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi", - "winapi", + "syn 2.0.38", ] [[package]] @@ -827,21 +770,21 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] -name = "unicode-width" -version = "0.1.10" +name = "utf8parse" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.3.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" +checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc" [[package]] name = "version_check" @@ -849,66 +792,6 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - -[[package]] -name = "wasm-bindgen" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.84" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" - [[package]] name = "winapi" version = "0.3.9" @@ -925,15 +808,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -942,9 +816,18 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.42.0" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -957,42 +840,42 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff 
--git a/rust-src/Cargo.toml b/rust-src/Cargo.toml index 92a3853..e88c05f 100644 --- a/rust-src/Cargo.toml +++ b/rust-src/Cargo.toml @@ -3,6 +3,7 @@ name = "bcachefs-rust" version = "0.3.1" authors = ["Yuxuan Shui ", "Kayla Firestack "] edition = "2018" +rust-version = "1.65" [lib] crate-type = ["staticlib"] @@ -10,9 +11,10 @@ crate-type = ["staticlib"] [dependencies] atty = "0.2.14" log = { version = "0.4", features = ["std"] } -chrono = "0.4" +chrono = { version = "0.4", default-features = false } colored = "2" clap = { version = "4.0.32", features = ["derive", "wrap_help"] } +clap_complete = "4.4.4" anyhow = "1.0" libc = "0.2.69" udev = "0.7.0" diff --git a/rust-src/bch_bindgen/Cargo.lock b/rust-src/bch_bindgen/Cargo.lock index b274197..a89ef60 100644 --- a/rust-src/bch_bindgen/Cargo.lock +++ b/rust-src/bch_bindgen/Cargo.lock @@ -3,30 +3,19 @@ version = 3 [[package]] -name = "android_system_properties" -version = "0.1.5" +name = "aho-corasick" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ - "libc", + "memchr", ] [[package]] name = "anyhow" -version = "1.0.69" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "autocfg" @@ -41,7 +30,7 @@ dependencies = [ "anyhow", "bindgen", "bitfield", - "bitflags", + "bitflags 1.3.2", "byteorder", "chrono", "colored", @@ -59,7 +48,7 @@ name = "bindgen" version = "0.64.0" source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cexpr", "clang-sys", "lazy_static", @@ -70,7 +59,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn", + "syn 1.0.109", ] [[package]] @@ -86,22 +75,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "bumpalo" -version = "3.12.0" +name = "bitflags" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "cc" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cexpr" @@ -120,129 +103,49 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" dependencies = [ - "iana-time-zone", - "js-sys", - "num-integer", "num-traits", - "time", - "wasm-bindgen", - "winapi", ] [[package]] name = "clang-sys" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" dependencies = [ "glob", "libc", ] -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "colored" -version = "2.0.0" +version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" +checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" dependencies = [ - "atty", + "is-terminal", "lazy_static", - "winapi", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" - -[[package]] -name = "cxx" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "windows-sys", ] [[package]] name = "errno" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" -dependencies = [ - "errno-dragonfly", - "libc", - "winapi", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" dependencies = [ - "cc", "libc", + "windows-sys", ] [[package]] name = "fastrand" -version = "1.9.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "filedescriptor" @@ -273,63 +176,19 @@ checksum = 
"d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "iana-time-zone" -version = "0.1.53" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "wasm-bindgen", - "winapi", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" -dependencies = [ - "cxx", - "cxx-build", -] - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.5" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" -dependencies = [ - "libc", - "windows-sys 0.45.0", -] +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] -name = "js-sys" -version = "0.3.61" +name = "is-terminal" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ - "wasm-bindgen", + "hermit-abi", + "rustix", + "windows-sys", ] [[package]] @@ -346,9 +205,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libudev-sys" @@ -360,35 +219,17 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "link-cplusplus" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" -dependencies = [ - "cc", -] - [[package]] name = "linux-raw-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" - -[[package]] -name = "log" -version = "0.4.17" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "memoffset" @@ -415,36 +256,20 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "num-integer" -version = 
"0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" -dependencies = [ - "autocfg", - "num-traits", -] - [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] -[[package]] -name = "once_cell" -version = "1.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" - [[package]] name = "paste" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "peeking_take_while" @@ -454,51 +279,65 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "proc-macro2" -version = "1.0.51" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "redox_syscall" -version = "0.2.16" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.7.1" +version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rustc-hash" @@ -508,29 +347,22 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name 
= "rustix" -version = "0.36.8" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ - "bitflags", + "bitflags 2.4.1", "errno", - "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys", ] -[[package]] -name = "scratch" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" - [[package]] name = "shlex" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" [[package]] name = "syn" @@ -544,56 +376,47 @@ dependencies = [ ] [[package]] -name = "tempfile" -version = "3.4.0" +name = "syn" +version = "2.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ - "cfg-if", - "fastrand", - "redox_syscall", - "rustix", - "windows-sys 0.42.0", + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] -name = "termcolor" -version = "1.2.0" +name = "tempfile" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" dependencies = [ - "winapi-util", + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", ] [[package]] name = "thiserror" -version = "1.0.38" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" dependencies = [ "proc-macro2", "quote", - "syn", -] - -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi", - "winapi", + "syn 2.0.38", ] [[package]] @@ -609,81 +432,15 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" - -[[package]] -name = "unicode-width" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "uuid" -version = "1.3.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" 
- -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - -[[package]] -name = "wasm-bindgen" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc" [[package]] name = "winapi" @@ -701,15 +458,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -718,33 +466,18 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -757,42 +490,42 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.48.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/rust-src/bch_bindgen/Cargo.toml b/rust-src/bch_bindgen/Cargo.toml index cb341ee..da6654c 100644 --- a/rust-src/bch_bindgen/Cargo.toml +++ b/rust-src/bch_bindgen/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["lib"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -chrono = "0.4" +chrono = { version = "0.4", default-features = false } colored = "2" anyhow = "1.0" udev = "0.7.0" @@ -18,7 +18,7 @@ bitfield = "0.14.0" memoffset = "0.8.0" byteorder = "1.3" libc = "0.2.69" -gag = "1.0.0" +gag = { version = "1.0.0", default-features = false } bitflags = "1.3.2" paste = "1.0.11" diff --git a/rust-src/bch_bindgen/build.rs b/rust-src/bch_bindgen/build.rs index 92ec3ce..819b337 100644 --- a/rust-src/bch_bindgen/build.rs +++ b/rust-src/bch_bindgen/build.rs @@ -60,6 +60,7 @@ fn main() { .allowlist_var("KEY_SPEC_.*") .allowlist_var("Fix753_FMODE_.*") .allowlist_var("bch.*") + .allowlist_var("__bch2.*") .allowlist_var("__BTREE_ITER.*") .allowlist_var("BTREE_ITER.*") .blocklist_item("bch2_bkey_ops") diff --git a/rust-src/bch_bindgen/src/bcachefs.rs b/rust-src/bch_bindgen/src/bcachefs.rs index fa8dbde..8e897c0 100644 --- a/rust-src/bch_bindgen/src/bcachefs.rs +++ b/rust-src/bch_bindgen/src/bcachefs.rs @@ -43,7 +43,7 @@ impl PartialEq for bch_sb { impl bch_sb { pub fn crypt(&self) -> Option<&bch_sb_field_crypt> { unsafe { - let ptr = bch2_sb_field_get( + let ptr = bch2_sb_field_get_id( self as *const _ as *mut _, 
bch_sb_field_type::BCH_SB_FIELD_crypt, ) as *const u8; diff --git a/rust-src/bch_bindgen/src/bkey.rs b/rust-src/bch_bindgen/src/bkey.rs index 64697ea..d483083 100644 --- a/rust-src/bch_bindgen/src/bkey.rs +++ b/rust-src/bch_bindgen/src/bkey.rs @@ -47,6 +47,8 @@ pub enum BkeyValC<'a> { inode_v3(&'a c::bch_inode_v3), bucket_gens(&'a c::bch_bucket_gens), snapshot_tree(&'a c::bch_snapshot_tree), + logged_op_truncate(&'a c::bch_logged_op_truncate), + logged_op_finsert(&'a c::bch_logged_op_finsert), } impl<'a, 'b> BkeySC<'a> { @@ -96,6 +98,8 @@ impl<'a, 'b> BkeySC<'a> { KEY_TYPE_inode_v3 => inode_v3(unsafe { transmute(self.v) }), KEY_TYPE_bucket_gens => bucket_gens(unsafe { transmute(self.v) }), KEY_TYPE_snapshot_tree => snapshot_tree(unsafe { transmute(self.v) }), + KEY_TYPE_logged_op_truncate => logged_op_truncate(unsafe { transmute(self.v) }), + KEY_TYPE_logged_op_finsert => logged_op_finsert(unsafe { transmute(self.v) }), KEY_TYPE_MAX => unreachable!(), } } diff --git a/rust-src/bch_bindgen/src/btree.rs b/rust-src/bch_bindgen/src/btree.rs index 32b4e74..f738a46 100644 --- a/rust-src/bch_bindgen/src/btree.rs +++ b/rust-src/bch_bindgen/src/btree.rs @@ -11,24 +11,21 @@ use std::ptr; use bitflags::bitflags; pub struct BtreeTrans<'f> { - raw: c::btree_trans, + raw: *mut c::btree_trans, fs: PhantomData<&'f Fs> } impl<'f> BtreeTrans<'f> { pub fn new(fs: &'f Fs) -> BtreeTrans { unsafe { - let mut trans: MaybeUninit<c::btree_trans> = MaybeUninit::uninit(); - - c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0); - BtreeTrans { raw: trans.assume_init(), fs: PhantomData } + BtreeTrans { raw: &mut *c::__bch2_trans_get(fs.raw, 0), fs: PhantomData } } } } impl<'f> Drop for BtreeTrans<'f> { fn drop(&mut self) { - unsafe { c::bch2_trans_exit(&mut self.raw) } + unsafe { c::bch2_trans_put(&mut *self.raw) } } } @@ -64,9 +61,9 @@ impl<'t> BtreeIter<'t> { let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit(); c::bch2_trans_iter_init_outlined( - ptr::addr_of!(trans.raw).cast_mut(), + trans.raw, iter.as_mut_ptr(), - btree as u32, + btree, pos, flags.bits as u32); @@ -123,7 +120,7 @@ impl<'t> BtreeNodeIter<'t> { unsafe { let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit(); c::bch2_trans_node_iter_init( - ptr::addr_of!(trans.raw).cast_mut(), + trans.raw, iter.as_mut_ptr(), btree, pos, diff --git a/rust-src/bch_bindgen/src/lib.rs b/rust-src/bch_bindgen/src/lib.rs index 73aeef6..4c54944 100644 --- a/rust-src/bch_bindgen/src/lib.rs +++ b/rust-src/bch_bindgen/src/lib.rs @@ -62,7 +62,7 @@ use std::fmt; impl fmt::Display for c::btree_id { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let s = unsafe { CStr::from_ptr(*c::bch2_btree_ids.get_unchecked(*self as usize)) }; + let s = unsafe { CStr::from_ptr(c::bch2_btree_id_str(*self)) }; let s = s.to_str().unwrap(); write!(f, "{}", s) } @@ -92,7 +92,7 @@ impl FromStr for c::btree_id { let s = CString::new(s).unwrap(); let p = s.as_ptr(); - let v = unsafe {c::match_string(c::bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)}; + let v = unsafe {c::match_string(c::__bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)}; if v >= 0 { Ok(unsafe { std::mem::transmute(v) }) } else {
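The BtreeTrans change above is the core of this hunk: the wrapper no longer owns a c::btree_trans by value but holds the *mut c::btree_trans handed out by __bch2_trans_get(), and Drop releases it through bch2_trans_put(). A minimal sketch of how a caller drives the updated bindings, assuming the iterator helpers cmd_list.rs already uses (peek_and_restart, advance, to_text) keep their current shapes; the walk_extents helper and its zeroed start position are illustrative, not code from this patch:

    use anyhow::Result;
    use bch_bindgen::bcachefs;
    use bch_bindgen::btree::{BtreeIter, BtreeIterFlags, BtreeTrans};
    use bch_bindgen::fs::Fs;

    // Hypothetical helper: walk the extents btree and print each key.
    fn walk_extents(fs: &Fs) -> Result<()> {
        // __bch2_trans_get() runs here; bch2_trans_put() runs on drop.
        let trans = BtreeTrans::new(fs);
        let mut iter = BtreeIter::new(
            &trans,
            bcachefs::btree_id::BTREE_ID_extents,
            bcachefs::bpos { inode: 0, offset: 0, snapshot: 0 },
            BtreeIterFlags::PREFETCH,
        );
        while let Some(k) = iter.peek_and_restart()? {
            println!("{}", k.to_text(fs));
            iter.advance();
        }
        Ok(())
    }

Because the transaction now sits behind a raw pointer, BtreeIter::new and BtreeNodeIter::new can pass trans.raw straight through instead of the old ptr::addr_of!(trans.raw).cast_mut() dance.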
diff --git a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h index e7bcfcf..e68de66 100644 --- a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h +++ b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h @@ -13,8 +13,8 @@ #include "../include/linux/blkdev.h" -#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name; +#define MARK_FIX_753(req_name) const blk_mode_t Fix753_##req_name = req_name; -MARK_FIX_753(FMODE_READ); -MARK_FIX_753(FMODE_WRITE); -MARK_FIX_753(FMODE_EXCL); \ No newline at end of file +MARK_FIX_753(BLK_OPEN_READ); +MARK_FIX_753(BLK_OPEN_WRITE); +MARK_FIX_753(BLK_OPEN_EXCL); diff --git a/rust-src/src/cmd_list.rs b/rust-src/src/cmd_list.rs index 3f86b8c..a0d0591 100644 --- a/rust-src/src/cmd_list.rs +++ b/rust-src/src/cmd_list.rs @@ -8,9 +8,9 @@ use bch_bindgen::btree::BtreeTrans; use bch_bindgen::btree::BtreeIter; use bch_bindgen::btree::BtreeNodeIter; use bch_bindgen::btree::BtreeIterFlags; -use clap::Parser; -use std::ffi::{CStr, OsStr, c_int, c_char}; -use std::os::unix::ffi::OsStrExt; +use clap::{Args, Parser}; +use std::ffi::{c_int, c_char}; +use crate::transform_c_args; fn list_keys(fs: &Fs, opt: Cli) -> anyhow::Result<()> { let trans = BtreeTrans::new(fs); @@ -84,7 +84,7 @@ fn list_nodes_ondisk(fs: &Fs, opt: Cli) -> anyhow::Result<()> { Ok(()) } -#[derive(Clone, clap::ValueEnum)] +#[derive(Clone, clap::ValueEnum, Debug)] enum Mode { Keys, Formats, @@ -92,8 +92,9 @@ enum Mode { NodesOndisk, } -#[derive(Parser)] -struct Cli { +/// List filesystem metadata in textual form +#[derive(Parser, Debug)] +pub struct Cli { /// Btree to list from #[arg(short, long, default_value_t=bcachefs::btree_id::BTREE_ID_extents)] btree: bcachefs::btree_id, @@ -120,7 +121,7 @@ struct Cli { /// Force color on/off. Default: autodetect tty #[arg(short, long, action = clap::ArgAction::Set, default_value_t=atty::is(Stream::Stdout))] colorize: bool, - + /// Verbose mode #[arg(short, long)] verbose: bool, @@ -157,12 +158,9 @@ fn cmd_list_inner(opt: Cli) -> anyhow::Result<()> { } #[no_mangle] +#[allow(clippy::not_unsafe_ptr_arg_deref)] pub extern "C" fn cmd_list(argc: c_int, argv: *const *const c_char) { - let argv: Vec<_> = (0..argc) - .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) }) - .map(|i| OsStr::from_bytes(i.to_bytes())) - .collect(); - + transform_c_args!(argv, argc, argv); let opt = Cli::parse_from(argv); colored::control::set_override(opt.colorize); if let Err(e) = cmd_list_inner(opt) {
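Both entry points now funnel argc/argv through a shared transform_c_args! macro exported from rust-src/src/lib.rs. Judging from the inline code it replaces in cmd_list() and cmd_mount(), an equivalent expansion would look roughly like the following sketch; the helper name is an assumption, not taken from the patch:

    use std::ffi::{c_char, c_int, CStr, OsStr};
    use std::os::unix::ffi::OsStrExt;

    // Roughly what transform_c_args!(argv, argc, argv) has to produce:
    // rebuild the C argv array as a Vec<&OsStr> for Cli::parse_from().
    fn c_args_to_os_strs<'a>(argc: c_int, argv: *const *const c_char) -> Vec<&'a OsStr> {
        (0..argc)
            .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
            .map(|cs| OsStr::from_bytes(cs.to_bytes()))
            .collect()
    }

The #[allow(clippy::not_unsafe_ptr_arg_deref)] attribute added alongside it acknowledges that the raw argv pointer is dereferenced inside a safe extern "C" fn.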
diff --git a/rust-src/src/cmd_mount.rs b/rust-src/src/cmd_mount.rs
index 0150ffd..17a289c 100644
--- a/rust-src/src/cmd_mount.rs
+++ b/rust-src/src/cmd_mount.rs
@@ -1,11 +1,11 @@
 use atty::Stream;
 use bch_bindgen::{bcachefs, bcachefs::bch_sb_handle};
 use log::{info, debug, error, LevelFilter};
-use clap::Parser;
+use clap::{Parser, Subcommand};
 use uuid::Uuid;
 use std::path::PathBuf;
-use crate::key;
-use crate::key::KeyLoc;
+use crate::{key, transform_c_args};
+use crate::key::KeyLocation;
 use crate::logger::SimpleLogger;
 use std::ffi::{CStr, CString, OsStr, c_int, c_char, c_void};
 use std::os::unix::ffi::OsStrExt;
@@ -14,7 +14,7 @@ fn mount_inner(
     src: String,
     target: impl AsRef<std::path::Path>,
     fstype: &str,
-    mountflags: u64,
+    mountflags: libc::c_ulong,
     data: Option<String>,
 ) -> anyhow::Result<()> {
@@ -45,7 +45,7 @@ fn mount_inner(
 
 /// Parse comma-separated mount options and split out mountflags and filesystem
 /// specific options.
-fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
+fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, libc::c_ulong) {
     use either::Either::*;
     debug!("parsing mount options: {}", options.as_ref());
     let (opts, flags) = options
@@ -129,7 +129,7 @@ fn get_devices_by_uuid(uuid: Uuid) -> anyhow::Result<Vec<(PathBuf, bch_sb_handle)>> {
-    key_location: KeyLoc,
+    key_location: KeyLocation,
 
     dev: String,
@@ -145,7 +145,7 @@ struct Cli {
     /// Where the filesystem should be mounted. If not set, then the filesystem
     /// won't actually be mounted. But all steps preceding mounting the
     /// filesystem (e.g. asking for passphrase) will still be performed.
-    mountpoint: std::path::PathBuf,
+    mountpoint: Option<std::path::PathBuf>,
 
     /// Mount options
     #[arg(short, default_value = "")]
@@ -199,34 +199,35 @@ fn cmd_mount_inner(opt: Cli) -> anyhow::Result<()> {
     if sbs.len() == 0 {
         Err(anyhow::anyhow!("No device found from specified parameters"))?;
     } else if unsafe { bcachefs::bch2_sb_is_encrypted(sbs[0].sb) } {
-        let key = opt
-            .key_location
-            .0
-            .ok_or_else(|| anyhow::anyhow!("no keyoption specified for locked filesystem"))?;
-
-        key::prepare_key(&sbs[0], key)?;
+        key::prepare_key(&sbs[0], opt.key_location)?;
     }
 
-    info!(
-        "mounting with params: device: {}, target: {}, options: {}",
-        devs,
-        &opt.mountpoint.to_string_lossy(),
-        &opt.options
-    );
+    if let Some(mountpoint) = opt.mountpoint {
+        info!(
+            "mounting with params: device: {}, target: {}, options: {}",
+            devs,
+            mountpoint.to_string_lossy(),
+            &opt.options
+        );
+
+        mount(devs, mountpoint, &opt.options)?;
+    } else {
+        info!(
+            "would mount with params: device: {}, options: {}",
+            devs,
+            &opt.options
+        );
+    }
 
-    mount(devs, &opt.mountpoint, &opt.options)?;
     Ok(())
 }
 
 #[no_mangle]
-pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) {
-    let argv: Vec<_> = (0..argc)
-        .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
-        .map(|i| OsStr::from_bytes(i.to_bytes()))
-        .collect();
-
+#[allow(clippy::not_unsafe_ptr_arg_deref)]
+pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) -> c_int {
+    transform_c_args!(argv, argc, argv);
     let opt = Cli::parse_from(argv);
-    
+
     log::set_boxed_logger(Box::new(SimpleLogger)).unwrap();
 
     // @TODO : more granular log levels via mount option
@@ -239,7 +240,9 @@ pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) {
     colored::control::set_override(opt.colorize);
     if let Err(e) = cmd_mount_inner(opt) {
         error!("Fatal error: {}", e);
+        1
     } else {
         info!("Successfully mounted");
+        0
     }
 }
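
[Reviewer note, not part of the patch: the mountflags type change from u64 to libc::c_ulong matters because mount(2) takes an unsigned long, which is 32-bit on some targets. A hypothetical check of parse_mount_options' contract; the MS_* mapping and the pass-through of unknown options as filesystem data are assumptions about the existing option table, which this diff does not show.]

    #[test]
    fn mount_option_split() {
        // Assumption: "ro" -> MS_RDONLY, "nodev" -> MS_NODEV, and options
        // without a flag mapping are returned as filesystem-specific data.
        let (data, flags) = parse_mount_options("ro,nodev,verbose");
        assert_eq!(flags & libc::MS_RDONLY, libc::MS_RDONLY);
        assert_eq!(flags & libc::MS_NODEV, libc::MS_NODEV);
        assert_eq!(data.as_deref(), Some("verbose"));
    }
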
diff --git a/rust-src/src/key.rs b/rust-src/src/key.rs
index 2b4fc45..93daa26 100644
--- a/rust-src/src/key.rs
+++ b/rust-src/src/key.rs
@@ -1,50 +1,64 @@
 use log::{info};
 use bch_bindgen::bcachefs::bch_sb_handle;
+use clap::builder::PossibleValue;
 use crate::c_str;
 use anyhow::anyhow;
 
 #[derive(Clone, Debug)]
 pub enum KeyLocation {
+    None,
     Fail,
     Wait,
     Ask,
 }
 
-#[derive(Clone, Debug)]
-pub struct KeyLoc(pub Option<KeyLocation>);
-impl std::ops::Deref for KeyLoc {
-    type Target = Option<KeyLocation>;
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl std::str::FromStr for KeyLoc {
+impl std::str::FromStr for KeyLocation {
     type Err = anyhow::Error;
     fn from_str(s: &str) -> anyhow::Result<Self> {
         match s {
-            ""     => Ok(KeyLoc(None)),
-            "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))),
-            "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))),
-            "ask"  => Ok(KeyLoc(Some(KeyLocation::Ask))),
-            _      => Err(anyhow!("invalid password option")),
+            ""|"none" => Ok(KeyLocation::None),
+            "fail"    => Ok(KeyLocation::Fail),
+            "wait"    => Ok(KeyLocation::Wait),
+            "ask"     => Ok(KeyLocation::Ask),
+            _         => Err(anyhow!("invalid password option")),
         }
     }
 }
 
+impl clap::ValueEnum for KeyLocation {
+    fn value_variants<'a>() -> &'a [Self] {
+        &[
+            KeyLocation::None,
+            KeyLocation::Fail,
+            KeyLocation::Wait,
+            KeyLocation::Ask,
+        ]
+    }
+
+    fn to_possible_value(&self) -> Option<PossibleValue> {
+        Some(match self {
+            Self::None => PossibleValue::new("none").alias(""),
+            Self::Fail => PossibleValue::new("fail"),
+            Self::Wait => PossibleValue::new("wait"),
+            Self::Ask => PossibleValue::new("ask"),
+        })
+    }
+}
+
 fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result<bool> {
     use bch_bindgen::keyutils::{self, keyctl_search};
     let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _;
-    let key_type = c_str!("logon");
+    let key_type = c_str!("user");
 
     let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
     if key_id > 0 {
         info!("Key has become available");
         Ok(true)
-    } else if errno::errno().0 != libc::ENOKEY {
-        Err(crate::ErrnoError(errno::errno()).into())
     } else {
-        Ok(false)
+        match errno::errno().0 {
+            libc::ENOKEY | libc::EKEYREVOKED => Ok(false),
+            _ => Err(crate::ErrnoError(errno::errno()).into()),
+        }
     }
 }
@@ -101,7 +115,7 @@ fn ask_for_key(sb: &bch_sb_handle) -> anyhow::Result<()> {
     } else if key.magic != bch_key_magic {
         Err(anyhow!("failed to verify the password"))
     } else {
-        let key_type = c_str!("logon");
+        let key_type = c_str!("user");
         let ret = unsafe {
             bch_bindgen::keyutils::add_key(
                 key_type,
@@ -125,5 +139,6 @@ pub fn prepare_key(sb: &bch_sb_handle, password: KeyLocation) -> anyhow::Result<()> {
         KeyLocation::Fail => Err(anyhow!("no key available")),
         KeyLocation::Wait => Ok(wait_for_key(&sb.sb().uuid())?),
         KeyLocation::Ask => ask_for_key(sb),
+        _ => Err(anyhow!("no keyoption specified for locked filesystem")),
     }
 }
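
[Reviewer note, not part of the patch: two things change here. KeyLoc's Option wrapper is gone, so the empty string now parses to an explicit KeyLocation::None, and the key is registered with type "user" instead of "logon", since logon keys cannot be read back from userspace. A quick sanity sketch of the new parsing, written as a hypothetical test:]

    #[test]
    fn key_location_parsing() {
        use std::str::FromStr;
        // "" and "none" both map to the new explicit None variant.
        assert!(matches!(KeyLocation::from_str(""), Ok(KeyLocation::None)));
        assert!(matches!(KeyLocation::from_str("none"), Ok(KeyLocation::None)));
        assert!(matches!(KeyLocation::from_str("wait"), Ok(KeyLocation::Wait)));
        assert!(KeyLocation::from_str("bogus").is_err());
    }
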
diff --git a/rust-src/src/lib.rs b/rust-src/src/lib.rs
index 159d049..64297b4 100644
--- a/rust-src/src/lib.rs
+++ b/rust-src/src/lib.rs
@@ -1,7 +1,24 @@
+use clap::Subcommand;
+
 pub mod key;
 pub mod logger;
 pub mod cmd_mount;
 pub mod cmd_list;
+pub mod cmd_completions;
+
+#[derive(clap::Parser, Debug)]
+#[command(name = "bcachefs")]
+pub struct Cli {
+    #[command(subcommand)]
+    subcommands: Subcommands,
+}
+
+#[derive(Subcommand, Debug)]
+enum Subcommands {
+    List(cmd_list::Cli),
+    Mount(cmd_mount::Cli),
+    Completions(cmd_completions::Cli),
+}
 
 #[macro_export]
 macro_rules! c_str {
@@ -14,6 +31,18 @@ macro_rules! c_str {
     };
 }
 
+#[macro_export]
+macro_rules! transform_c_args {
+    ($var:ident, $argc:expr, $argv:expr) => {
+        // TODO: `OsStr::from_bytes` only exists on *nix
+        use ::std::os::unix::ffi::OsStrExt;
+        let $var: Vec<_> = (0..$argc)
+            .map(|i| unsafe { ::std::ffi::CStr::from_ptr(*$argv.add(i as usize)) })
+            .map(|i| ::std::ffi::OsStr::from_bytes(i.to_bytes()))
+            .collect();
+    };
+}
+
 #[derive(Debug)]
 struct ErrnoError(errno::Errno);
 impl std::fmt::Display for ErrnoError {
diff --git a/tools-util.c b/tools-util.c
index 624656a..923a666 100644
--- a/tools-util.c
+++ b/tools-util.c
@@ -17,6 +17,7 @@
 #include <...>
 #include <...>
 
+#include "libbcachefs.h"
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "linux/sort.h"
 #include "tools-util.h"
@@ -186,7 +187,7 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[],
 }
 
 /* Returns size of file or block device: */
-u64 get_size(const char *path, int fd)
+u64 get_size(int fd)
 {
 	struct stat statbuf = xfstat(fd);
 
@@ -199,7 +200,7 @@ u64 get_size(const char *path, int fd)
 }
 
 /* Returns blocksize, in bytes: */
-unsigned get_blocksize(const char *path, int fd)
+unsigned get_blocksize(int fd)
 {
 	struct stat statbuf = xfstat(fd);
 
@@ -212,22 +213,24 @@ unsigned get_blocksize(const char *path, int fd)
 }
 
 /* Open a block device, do magic blkid stuff to probe for existing filesystems: */
-int open_for_format(const char *dev, bool force)
+int open_for_format(struct dev_opts *dev, bool force)
 {
 	blkid_probe pr;
 	const char *fs_type = NULL, *fs_label = NULL;
 	size_t fs_type_len, fs_label_len;
 
-	int fd = open(dev, O_RDWR|O_EXCL);
-	if (fd < 0)
-		die("Error opening device to format %s: %m", dev);
+	dev->bdev = blkdev_get_by_path(dev->path, BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL,
+				       dev, NULL);
+	int ret = PTR_ERR_OR_ZERO(dev->bdev);
+	if (ret < 0)
+		die("Error opening device to format %s: %s", dev->path, strerror(-ret));
 
 	if (force)
-		return fd;
+		return 0;
 
 	if (!(pr = blkid_new_probe()))
 		die("blkid error 1");
-	if (blkid_probe_set_device(pr, fd, 0, 0))
+	if (blkid_probe_set_device(pr, dev->bdev->bd_buffered_fd, 0, 0))
 		die("blkid error 2");
 	if (blkid_probe_enable_partitions(pr, true))
 		die("blkid error 3");
@@ -240,19 +243,19 @@ int open_for_format(const char *dev, bool force)
 	if (fs_type) {
 		if (fs_label)
 			printf("%s contains a %s filesystem labelled '%s'\n",
-			       dev, fs_type, fs_label);
+			       dev->path, fs_type, fs_label);
 		else
 			printf("%s contains a %s filesystem\n",
-			       dev, fs_type);
+			       dev->path, fs_type);
 		fputs("Proceed anyway?", stdout);
 		if (!ask_yn())
 			exit(EXIT_FAILURE);
 		while (blkid_do_probe(pr) == 0)
-			blkid_do_wipe(pr, 0);
+			blkid_do_wipe(pr, 0);
 	}
 
 	blkid_free_probe(pr);
-	return fd;
+	return ret;
 }
 
 bool ask_yn(void)
@@ -670,7 +673,11 @@ struct bbpos bbpos_parse(char *buf)
 	if (!(field = strsep(&s, ":")))
 		die("invalid bbpos %s", buf);
 
-	ret.btree = read_string_list_or_die(field, bch2_btree_ids, "btree id");
+	ret.btree = read_string_list_or_die(field, __bch2_btree_ids, "btree id");
+
+	if (!s)
+		die("invalid bbpos %s", buf);
+
 	ret.pos = bpos_parse(s);
 	return ret;
 }
diff --git a/tools-util.h b/tools-util.h
index e7bdd2c..7a04c10 100644
--- a/tools-util.h
+++ b/tools-util.h
@@ -62,9 +62,10 @@ u64 read_file_u64(int, const char *);
 ssize_t read_string_list_or_die(const char *, const char * const[],
 				const char *);
-u64 get_size(const char *, int);
-unsigned get_blocksize(const char *, int);
-int open_for_format(const char *, bool);
+u64 get_size(int);
+unsigned get_blocksize(int);
+struct dev_opts;
+int open_for_format(struct dev_opts *, bool);
 bool ask_yn(void);
-- 
2.39.2
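
[Reviewer note, not part of the patch: the new top-level Cli/Subcommands pair in rust-src/src/lib.rs gives the Rust commands a single clap parser. A dispatch sketch, assuming it runs inside the same crate, since `subcommands` is a private field; the shipped binary still enters through the #[no_mangle] cmd_* functions.]

    use clap::Parser;

    // Hypothetical in-crate driver; the cmd_*_inner helpers named in the
    // comments are the private functions defined earlier in this patch.
    fn run(args: Vec<std::ffi::OsString>) -> i32 {
        match Cli::parse_from(args).subcommands {
            Subcommands::List(_opt)        => { /* cmd_list_inner(_opt) */ 0 }
            Subcommands::Mount(_opt)       => { /* cmd_mount_inner(_opt) */ 0 }
            Subcommands::Completions(_opt) => { /* emit shell completions */ 0 }
        }
    }
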