From 8f5545af53ee17857ceb2e38e5e4664287a8f296 Mon Sep 17 00:00:00 2001 From: Jonathan Carter Date: Mon, 31 Oct 2022 11:53:37 +0200 Subject: [PATCH] New upstream release --- Makefile | 20 +- Makefile.compiler | 2 +- bcachefs.8 | 5 +- bcachefs.c | 11 +- ccan/darray/LICENSE | 17 - ccan/darray/_info | 57 - ccan/darray/darray.h | 355 ------ cmd_attr.c | 2 +- cmd_data.c | 6 +- cmd_device.c | 23 +- cmd_dump.c | 182 +++ cmd_format.c | 70 +- cmd_fs.c | 247 ++-- cmd_key.c | 12 +- cmd_debug.c => cmd_list.c | 316 +---- cmd_list_journal.c | 246 ++++ cmd_migrate.c | 16 +- cmd_option.c | 106 ++ cmds.h | 2 + crypto.c | 23 +- crypto.h | 2 +- debian/bcachefs-tools.postinst | 12 + debian/bcachefs-tools.postrm | 12 + debian/changelog | 7 + debian/control | 2 +- debian/files | 2 +- default.nix | 26 +- include/linux/bio.h | 54 +- include/linux/bitops.h | 5 + include/linux/blk_types.h | 3 + include/linux/blkdev.h | 5 +- include/linux/bug.h | 3 +- include/linux/errname.h | 11 + include/linux/freezer.h | 3 +- include/linux/generic-radix-tree.h | 73 +- include/linux/jiffies.h | 7 + include/linux/kernel.h | 11 + include/linux/kmemleak.h | 121 ++ include/linux/kobject.h | 8 +- include/linux/list.h | 1 + include/linux/mean_and_variance.h | 170 +++ include/linux/mm.h | 25 + include/linux/prandom.h | 27 + include/linux/prefetch.h | 3 + include/linux/pretty-printers.h | 10 + include/linux/printbuf.h | 306 +++++ include/linux/printk.h | 6 +- include/linux/random.h | 5 - include/linux/rwsem.h | 1 + include/linux/sched.h | 12 + include/linux/shrinker.h | 6 +- include/linux/six.h | 35 +- include/linux/slab.h | 67 +- include/linux/spinlock.h | 17 +- include/linux/string.h | 1 + include/linux/string_helpers.h | 20 + include/linux/sysfs.h | 4 + include/linux/types.h | 7 +- include/linux/vmalloc.h | 53 - include/linux/zstd.h | 449 ++++++- include/linux/zstd_errors.h | 77 ++ include/trace/events/bcachefs.h | 907 ++++++++------ libbcachefs.c | 511 +------- libbcachefs.h | 4 +- libbcachefs/acl.c | 6 +- 
libbcachefs/alloc_background.c | 1569 ++++++++++++++--------- libbcachefs/alloc_background.h | 190 ++- libbcachefs/alloc_foreground.c | 631 ++++++++-- libbcachefs/alloc_foreground.h | 24 + libbcachefs/alloc_types.h | 40 +- libbcachefs/backpointers.c | 1128 +++++++++++++++++ libbcachefs/backpointers.h | 38 + libbcachefs/bbpos.h | 48 + libbcachefs/bcachefs.h | 116 +- libbcachefs/bcachefs_format.h | 383 +++++- libbcachefs/bcachefs_ioctl.h | 11 +- libbcachefs/bkey.c | 196 +-- libbcachefs/bkey.h | 110 +- libbcachefs/bkey_buf.h | 1 + libbcachefs/bkey_cmp.h | 129 ++ libbcachefs/bkey_methods.c | 214 ++-- libbcachefs/bkey_methods.h | 115 +- libbcachefs/bkey_sort.c | 3 +- libbcachefs/bset.c | 64 +- libbcachefs/bset.h | 94 -- libbcachefs/btree_cache.c | 473 ++++--- libbcachefs/btree_cache.h | 11 +- libbcachefs/btree_gc.c | 964 ++++++++------- libbcachefs/btree_gc.h | 7 + libbcachefs/btree_io.c | 392 +++--- libbcachefs/btree_io.h | 62 +- libbcachefs/btree_iter.c | 1782 ++++++++++++--------------- libbcachefs/btree_iter.h | 298 ++++- libbcachefs/btree_key_cache.c | 556 ++++++--- libbcachefs/btree_key_cache.h | 10 +- libbcachefs/btree_locking.c | 679 ++++++++++ libbcachefs/btree_locking.h | 385 ++++-- libbcachefs/btree_types.h | 258 ++-- libbcachefs/btree_update.h | 56 +- libbcachefs/btree_update_interior.c | 849 ++++++++----- libbcachefs/btree_update_interior.h | 10 +- libbcachefs/btree_update_leaf.c | 930 ++++++++------ libbcachefs/buckets.c | 1093 +++++++--------- libbcachefs/buckets.h | 192 +-- libbcachefs/buckets_types.h | 38 +- libbcachefs/chardev.c | 9 +- libbcachefs/checksum.c | 138 ++- libbcachefs/checksum.h | 6 +- libbcachefs/clock.c | 2 +- libbcachefs/compress.c | 22 +- libbcachefs/counters.c | 107 ++ libbcachefs/counters.h | 17 + libbcachefs/darray.h | 77 ++ libbcachefs/data_update.c | 388 ++++++ libbcachefs/data_update.h | 40 + libbcachefs/debug.c | 551 +++++++-- libbcachefs/dirent.c | 83 +- libbcachefs/dirent.h | 2 +- libbcachefs/disk_groups.c | 127 +- 
libbcachefs/disk_groups.h | 6 +- libbcachefs/ec.c | 320 +++-- libbcachefs/ec.h | 12 +- libbcachefs/errcode.c | 62 + libbcachefs/errcode.h | 95 +- libbcachefs/error.c | 156 ++- libbcachefs/error.h | 58 +- libbcachefs/extent_update.c | 13 +- libbcachefs/extents.c | 331 +++-- libbcachefs/extents.h | 35 +- libbcachefs/fs-common.c | 17 +- libbcachefs/fs-io.c | 441 ++++--- libbcachefs/fs-io.h | 11 +- libbcachefs/fs-ioctl.c | 66 +- libbcachefs/fs.c | 87 +- libbcachefs/fs.h | 4 +- libbcachefs/fsck.c | 1073 ++++++++-------- libbcachefs/inode.c | 342 +++-- libbcachefs/inode.h | 60 +- libbcachefs/io.c | 291 +++-- libbcachefs/io.h | 5 +- libbcachefs/journal.c | 855 +++++++------ libbcachefs/journal.h | 126 +- libbcachefs/journal_io.c | 821 ++++++------ libbcachefs/journal_io.h | 19 +- libbcachefs/journal_reclaim.c | 132 +- libbcachefs/journal_sb.c | 220 ++++ libbcachefs/journal_sb.h | 24 + libbcachefs/journal_seq_blacklist.c | 11 +- libbcachefs/journal_types.h | 55 +- libbcachefs/keylist.c | 1 + libbcachefs/lru.c | 206 ++++ libbcachefs/lru.h | 19 + libbcachefs/migrate.c | 126 +- libbcachefs/move.c | 1095 ++++++++-------- libbcachefs/move.h | 74 +- libbcachefs/movinggc.c | 327 ++--- libbcachefs/movinggc.h | 1 + libbcachefs/opts.c | 138 ++- libbcachefs/opts.h | 109 +- libbcachefs/quota.c | 353 ++++-- libbcachefs/quota.h | 2 +- libbcachefs/rebalance.c | 141 ++- libbcachefs/recovery.c | 614 +++++---- libbcachefs/recovery.h | 20 +- libbcachefs/reflink.c | 96 +- libbcachefs/reflink.h | 23 +- libbcachefs/replicas.c | 132 +- libbcachefs/replicas.h | 2 + libbcachefs/siphash.c | 2 +- libbcachefs/str_hash.h | 68 +- libbcachefs/subvolume.c | 603 ++++----- libbcachefs/subvolume.h | 79 +- libbcachefs/subvolume_types.h | 8 +- libbcachefs/super-io.c | 630 +++++++--- libbcachefs/super-io.h | 14 +- libbcachefs/super.c | 420 +++---- libbcachefs/super.h | 8 +- libbcachefs/super_types.h | 1 + libbcachefs/sysfs.c | 446 +++---- libbcachefs/sysfs.h | 14 +- libbcachefs/tests.c | 269 ++-- 
libbcachefs/trace.c | 6 +- libbcachefs/util.c | 502 +++++--- libbcachefs/util.h | 135 +- libbcachefs/varint.c | 1 + libbcachefs/vstructs.h | 2 +- libbcachefs/xattr.c | 113 +- libbcachefs/xattr.h | 2 +- linux/bio.c | 172 ++- linux/blkdev.c | 23 +- linux/generic-radix-tree.c | 94 +- linux/int_sqrt.c | 71 ++ linux/kthread.c | 2 + linux/mean_and_variance.c | 178 +++ linux/pretty-printers.c | 60 + linux/printbuf.c | 368 ++++++ linux/printbuf_userspace.c | 29 + linux/ratelimit.c | 69 ++ linux/shrinker.c | 53 +- linux/six.c | 526 ++++---- linux/string.c | 27 + linux/string_helpers.c | 131 ++ linux/timer.c | 8 +- linux/zstd_compress_module.c | 157 +++ linux/zstd_decompress_module.c | 103 ++ nix/overlay.nix | 1 - qcow2.c | 2 +- shell.nix | 18 + tests/valgrind-suppressions.txt | 21 + tools-util.c | 77 +- tools-util.h | 33 +- 211 files changed, 22544 insertions(+), 12762 deletions(-) delete mode 100644 ccan/darray/LICENSE delete mode 100644 ccan/darray/_info delete mode 100644 ccan/darray/darray.h create mode 100644 cmd_dump.c rename cmd_debug.c => cmd_list.c (57%) create mode 100644 cmd_list_journal.c create mode 100644 cmd_option.c create mode 100644 debian/bcachefs-tools.postinst create mode 100644 debian/bcachefs-tools.postrm create mode 100644 include/linux/errname.h create mode 100644 include/linux/kmemleak.h create mode 100644 include/linux/mean_and_variance.h create mode 100644 include/linux/mm.h create mode 100644 include/linux/prandom.h create mode 100644 include/linux/pretty-printers.h create mode 100644 include/linux/printbuf.h create mode 100644 include/linux/string_helpers.h create mode 100644 include/linux/zstd_errors.h create mode 100644 libbcachefs/backpointers.c create mode 100644 libbcachefs/backpointers.h create mode 100644 libbcachefs/bbpos.h create mode 100644 libbcachefs/bkey_cmp.h create mode 100644 libbcachefs/btree_locking.c create mode 100644 libbcachefs/counters.c create mode 100644 libbcachefs/counters.h create mode 100644 libbcachefs/darray.h 
create mode 100644 libbcachefs/data_update.c create mode 100644 libbcachefs/data_update.h create mode 100644 libbcachefs/errcode.c create mode 100644 libbcachefs/journal_sb.c create mode 100644 libbcachefs/journal_sb.h create mode 100644 libbcachefs/lru.c create mode 100644 libbcachefs/lru.h create mode 100644 linux/int_sqrt.c create mode 100644 linux/mean_and_variance.c create mode 100644 linux/pretty-printers.c create mode 100644 linux/printbuf.c create mode 100644 linux/printbuf_userspace.c create mode 100644 linux/ratelimit.c create mode 100644 linux/string_helpers.c create mode 100644 linux/zstd_compress_module.c create mode 100644 linux/zstd_decompress_module.c create mode 100644 shell.nix diff --git a/Makefile b/Makefile index e49534e..d460a6d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ PREFIX?=/usr/local PKG_CONFIG?=pkg-config INSTALL=install -CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \ +CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \ -Wno-pointer-sign \ -fno-strict-aliasing \ -fno-delete-null-pointer-checks \ @@ -47,7 +47,7 @@ CFLAGS+=$(call cc-disable-warning, zero-length-array) CFLAGS+=$(call cc-disable-warning, shift-overflow) CFLAGS+=$(call cc-disable-warning, enum-conversion) -PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev" +PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev libkeyutils" ifdef BCACHEFS_FUSE PKGCONFIG_LIBS+="fuse3 >= 3.7" CFLAGS+=-DBCACHEFS_FUSE @@ -189,6 +189,22 @@ update-bcachefs-sources: git add include/linux/list_nulls.h cp $(LINUX_DIR)/include/linux/poison.h include/linux/ git add include/linux/poison.h + cp $(LINUX_DIR)/include/linux/generic-radix-tree.h include/linux/ + git add include/linux/generic-radix-tree.h + cp $(LINUX_DIR)/lib/generic-radix-tree.c linux/ + git add linux/generic-radix-tree.c + cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/ + git add include/linux/kmemleak.h + cp $(LINUX_DIR)/include/linux/printbuf.h include/linux/ + git add 
include/linux/printbuf.h + cp $(LINUX_DIR)/lib/printbuf.c linux/ + git add linux/printbuf.c + cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ + git add linux/mean_and_variance.c + cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ + git add include/linux/mean_and_variance.h + cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ + git add linux/int_sqrt.c cp $(LINUX_DIR)/scripts/Makefile.compiler ./ git add Makefile.compiler $(RM) libbcachefs/*.mod.c diff --git a/Makefile.compiler b/Makefile.compiler index 86ecd2a..94d0d40 100644 --- a/Makefile.compiler +++ b/Makefile.compiler @@ -21,8 +21,8 @@ TMPOUT = $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_$$$$ # automatically cleaned up. try-run = $(shell set -e; \ TMP=$(TMPOUT)/tmp; \ - mkdir -p $(TMPOUT); \ trap "rm -rf $(TMPOUT)" EXIT; \ + mkdir -p $(TMPOUT); \ if ($(1)) >/dev/null 2>&1; \ then echo "$(2)"; \ else echo "$(3)"; \ diff --git a/bcachefs.8 b/bcachefs.8 index 874068c..d5c4e89 100644 --- a/bcachefs.8 +++ b/bcachefs.8 @@ -99,7 +99,7 @@ Format one or a list of devices with bcachefs data structures. You need to do this before you create a volume. .Pp Device specific options must come before corresponding devices, e.g. -.Dl bcachefs format --group=ssd /dev/sda --label=hdd /dev/sdb +.Dl bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb .Bl -tag -width Ds .It Fl b , Fl -block Ns = Ns Ar size block size, in bytes (e.g. 
4k) @@ -231,8 +231,9 @@ Force, if data redundancy will be degraded .El .It Nm Ic device Ic evacuate Ar device Move data off of a given device -.It Nm Ic device Ic set-state Oo Ar options Oc Ar device Ar new-state +.It Nm Ic device Ic set-state Oo Ar options Oc Ar new-state Ar device .Bl -tag -width Ds +.It Ar new-state Ns = Ns ( Ar rw | ro | failed | spare ) .It Fl f , Fl -force Force, if data redundancy will be degraded .El diff --git a/bcachefs.c b/bcachefs.c index 4f2cd55..31d9628 100644 --- a/bcachefs.c +++ b/bcachefs.c @@ -33,6 +33,7 @@ static void usage(void) "Superblock commands:\n" " format Format a new filesystem\n" " show-super Dump superblock information to stdout\n" + " set-option Set a filesystem option\n" "\n" "Repair:\n" " fsck Check an existing filesystem for errors\n" @@ -59,9 +60,9 @@ static void usage(void) " device resize-journal Resize journal on a device\n" "\n" "Commands for managing subvolumes and snapshots:\n" - " subvolume create Create a new subvolume\n" - " subvolume delete Delete an existing subvolume\n" - " subvolume snapshot Create a snapshot\n" + " subvolume create Create a new subvolume\n" + " subvolume delete Delete an existing subvolume\n" + " subvolume snapshot Create a snapshot\n" "\n" "Commands for managing filesystem data:\n" " data rereplicate Rereplicate degraded data\n" @@ -199,6 +200,8 @@ int main(int argc, char *argv[]) return cmd_version(argc, argv); if (!strcmp(cmd, "show-super")) return cmd_show_super(argc, argv); + if (!strcmp(cmd, "set-option")) + return cmd_set_option(argc, argv); if (argc < 2) { printf("%s: missing command\n", argv[0]); @@ -235,6 +238,8 @@ int main(int argc, char *argv[]) return cmd_list(argc, argv); if (!strcmp(cmd, "list_journal")) return cmd_list_journal(argc, argv); + if (!strcmp(cmd, "kill_btree_node")) + return cmd_kill_btree_node(argc, argv); if (!strcmp(cmd, "setattr")) return cmd_setattr(argc, argv); diff --git a/ccan/darray/LICENSE b/ccan/darray/LICENSE deleted file mode 100644 index 
89de354..0000000 --- a/ccan/darray/LICENSE +++ /dev/null @@ -1,17 +0,0 @@ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/ccan/darray/_info b/ccan/darray/_info deleted file mode 100644 index b6d5e4b..0000000 --- a/ccan/darray/_info +++ /dev/null @@ -1,57 +0,0 @@ -#include "config.h" -#include -#include - -#include "ccan/darray/darray.h" - -/** - * darray - Generic resizable arrays - * - * darray is a set of macros for managing dynamically-allocated arrays. - * It removes the tedium of managing realloc'd arrays with pointer, size, and - * allocated size. 
- * - * Example: - * #include - * #include - * - * int main(void) { - * darray(int) numbers = darray_new(); - * char buffer[32]; - * - * for (;;) { - * int *i; - * darray_foreach(i, numbers) - * printf("%d ", *i); - * if (darray_size(numbers) > 0) - * puts(""); - * - * printf("darray> "); - * fgets(buffer, sizeof(buffer), stdin); - * if (*buffer == '\0' || *buffer == '\n') - * break; - * - * darray_append(numbers, atoi(buffer)); - * } - * - * darray_free(numbers); - * - * return 0; - * } - * - * Author: Joey Adams - * License: MIT - * Version: 0.2 - */ -int main(int argc, char *argv[]) -{ - if (argc != 2) - return 1; - - if (strcmp(argv[1], "depends") == 0) { - /* Nothing. */ - return 0; - } - - return 1; -} diff --git a/ccan/darray/darray.h b/ccan/darray/darray.h deleted file mode 100644 index 7511241..0000000 --- a/ccan/darray/darray.h +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (C) 2011 Joseph Adams - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#ifndef CCAN_DARRAY_H -#define CCAN_DARRAY_H - -#include -#include -#include "config.h" - -/* - * SYNOPSIS - * - * Life cycle of a darray (dynamically-allocated array): - * - * darray(int) a = darray_new(); - * darray_free(a); - * - * struct {darray(int) a;} foo; - * darray_init(foo.a); - * darray_free(foo.a); - * - * Typedefs for darrays of common types: - * - * darray_char, darray_schar, darray_uchar - * darray_short, darray_int, darray_long - * darray_ushort, darray_uint, darray_ulong - * - * Access: - * - * T darray_item(darray(T) arr, size_t index); - * size_t darray_size(darray(T) arr); - * size_t darray_alloc(darray(T) arr); - * bool darray_empty(darray(T) arr); - * - * Insertion (single item): - * - * void darray_append(darray(T) arr, T item); - * void darray_prepend(darray(T) arr, T item); - * void darray_push(darray(T) arr, T item); // same as darray_append - * - * Insertion (multiple items): - * - * void darray_append_items(darray(T) arr, T *items, size_t count); - * void darray_prepend_items(darray(T) arr, T *items, size_t count); - * - * void darray_appends(darray(T) arr, [T item, [...]]); - * void darray_prepends(darray(T) arr, [T item, [...]]); - * - * // Same functionality as above, but does not require typeof. 
- * void darray_appends_t(darray(T) arr, #T, [T item, [...]]); - * void darray_prepends_t(darray(T) arr, #T, [T item, [...]]); - * - * Removal: - * - * T darray_pop(darray(T) arr | darray_size(arr) != 0); - * T* darray_pop_check(darray(T*) arr); - * void darray_remove(darray(T) arr, size_t index); - * - * Replacement: - * - * void darray_from_items(darray(T) arr, T *items, size_t count); - * void darray_from_c(darray(T) arr, T c_array[N]); - * - * String buffer: - * - * void darray_append_string(darray(char) arr, const char *str); - * void darray_append_lit(darray(char) arr, char stringLiteral[N+1]); - * - * void darray_prepend_string(darray(char) arr, const char *str); - * void darray_prepend_lit(darray(char) arr, char stringLiteral[N+1]); - * - * void darray_from_string(darray(T) arr, const char *str); - * void darray_from_lit(darray(char) arr, char stringLiteral[N+1]); - * - * Size management: - * - * void darray_resize(darray(T) arr, size_t newSize); - * void darray_resize0(darray(T) arr, size_t newSize); - * - * void darray_realloc(darray(T) arr, size_t newAlloc); - * void darray_growalloc(darray(T) arr, size_t newAlloc); - * - * void darray_make_room(darray(T) arr, size_t room); - * - * Traversal: - * - * darray_foreach(T *&i, darray(T) arr) {...} - * darray_foreach_reverse(T *&i, darray(T) arr) {...} - * - * Except for darray_foreach, darray_foreach_reverse, and darray_remove, - * all macros evaluate their non-darray arguments only once. - */ - -/*** Life cycle ***/ - -#define darray(type) struct {type *item; size_t size; size_t alloc;} - -#define darray_new() {0,0,0} -#define darray_init(arr) do {(arr).item=0; (arr).size=0; (arr).alloc=0;} while(0) -#define darray_free(arr) do {free((arr).item);} while(0) - - -/* - * Typedefs for darrays of common types. These are useful - * when you want to pass a pointer to an darray(T) around. 
- * - * The following will produce an incompatible pointer warning: - * - * void foo(darray(int) *arr); - * darray(int) arr = darray_new(); - * foo(&arr); - * - * The workaround: - * - * void foo(darray_int *arr); - * darray_int arr = darray_new(); - * foo(&arr); - */ - -typedef darray(char) darray_char; -typedef darray(signed char) darray_schar; -typedef darray(unsigned char) darray_uchar; - -typedef darray(short) darray_short; -typedef darray(int) darray_int; -typedef darray(long) darray_long; - -typedef darray(unsigned short) darray_ushort; -typedef darray(unsigned int) darray_uint; -typedef darray(unsigned long) darray_ulong; - - -/*** Access ***/ - -#define darray_item(arr, i) ((arr).item[i]) -#define darray_size(arr) ((arr).size) -#define darray_alloc(arr) ((arr).alloc) -#define darray_empty(arr) ((arr).size == 0) - - -/*** Insertion (single item) ***/ - -#define darray_append(arr, ...) do { \ - darray_resize(arr, (arr).size+1); \ - (arr).item[(arr).size-1] = (__VA_ARGS__); \ - } while(0) -#define darray_prepend(arr, ...) do { \ - darray_resize(arr, (arr).size+1); \ - memmove((arr).item+1, (arr).item, ((arr).size-1)*sizeof(*(arr).item)); \ - (arr).item[0] = (__VA_ARGS__); \ - } while(0) -#define darray_push(arr, ...) 
darray_append(arr, __VA_ARGS__) - - -/*** Insertion (multiple items) ***/ - -#define darray_append_items(arr, items, count) do { \ - size_t __count = (count), __oldSize = (arr).size; \ - darray_resize(arr, __oldSize + __count); \ - memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \ - } while(0) - -#define darray_prepend_items(arr, items, count) do { \ - size_t __count = (count), __oldSize = (arr).size; \ - darray_resize(arr, __count + __oldSize); \ - memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \ - memcpy((arr).item, items, __count * sizeof(*(arr).item)); \ - } while(0) - -#define darray_append_items_nullterminate(arr, items, count) do { \ - size_t __count = (count), __oldSize = (arr).size; \ - darray_resize(arr, __oldSize + __count + 1); \ - memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \ - (arr).item[--(arr).size] = 0; \ - } while(0) - -#define darray_prepend_items_nullterminate(arr, items, count) do { \ - size_t __count = (count), __oldSize = (arr).size; \ - darray_resize(arr, __count + __oldSize + 1); \ - memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \ - memcpy((arr).item, items, __count * sizeof(*(arr).item)); \ - (arr).item[--(arr).size] = 0; \ - } while(0) - -#if HAVE_TYPEOF -#define darray_appends(arr, ...) darray_appends_t(arr, typeof((*(arr).item)), __VA_ARGS__) -#define darray_prepends(arr, ...) darray_prepends_t(arr, typeof((*(arr).item)), __VA_ARGS__) -#endif - -#define darray_appends_t(arr, type, ...) do { \ - type __src[] = {__VA_ARGS__}; \ - darray_append_items(arr, __src, sizeof(__src)/sizeof(*__src)); \ - } while(0) -#define darray_prepends_t(arr, type, ...) do { \ - type __src[] = {__VA_ARGS__}; \ - darray_prepend_items(arr, __src, sizeof(__src)/sizeof(*__src)); \ - } while(0) - - -/*** Removal ***/ - -/* Warning: Do not call darray_pop on an empty darray. 
*/ -#define darray_pop(arr) ((arr).item[--(arr).size]) -#define darray_pop_check(arr) ((arr).size ? darray_pop(arr) : NULL) -/* Warning, slow: Requires copying all elements after removed item. */ -#define darray_remove(arr, index) do { \ - if (index < arr.size-1) \ - memmove(&(arr).item[index], &(arr).item[index+1], ((arr).size-1-i)*sizeof(*(arr).item)); \ - (arr).size--; \ - } while(0) - - -/*** Replacement ***/ - -#define darray_from_items(arr, items, count) do {size_t __count = (count); darray_resize(arr, __count); memcpy((arr).item, items, __count*sizeof(*(arr).item));} while(0) -#define darray_from_c(arr, c_array) darray_from_items(arr, c_array, sizeof(c_array)/sizeof(*(c_array))) - - -/*** String buffer ***/ - -#define darray_append_string(arr, str) do {const char *__str = (str); darray_append_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0) -#define darray_append_lit(arr, stringLiteral) do {darray_append_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0) - -#define darray_prepend_string(arr, str) do { \ - const char *__str = (str); \ - darray_prepend_items_nullterminate(arr, __str, strlen(__str)); \ - } while(0) -#define darray_prepend_lit(arr, stringLiteral) \ - darray_prepend_items_nullterminate(arr, stringLiteral, sizeof(stringLiteral) - 1) - -#define darray_from_string(arr, str) do {const char *__str = (str); darray_from_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0) -#define darray_from_lit(arr, stringLiteral) do {darray_from_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0) - - -/*** Size management ***/ - -#define darray_resize(arr, newSize) darray_growalloc(arr, (arr).size = (newSize)) -#define darray_resize0(arr, newSize) do { \ - size_t __oldSize = (arr).size, __newSize = (newSize); \ - (arr).size = __newSize; \ - if (__newSize > __oldSize) { \ - darray_growalloc(arr, __newSize); \ - memset(&(arr).item[__oldSize], 0, (__newSize - __oldSize) * sizeof(*(arr).item)); \ - } \ - } 
while(0) - -#define darray_realloc(arr, newAlloc) do { \ - (arr).item = realloc((arr).item, ((arr).alloc = (newAlloc)) * sizeof(*(arr).item)); \ - } while(0) -#define darray_growalloc(arr, need) do { \ - size_t __need = (need); \ - if (__need > (arr).alloc) \ - darray_realloc(arr, darray_next_alloc((arr).alloc, __need)); \ - } while(0) - -#if HAVE_STATEMENT_EXPR==1 -#define darray_make_room(arr, room) ({size_t newAlloc = (arr).size+(room); if ((arr).alloc &(arr).item[0]; ) - - -#endif /* CCAN_DARRAY_H */ - -/* - -darray_growalloc(arr, newAlloc) sees if the darray can currently hold newAlloc items; - if not, it increases the alloc to satisfy this requirement, allocating slack - space to avoid having to reallocate for every size increment. - -darray_from_string(arr, str) copies a string to an darray_char. - -darray_push(arr, item) pushes an item to the end of the darray. -darray_pop(arr) pops it back out. Be sure there is at least one item in the darray before calling. -darray_pop_check(arr) does the same as darray_pop, but returns NULL if there are no more items left in the darray. - -darray_make_room(arr, room) ensures there's 'room' elements of space after the end of the darray, and it returns a pointer to this space. -Currently requires HAVE_STATEMENT_EXPR, but I plan to remove this dependency by creating an inline function. - -The following require HAVE_TYPEOF==1 : - -darray_appends(arr, item0, item1...) appends a collection of comma-delimited items to the darray. -darray_prepends(arr, item0, item1...) 
prepends a collection of comma-delimited items to the darray.\ - - -Examples: - - darray(int) arr; - int *i; - - darray_appends(arr, 0,1,2,3,4); - darray_appends(arr, -5,-4,-3,-2,-1); - darray_foreach(i, arr) - printf("%d ", *i); - printf("\n"); - - darray_free(arr); - - - typedef struct {int n,d;} Fraction; - darray(Fraction) fractions; - Fraction *i; - - darray_appends(fractions, {3,4}, {3,5}, {2,1}); - darray_foreach(i, fractions) - printf("%d/%d\n", i->n, i->d); - - darray_free(fractions); -*/ diff --git a/cmd_attr.c b/cmd_attr.c index 736554c..9e7f563 100644 --- a/cmd_attr.c +++ b/cmd_attr.c @@ -87,7 +87,7 @@ static void setattr_usage(void) bch2_opts_usage(OPT_INODE); puts(" -h Display this help and exit\n" - "Report bugs to "); + "Report bugs to "); } int cmd_setattr(int argc, char *argv[]) diff --git a/cmd_data.c b/cmd_data.c index d78598d..160eb91 100644 --- a/cmd_data.c +++ b/cmd_data.c @@ -18,7 +18,7 @@ int data_usage(void) " rereplicate Rereplicate degraded data\n" " job Kick off low level data jobs\n" "\n" - "Report bugs to "); + "Report bugs to "); return 0; } @@ -32,7 +32,7 @@ static void data_rereplicate_usage(void) "\n" "Options:\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } @@ -77,7 +77,7 @@ static void data_job_usage(void) " -s inode:offset start position\n" " -e inode:offset end position\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } diff --git a/cmd_device.c b/cmd_device.c index ef2dfa1..e3c5d51 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -53,7 +53,7 @@ static void device_add_usage(void) " -f, --force Use device even if it appears to already be formatted\n" " -h, --help Display this help and exit\n" "\n" - "Report bugs to "); + "Report bugs to "); } int cmd_device_add(int argc, char *argv[]) @@ -147,7 +147,7 @@ static void device_remove_usage(void) " -F, --force-metadata Force removal, even if some metadata\n" " 
couldn't be migrated\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } @@ -214,7 +214,7 @@ static void device_online_usage(void) "Options:\n" " -h, --help Display this help and exit\n" "\n" - "Report bugs to "); + "Report bugs to "); } int cmd_device_online(int argc, char *argv[]) @@ -251,7 +251,7 @@ static void device_offline_usage(void) " -f, --force Force, if data redundancy will be degraded\n" " -h, --help Display this help and exit\n" "\n" - "Report bugs to "); + "Report bugs to "); } int cmd_device_offline(int argc, char *argv[]) @@ -295,7 +295,7 @@ static void device_evacuate_usage(void) "Options:\n" " -h, --help Display this help and exit\n" "\n" - "Report bugs to "); + "Report bugs to "); } int cmd_device_evacuate(int argc, char *argv[]) @@ -350,7 +350,7 @@ static void device_set_state_usage(void) " --force-if-data-lost Force, if data will be lost\n" " -o, --offline Set state of an offline device\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } @@ -418,9 +418,12 @@ int cmd_device_set_state(int argc, char *argv[]) le64_add_cpu(&sb.sb->seq, 1); - bch2_super_write(sb.bdev->bd_fd, sb.sb); + bch2_super_write(sb.bdev->bd_buffered_fd, sb.sb); + ret = fsync(sb.bdev->bd_buffered_fd); + if (ret) + fprintf(stderr, "error writing superblock: fsync error (%m)"); bch2_free_super(&sb); - return 0; + return ret; } char *fs_path = arg_pop(); @@ -451,7 +454,7 @@ static void device_resize_usage(void) "\n" "Options:\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } @@ -559,7 +562,7 @@ static void device_resize_journal_usage(void) "\n" "Options:\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } diff --git a/cmd_dump.c b/cmd_dump.c new file mode 100644 index 0000000..4e3d721 --- /dev/null +++ b/cmd_dump.c @@ -0,0 +1,182 @@ +#include 
+#include +#include +#include + +#include "cmds.h" +#include "libbcachefs.h" +#include "qcow2.h" + +#include "libbcachefs/bcachefs.h" +#include "libbcachefs/btree_cache.h" +#include "libbcachefs/btree_iter.h" +#include "libbcachefs/error.h" +#include "libbcachefs/extents.h" +#include "libbcachefs/super.h" + +static void dump_usage(void) +{ + puts("bcachefs dump - dump filesystem metadata\n" + "Usage: bcachefs dump [OPTION]... \n" + "\n" + "Options:\n" + " -o output Output qcow2 image(s)\n" + " -f Force; overwrite when needed\n" + " -j Dump entire journal, not just dirty entries\n" + " -h Display this help and exit\n" + "Report bugs to "); +} + +static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, + bool entire_journal) +{ + struct bch_sb *sb = ca->disk_sb.sb; + ranges data = { 0 }; + unsigned i; + int ret; + + /* Superblock: */ + range_add(&data, BCH_SB_LAYOUT_SECTOR << 9, + sizeof(struct bch_sb_layout)); + + for (i = 0; i < sb->layout.nr_superblocks; i++) + range_add(&data, + le64_to_cpu(sb->layout.sb_offset[i]) << 9, + vstruct_bytes(sb)); + + /* Journal: */ + for (i = 0; i < ca->journal.nr; i++) + if (entire_journal || + ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) { + u64 bucket = ca->journal.buckets[i]; + + range_add(&data, + bucket_bytes(ca) * bucket, + bucket_bytes(ca)); + } + + /* Btree: */ + for (i = 0; i < BTREE_ID_NR; i++) { + const struct bch_extent_ptr *ptr; + struct bkey_ptrs_c ptrs; + struct btree_trans trans; + struct btree_iter iter; + struct btree *b; + + bch2_trans_init(&trans, c, 0, 0); + + __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) { + struct btree_node_iter iter; + struct bkey u; + struct bkey_s_c k; + + for_each_btree_node_key_unpack(b, k, &iter, &u) { + ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == ca->dev_idx) + range_add(&data, + ptr->offset << 9, + btree_bytes(c)); + } + } + + if (ret) + die("error %s walking btree nodes", strerror(-ret)); + + b = 
c->btree_roots[i].b; + if (!btree_node_fake(b)) { + ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == ca->dev_idx) + range_add(&data, + ptr->offset << 9, + btree_bytes(c)); + } + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + } + + qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data, + max_t(unsigned, btree_bytes(c) / 8, block_bytes(c))); + darray_exit(&data); +} + +int cmd_dump(int argc, char *argv[]) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_dev *ca; + char *out = NULL; + unsigned i, nr_devices = 0; + bool force = false, entire_journal = false; + int fd, opt; + + opt_set(opts, nochanges, true); + opt_set(opts, norecovery, true); + opt_set(opts, degraded, true); + opt_set(opts, errors, BCH_ON_ERROR_continue); + opt_set(opts, fix_errors, FSCK_OPT_NO); + + while ((opt = getopt(argc, argv, "o:fjvh")) != -1) + switch (opt) { + case 'o': + out = optarg; + break; + case 'f': + force = true; + break; + case 'j': + entire_journal = true; + break; + case 'v': + opt_set(opts, verbose, true); + break; + case 'h': + dump_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + if (!out) + die("Please supply output filename"); + + if (!argc) + die("Please supply device(s) to check"); + + struct bch_fs *c = bch2_fs_open(argv, argc, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); + + down_read(&c->gc_lock); + + for_each_online_member(ca, c, i) + nr_devices++; + + BUG_ON(!nr_devices); + + for_each_online_member(ca, c, i) { + int flags = O_WRONLY|O_CREAT|O_TRUNC; + + if (!force) + flags |= O_EXCL; + + if (!c->devs[i]) + continue; + + char *path = nr_devices > 1 + ? 
mprintf("%s.%u.qcow2", out, i) + : mprintf("%s.qcow2", out); + fd = xopen(path, flags, 0600); + free(path); + + dump_one_device(c, ca, fd, entire_journal); + close(fd); + } + + up_read(&c->gc_lock); + + bch2_fs_stop(c); + return 0; +} diff --git a/cmd_format.c b/cmd_format.c index cc16b31..4debc28 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -20,11 +20,10 @@ #include -#include "ccan/darray/darray.h" - #include "cmds.h" #include "libbcachefs.h" #include "crypto.h" +#include "libbcachefs/darray.h" #include "libbcachefs/opts.h" #include "libbcachefs/super-io.h" #include "libbcachefs/util.h" @@ -46,6 +45,7 @@ x(0, version, required_argument) \ x(0, no_initialize, no_argument) \ x('f', force, no_argument) \ x('q', quiet, no_argument) \ +x('v', verbose, no_argument) \ x('h', help, no_argument) static void usage(void) @@ -73,12 +73,13 @@ static void usage(void) "\n" " -f, --force\n" " -q, --quiet Only print errors\n" + " -v, --verbose Verbose filesystem initialization\n" " -h, --help Display this help and exit\n" "\n" "Device specific options must come before corresponding devices, e.g.\n" " bcachefs format --label cache /dev/sdb /dev/sdc\n" "\n" - "Report bugs to "); + "Report bugs to "); } enum { @@ -112,23 +113,20 @@ u64 read_flag_list_or_die(char *opt, const char * const list[], int cmd_format(int argc, char *argv[]) { - darray(struct dev_opts) devices; - darray(char *) device_paths; + DARRAY(struct dev_opts) devices = { 0 }; + DARRAY(char *) device_paths = { 0 }; struct format_opts opts = format_opts_default(); struct dev_opts dev_opts = dev_opts_default(), *dev; - bool force = false, no_passphrase = false, quiet = false, initialize = true; + bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false; unsigned v; int opt; - darray_init(devices); - darray_init(device_paths); - struct bch_opt_strs fs_opt_strs = bch2_cmdline_opts_get(&argc, argv, OPT_FORMAT); struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs); while ((opt = 
getopt_long(argc, argv, - "-L:U:g:fqh", + "-L:U:g:fqhv", format_opts, NULL)) != -1) switch (opt) { @@ -199,15 +197,17 @@ int cmd_format(int argc, char *argv[]) initialize = false; break; case O_no_opt: - darray_append(device_paths, optarg); + darray_push(&device_paths, optarg); dev_opts.path = optarg; - darray_append(devices, dev_opts); + darray_push(&devices, dev_opts); dev_opts.size = 0; break; case O_quiet: case 'q': quiet = true; break; + case 'v': + verbose = true; case O_help: case 'h': usage(); @@ -218,7 +218,7 @@ int cmd_format(int argc, char *argv[]) break; } - if (darray_empty(devices)) + if (!devices.nr) die("Please supply a device"); if (opts.encrypted && !no_passphrase) { @@ -226,18 +226,26 @@ int cmd_format(int argc, char *argv[]) initialize = false; } - darray_foreach(dev, devices) + darray_for_each(devices, dev) dev->fd = open_for_format(dev->path, force); struct bch_sb *sb = bch2_format(fs_opt_strs, fs_opts, opts, - devices.item, darray_size(devices)); + devices.data, devices.nr); bch2_opt_strs_free(&fs_opt_strs); - if (!quiet) - bch2_sb_print(sb, false, 1 << BCH_SB_FIELD_members, HUMAN_READABLE); + if (!quiet) { + struct printbuf buf = PRINTBUF; + + buf.human_readable_units = true; + + bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members); + printf("%s", buf.buf); + + printbuf_exit(&buf); + } free(sb); if (opts.passphrase) { @@ -245,24 +253,29 @@ int cmd_format(int argc, char *argv[]) free(opts.passphrase); } - darray_free(devices); + darray_exit(&devices); if (initialize) { + struct bch_opts mount_opts = bch2_opts_empty(); + + + opt_set(mount_opts, verbose, verbose); + /* * Start the filesystem once, to allocate the journal and create * the root directory: */ - struct bch_fs *c = bch2_fs_open(device_paths.item, - darray_size(device_paths), - bch2_opts_empty()); + struct bch_fs *c = bch2_fs_open(device_paths.data, + device_paths.nr, + mount_opts); if (IS_ERR(c)) - die("error opening %s: %s", device_paths.item[0], + die("error opening %s: 
%s", device_paths.data[0], strerror(-PTR_ERR(c))); bch2_fs_stop(c); } - darray_free(device_paths); + darray_exit(&device_paths); return 0; } @@ -276,7 +289,7 @@ static void show_super_usage(void) " -f, --fields=(fields) list of sections to print\n" " -l, --layout print superblock layout\n" " -h, --help display this help and exit\n" - "Report bugs to "); + "Report bugs to "); exit(EXIT_SUCCESS); } @@ -325,7 +338,14 @@ int cmd_show_super(int argc, char *argv[]) if (ret) die("Error opening %s: %s", dev, strerror(-ret)); - bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE); + struct printbuf buf = PRINTBUF; + + buf.human_readable_units = true; + + bch2_sb_to_text(&buf, sb.sb, print_layout, fields); + printf("%s", buf.buf); + bch2_free_super(&sb); + printbuf_exit(&buf); return 0; } diff --git a/cmd_fs.c b/cmd_fs.c index f8c4642..007c8d8 100644 --- a/cmd_fs.c +++ b/cmd_fs.c @@ -4,66 +4,96 @@ #include -#include "ccan/darray/darray.h" - #include "linux/sort.h" #include "libbcachefs/bcachefs_ioctl.h" +#include "libbcachefs/darray.h" #include "libbcachefs/opts.h" #include "cmds.h" #include "libbcachefs.h" -static void print_dev_usage_type(const char *type, - unsigned bucket_size, - u64 buckets, u64 sectors, - enum units units) +static void __dev_usage_type_to_text(struct printbuf *out, + const char *type, + unsigned bucket_size, + u64 buckets, u64 sectors, u64 frag) { - u64 frag = max((s64) buckets * bucket_size - (s64) sectors, 0LL); + prt_printf(out, "%s:", type); + prt_tab(out); + + prt_units_u64(out, sectors << 9); + prt_tab_rjust(out); + + prt_printf(out, "%llu", buckets); + prt_tab_rjust(out); + + if (frag) { + prt_units_u64(out, frag << 9); + prt_tab_rjust(out); + } + prt_newline(out); +} - printf_pad(20, " %s:", type); - printf(" %15s %15llu %15s\n", - pr_units(sectors, units), - buckets, - pr_units(frag, units)); +static void dev_usage_type_to_text(struct printbuf *out, + struct bch_ioctl_dev_usage *u, + enum bch_data_type type) +{ + 
__dev_usage_type_to_text(out, bch2_data_types[type], + u->bucket_size, + u->d[type].buckets, + u->d[type].sectors, + u->d[type].fragmented); } -static void print_dev_usage(struct bchfs_handle fs, - struct dev_name *d, - enum units units) +static void dev_usage_to_text(struct printbuf *out, + struct bchfs_handle fs, + struct dev_name *d) { struct bch_ioctl_dev_usage u = bchu_dev_usage(fs, d->idx); unsigned i; - printf("\n"); - printf_pad(20, "%s (device %u):", d->label ?: "(no label)", d->idx); - printf("%30s%16s\n", d->dev ?: "(device not found)", bch2_member_states[u.state]); - - printf("%-20s%16s%16s%16s\n", - "", "data", "buckets", "fragmented"); - - for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++) - print_dev_usage_type(bch2_data_types[i], - u.bucket_size, - u.buckets[i], - u.sectors[i], - units); - - print_dev_usage_type("erasure coded", - u.bucket_size, - u.ec_buckets, - u.ec_sectors, - units); - - printf_pad(20, " available:"); - printf(" %15s %15llu\n", - pr_units(u.available_buckets * u.bucket_size, units), - u.available_buckets); - - printf_pad(20, " capacity:"); - printf(" %15s %15llu\n", - pr_units(u.nr_buckets * u.bucket_size, units), - u.nr_buckets); + prt_newline(out); + prt_printf(out, "%s (device %u):", d->label ?: "(no label)", d->idx); + prt_tab(out); + prt_str(out, d->dev ?: "(device not found)"); + prt_tab_rjust(out); + + prt_str(out, bch2_member_states[u.state]); + prt_tab_rjust(out); + + prt_newline(out); + + printbuf_indent_add(out, 2); + prt_tab(out); + + prt_str(out, "data"); + prt_tab_rjust(out); + + prt_str(out, "buckets"); + prt_tab_rjust(out); + + prt_str(out, "fragmented"); + prt_tab_rjust(out); + + prt_newline(out); + + for (i = 0; i < BCH_DATA_NR; i++) + dev_usage_type_to_text(out, &u, i); + __dev_usage_type_to_text(out, "erasure coded", + u.bucket_size, + u.buckets_ec, u.buckets_ec * u.bucket_size, 0); + + prt_str(out, "capacity:"); + prt_tab(out); + + prt_units_u64(out, (u.nr_buckets * u.bucket_size) << 9); + prt_tab_rjust(out); + 
prt_printf(out, "%llu", u.nr_buckets); + prt_tab_rjust(out); + + printbuf_indent_sub(out, 2); + + prt_newline(out); } static int dev_by_label_cmp(const void *_l, const void *_r) @@ -81,15 +111,16 @@ static struct dev_name *dev_idx_to_name(dev_names *dev_names, unsigned idx) { struct dev_name *dev; - darray_foreach(dev, *dev_names) + darray_for_each(*dev_names, dev) if (dev->idx == idx) return dev; return NULL; } -static void print_replicas_usage(const struct bch_replicas_usage *r, - dev_names *dev_names, enum units units) +static void replicas_usage_to_text(struct printbuf *out, + const struct bch_replicas_usage *r, + dev_names *dev_names) { unsigned i; @@ -113,10 +144,18 @@ static void print_replicas_usage(const struct bch_replicas_usage *r, *d++ = ']'; *d++ = '\0'; - printf_pad(16, "%s: ", bch2_data_types[r->r.data_type]); - printf_pad(16, "%u/%u ", r->r.nr_required, r->r.nr_devs); - printf_pad(32, "%s ", devs); - printf(" %s\n", pr_units(r->sectors, units)); + prt_printf(out, "%s: ", bch2_data_types[r->r.data_type]); + prt_tab(out); + + prt_printf(out, "%u/%u ", r->r.nr_required, r->r.nr_devs); + prt_tab(out); + + prt_printf(out, "%s ", devs); + prt_tab(out); + + prt_units_u64(out, r->sectors << 9); + prt_tab_rjust(out); + prt_newline(out); } #define for_each_usage_replica(_u, _r) \ @@ -125,10 +164,9 @@ static void print_replicas_usage(const struct bch_replicas_usage *r, _r = replicas_usage_next(_r), \ BUG_ON((void *) _r > (void *) (_u)->replicas + (_u)->replica_entries_bytes)) -static void print_fs_usage(const char *path, enum units units) +static void fs_usage_to_text(struct printbuf *out, const char *path) { unsigned i; - char uuid[40]; struct bchfs_handle fs = bcache_fs_open(path); @@ -137,60 +175,102 @@ static void print_fs_usage(const char *path, enum units units) struct bch_ioctl_fs_usage *u = bchu_fs_usage(fs); - uuid_unparse(fs.uuid.b, uuid); - printf("Filesystem %s:\n", uuid); + prt_str(out, "Filesystem: "); + pr_uuid(out, fs.uuid.b); + 
prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 16); - printf("%-20s%12s\n", "Size:", pr_units(u->capacity, units)); - printf("%-20s%12s\n", "Used:", pr_units(u->used, units)); + prt_str(out, "Size:"); + prt_tab(out); + prt_units_u64(out, u->capacity << 9); + prt_tab_rjust(out); + prt_newline(out); - printf("%-20s%12s\n", "Online reserved:", pr_units(u->online_reserved, units)); + prt_str(out, "Used:"); + prt_tab(out); + prt_units_u64(out, u->used << 9); + prt_tab_rjust(out); + prt_newline(out); - printf("\n"); - printf("%-16s%-16s%s\n", "Data type", "Required/total", "Devices"); + prt_str(out, "Online reserved:"); + prt_tab(out); + prt_units_u64(out, u->online_reserved << 9); + prt_tab_rjust(out); + prt_newline(out); + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 18); + printbuf_tabstop_push(out, 18); + + prt_str(out, "Data type"); + prt_tab(out); + + prt_str(out, "Required/total"); + prt_tab(out); + + prt_str(out, "Devices"); + prt_newline(out); for (i = 0; i < BCH_REPLICAS_MAX; i++) { if (!u->persistent_reserved[i]) continue; - printf_pad(16, "%s: ", "reserved"); - printf_pad(16, "%u/%u ", 1, i); - printf_pad(32, "[] "); - printf("%s\n", pr_units(u->persistent_reserved[i], units)); + prt_str(out, "reserved:"); + prt_tab(out); + prt_printf(out, "%u/%u ", 1, i); + prt_tab(out); + prt_str(out, "[] "); + prt_units_u64(out, u->persistent_reserved[i] << 9); + prt_tab_rjust(out); + prt_newline(out); } struct bch_replicas_usage *r; for_each_usage_replica(u, r) if (r->r.data_type < BCH_DATA_user) - print_replicas_usage(r, &dev_names, units); + replicas_usage_to_text(out, r, &dev_names); for_each_usage_replica(u, r) if (r->r.data_type == BCH_DATA_user && r->r.nr_required <= 1) - print_replicas_usage(r, &dev_names, units); + replicas_usage_to_text(out, r, &dev_names); for_each_usage_replica(u, r) if 
(r->r.data_type == BCH_DATA_user && r->r.nr_required > 1) - print_replicas_usage(r, &dev_names, units); + replicas_usage_to_text(out, r, &dev_names); for_each_usage_replica(u, r) if (r->r.data_type > BCH_DATA_user) - print_replicas_usage(r, &dev_names, units); + replicas_usage_to_text(out, r, &dev_names); free(u); - sort(&darray_item(dev_names, 0), darray_size(dev_names), - sizeof(darray_item(dev_names, 0)), dev_by_label_cmp, NULL); + sort(dev_names.data, dev_names.nr, + sizeof(dev_names.data[0]), dev_by_label_cmp, NULL); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 14); - darray_foreach(dev, dev_names) - print_dev_usage(fs, dev, units); + darray_for_each(dev_names, dev) + dev_usage_to_text(out, fs, dev); - darray_foreach(dev, dev_names) { + darray_for_each(dev_names, dev) { free(dev->dev); free(dev->label); } - darray_free(dev_names); + darray_exit(&dev_names); bcache_fs_close(fs); } @@ -209,24 +289,33 @@ int fs_usage(void) int cmd_fs_usage(int argc, char *argv[]) { - enum units units = BYTES; + bool human_readable = false; + struct printbuf buf = PRINTBUF; char *fs; int opt; while ((opt = getopt(argc, argv, "h")) != -1) switch (opt) { case 'h': - units = HUMAN_READABLE; + human_readable = true; break; } args_shift(optind); if (!argc) { - print_fs_usage(".", units); + printbuf_reset(&buf); + buf.human_readable_units = human_readable; + fs_usage_to_text(&buf, "."); + printf("%s", buf.buf); } else { - while ((fs = arg_pop())) - print_fs_usage(fs, units); + while ((fs = arg_pop())) { + printbuf_reset(&buf); + buf.human_readable_units = human_readable; + fs_usage_to_text(&buf, fs); + printf("%s", buf.buf); + } } + printbuf_exit(&buf); return 0; } diff --git a/cmd_key.c b/cmd_key.c index 6052cb0..63b0541 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -14,20 +14,26 @@ static void unlock_usage(void) "\n" "Options:\n" " -c Check if a device is encrypted\n" + 
" -k (session|user|user_session)\n" + " Keyring to add to (default: user)\n" " -h Display this help and exit\n" - "Report bugs to "); + "Report bugs to "); } int cmd_unlock(int argc, char *argv[]) { + const char *keyring = "user"; bool check = false; int opt; - while ((opt = getopt(argc, argv, "ch")) != -1) + while ((opt = getopt(argc, argv, "ck:h")) != -1) switch (opt) { case 'c': check = true; break; + case 'k': + keyring = strdup(optarg); + break; case 'h': unlock_usage(); exit(EXIT_SUCCESS); @@ -59,7 +65,7 @@ int cmd_unlock(int argc, char *argv[]) char *passphrase = read_passphrase("Enter passphrase: "); - bch2_add_key(sb.sb, passphrase); + bch2_add_key(sb.sb, "user", keyring, passphrase); bch2_free_super(&sb); memzero_explicit(passphrase, strlen(passphrase)); diff --git a/cmd_debug.c b/cmd_list.c similarity index 57% rename from cmd_debug.c rename to cmd_list.c index 6ff58a9..382153d 100644 --- a/cmd_debug.c +++ b/cmd_list.c @@ -9,187 +9,21 @@ #include "tools-util.h" #include "libbcachefs/bcachefs.h" -#include "libbcachefs/bset.h" #include "libbcachefs/btree_cache.h" #include "libbcachefs/btree_io.h" #include "libbcachefs/btree_iter.h" -#include "libbcachefs/buckets.h" #include "libbcachefs/checksum.h" #include "libbcachefs/error.h" -#include "libbcachefs/journal.h" -#include "libbcachefs/journal_io.h" +#include "libbcachefs/extents.h" #include "libbcachefs/super.h" -static void dump_usage(void) -{ - puts("bcachefs dump - dump filesystem metadata\n" - "Usage: bcachefs dump [OPTION]... 
\n" - "\n" - "Options:\n" - " -o output Output qcow2 image(s)\n" - " -f Force; overwrite when needed\n" - " -h Display this help and exit\n" - "Report bugs to "); -} - -static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) -{ - struct bch_sb *sb = ca->disk_sb.sb; - ranges data; - unsigned i; - int ret; - - darray_init(data); - - /* Superblock: */ - range_add(&data, BCH_SB_LAYOUT_SECTOR << 9, - sizeof(struct bch_sb_layout)); - - for (i = 0; i < sb->layout.nr_superblocks; i++) - range_add(&data, - le64_to_cpu(sb->layout.sb_offset[i]) << 9, - vstruct_bytes(sb)); - - /* Journal: */ - for (i = 0; i < ca->journal.nr; i++) - if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) { - u64 bucket = ca->journal.buckets[i]; - - range_add(&data, - bucket_bytes(ca) * bucket, - bucket_bytes(ca)); - } - - /* Btree: */ - for (i = 0; i < BTREE_ID_NR; i++) { - const struct bch_extent_ptr *ptr; - struct bkey_ptrs_c ptrs; - struct btree_trans trans; - struct btree_iter iter; - struct btree *b; - - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) { - struct btree_node_iter iter; - struct bkey u; - struct bkey_s_c k; - - for_each_btree_node_key_unpack(b, k, &iter, &u) { - ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == ca->dev_idx) - range_add(&data, - ptr->offset << 9, - btree_bytes(c)); - } - } - - if (ret) - die("error %s walking btree nodes", strerror(-ret)); - - b = c->btree_roots[i].b; - if (!btree_node_fake(b)) { - ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == ca->dev_idx) - range_add(&data, - ptr->offset << 9, - btree_bytes(c)); - } - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - } - - qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data, - max_t(unsigned, btree_bytes(c) / 8, block_bytes(c))); - darray_free(data); -} - -int cmd_dump(int argc, char *argv[]) -{ - struct bch_opts opts = 
bch2_opts_empty(); - struct bch_dev *ca; - char *out = NULL; - unsigned i, nr_devices = 0; - bool force = false; - int fd, opt; - - opt_set(opts, nochanges, true); - opt_set(opts, norecovery, true); - opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_continue); - opt_set(opts, fix_errors, FSCK_OPT_NO); - - while ((opt = getopt(argc, argv, "o:fvh")) != -1) - switch (opt) { - case 'o': - out = optarg; - break; - case 'f': - force = true; - break; - case 'v': - opt_set(opts, verbose, true); - break; - case 'h': - dump_usage(); - exit(EXIT_SUCCESS); - } - args_shift(optind); - - if (!out) - die("Please supply output filename"); - - if (!argc) - die("Please supply device(s) to check"); - - struct bch_fs *c = bch2_fs_open(argv, argc, opts); - if (IS_ERR(c)) - die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); - - down_read(&c->gc_lock); - - for_each_online_member(ca, c, i) - nr_devices++; - - BUG_ON(!nr_devices); - - for_each_online_member(ca, c, i) { - int flags = O_WRONLY|O_CREAT|O_TRUNC; - - if (!force) - flags |= O_EXCL; - - if (!c->devs[i]) - continue; - - char *path = nr_devices > 1 - ? 
mprintf("%s.%u", out, i) - : strdup(out); - fd = xopen(path, flags, 0600); - free(path); - - dump_one_device(c, ca, fd); - close(fd); - } - - up_read(&c->gc_lock); - - bch2_fs_stop(c); - return 0; -} - static void list_keys(struct bch_fs *c, enum btree_id btree_id, struct bpos start, struct bpos end) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - char buf[512]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -200,12 +34,15 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, if (bkey_cmp(k.k->p, end) > 0) break; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - puts(buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, k); + puts(buf.buf); } bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + + printbuf_exit(&buf); } static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigned level, @@ -214,7 +51,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne struct btree_trans trans; struct btree_iter iter; struct btree *b; - char buf[4096]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -223,8 +60,9 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne if (bkey_cmp(b->key.k.p, end) > 0) break; - bch2_btree_node_to_text(&PBUF(buf), c, b); - puts(buf); + printbuf_reset(&buf); + bch2_btree_node_to_text(&buf, c, b); + puts(buf.buf); } bch2_trans_iter_exit(&trans, &iter); @@ -232,6 +70,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); + printbuf_exit(&buf); } static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, @@ -240,7 +79,7 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct btree_trans trans; struct btree_iter iter; struct btree *b; - char buf[4096]; + struct printbuf buf = PRINTBUF; int ret; 
bch2_trans_init(&trans, c, 0, 0); @@ -249,8 +88,9 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, if (bkey_cmp(b->key.k.p, end) > 0) break; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); - fputs(buf, stdout); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + fputs(buf.buf, stdout); putchar('\n'); } bch2_trans_iter_exit(&trans, &iter); @@ -259,6 +99,7 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); + printbuf_exit(&buf); } static void print_node_ondisk(struct bch_fs *c, struct btree *b) @@ -268,6 +109,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) struct bch_dev *ca; struct bio *bio; unsigned offset = 0; + int ret; if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { printf("error getting device to read from\n"); @@ -280,17 +122,19 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) return; } - n_ondisk = malloc(btree_bytes(c)); + n_ondisk = aligned_alloc(block_bytes(c), btree_bytes(c)); - bio = bio_alloc_bioset(GFP_NOIO, - buf_pages(n_ondisk, btree_bytes(c)), - &c->btree_bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_READ|REQ_META; + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_ondisk, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOIO, + &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_ondisk, btree_bytes(c)); - submit_bio_wait(bio); + ret = submit_bio_wait(bio); + if (ret) + die("error reading btree node: %i", ret); bio_put(bio); percpu_ref_put(&ca->io_ref); @@ -306,7 +150,8 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) i = &n_ondisk->keys; if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - die("unknown checksum type"); + die("unknown checksum type at offset %u: %llu", + offset, BSET_CSUM_TYPE(i)); nonce = 
btree_nonce(i, offset << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); @@ -326,7 +171,8 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) break; if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - die("unknown checksum type"); + die("unknown checksum type at offset %u: %llu", + offset, BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, offset << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); @@ -347,10 +193,14 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) { struct bkey u; - char buf[4096]; + struct printbuf buf = PRINTBUF; + + printbuf_indent_add(&buf, 4); + + bch2_bkey_val_to_text(&buf, c, bkey_disassemble(b, k, &u)); + fprintf(stdout, "%s\n", buf.buf); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_disassemble(b, k, &u)); - fprintf(stdout, " %s\n", buf); + printbuf_exit(&buf); } } @@ -363,7 +213,7 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned struct btree_trans trans; struct btree_iter iter; struct btree *b; - char buf[4096]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -372,8 +222,9 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned if (bkey_cmp(b->key.k.p, end) > 0) break; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); - fputs(buf, stdout); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + fputs(buf.buf, stdout); putchar('\n'); print_node_ondisk(c, b); @@ -384,6 +235,7 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); + printbuf_exit(&buf); } static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned level, @@ -395,7 +247,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l struct bkey unpacked; struct bkey_s_c k; struct btree 
*b; - char buf[4096]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -404,13 +256,15 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l if (bkey_cmp(b->key.k.p, end) > 0) break; - bch2_btree_node_to_text(&PBUF(buf), c, b); - fputs(buf, stdout); + printbuf_reset(&buf); + bch2_btree_node_to_text(&buf, c, b); + fputs(buf.buf, stdout); for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) { - bch2_bkey_val_to_text(&PBUF(buf), c, k); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, k); putchar('\t'); - puts(buf); + puts(buf.buf); } } bch2_trans_iter_exit(&trans, &iter); @@ -419,6 +273,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); + printbuf_exit(&buf); } static void list_keys_usage(void) @@ -437,7 +292,7 @@ static void list_keys_usage(void) " -f Check (fsck) the filesystem first\n" " -v Verbose mode\n" " -h Display this help and exit\n" - "Report bugs to "); + "Report bugs to "); } #define LIST_MODES() \ @@ -551,70 +406,3 @@ int cmd_list(int argc, char *argv[]) bch2_fs_stop(c); return 0; } - -static void list_journal_usage(void) -{ - puts("bcachefs list_journal - print contents of journal\n" - "Usage: bcachefs list_journal [OPTION]... 
\n" - "\n" - "Options:\n" - " -a Read entire journal, not just dirty entries\n" - " -h Display this help and exit\n" - "Report bugs to "); -} - -int cmd_list_journal(int argc, char *argv[]) -{ - struct bch_opts opts = bch2_opts_empty(); - int opt; - - opt_set(opts, nochanges, true); - opt_set(opts, norecovery, true); - opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_continue); - opt_set(opts, fix_errors, FSCK_OPT_YES); - opt_set(opts, keep_journal, true); - - while ((opt = getopt(argc, argv, "ah")) != -1) - switch (opt) { - case 'a': - opt_set(opts, read_entire_journal, true); - break; - case 'h': - list_journal_usage(); - exit(EXIT_SUCCESS); - } - args_shift(optind); - - if (!argc) - die("Please supply device(s) to open"); - - struct bch_fs *c = bch2_fs_open(argv, argc, opts); - if (IS_ERR(c)) - die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); - - struct journal_replay *p; - struct jset_entry *entry; - - list_for_each_entry(p, &c->journal_entries, list) { - printf("journal entry %8llu\n" - " version %8u\n" - " last seq %8llu\n" - , - le64_to_cpu(p->j.seq), - le32_to_cpu(p->j.version), - le64_to_cpu(p->j.last_seq)); - - vstruct_for_each(&p->j, entry) { - char _buf[4096]; - struct printbuf buf = PBUF(_buf); - - printbuf_indent_push(&buf, 2); - bch2_journal_entry_to_text(&buf, c, entry); - printf("%s\n", _buf); - } - } - - bch2_fs_stop(c); - return 0; -} diff --git a/cmd_list_journal.c b/cmd_list_journal.c new file mode 100644 index 0000000..869d334 --- /dev/null +++ b/cmd_list_journal.c @@ -0,0 +1,246 @@ +#include +#include +#include +#include + +#include "cmds.h" +#include "libbcachefs.h" +#include "qcow2.h" +#include "tools-util.h" + +#include "libbcachefs/bcachefs.h" +#include "libbcachefs/btree_iter.h" +#include "libbcachefs/error.h" +#include "libbcachefs/journal_io.h" +#include "libbcachefs/journal_seq_blacklist.h" +#include "libbcachefs/super.h" + +static void list_journal_usage(void) +{ + puts("bcachefs list_journal - print 
contents of journal\n" + "Usage: bcachefs list_journal [OPTION]... \n" + "\n" + "Options:\n" + " -a Read entire journal, not just dirty entries\n" + " -n Number of journal entries to print, starting from the most recent\n" + " -v Verbose mode\n" + " -h Display this help and exit\n" + "Report bugs to "); +} + +static void star_start_of_lines(char *buf) +{ + char *p = buf; + + if (*p == ' ') + *p = '*'; + + while ((p = strstr(p, "\n "))) + p[1] = '*'; +} + +int cmd_list_journal(int argc, char *argv[]) +{ + struct bch_opts opts = bch2_opts_empty(); + u32 nr_entries = U32_MAX; + int opt; + + opt_set(opts, nochanges, true); + opt_set(opts, norecovery, true); + opt_set(opts, degraded, true); + opt_set(opts, errors, BCH_ON_ERROR_continue); + opt_set(opts, fix_errors, FSCK_OPT_YES); + opt_set(opts, keep_journal, true); + opt_set(opts, read_journal_only,true); + + while ((opt = getopt(argc, argv, "an:vh")) != -1) + switch (opt) { + case 'a': + opt_set(opts, read_entire_journal, true); + break; + case 'n': + nr_entries = kstrtouint(optarg, 10, &nr_entries); + opt_set(opts, read_entire_journal, true); + break; + case 'v': + opt_set(opts, verbose, true); + break; + case 'h': + list_journal_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + if (!argc) + die("Please supply device(s) to open"); + + struct bch_fs *c = bch2_fs_open(argv, argc, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); + + struct journal_replay *p, **_p; + struct genradix_iter iter; + struct jset_entry *entry; + struct printbuf buf = PRINTBUF; + + genradix_for_each(&c->journal_entries, iter, _p) { + p = *_p; + if (!p) + continue; + + if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq)) + continue; + + bool blacklisted = + bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(p->j.seq), false); + + if (blacklisted) + printf("blacklisted "); + + printf("journal entry %llu\n", le64_to_cpu(p->j.seq)); + + printbuf_reset(&buf); + + prt_printf(&buf, + " 
version %u\n" + " last seq %llu\n" + " flush %u\n" + " written at ", + le32_to_cpu(p->j.version), + le64_to_cpu(p->j.last_seq), + !JSET_NO_FLUSH(&p->j)); + bch2_journal_ptrs_to_text(&buf, c, p); + + if (blacklisted) + star_start_of_lines(buf.buf); + printf("%s\n", buf.buf); + + vstruct_for_each(&p->j, entry) { + printbuf_reset(&buf); + + /* + * log entries denote the start of a new transaction + * commit: + */ + if (entry->type == BCH_JSET_ENTRY_log && !entry->level) + prt_newline(&buf); + printbuf_indent_add(&buf, 4); + bch2_journal_entry_to_text(&buf, c, entry); + + if (blacklisted) + star_start_of_lines(buf.buf); + printf("%s\n", buf.buf); + } + } + + printbuf_exit(&buf); + bch2_fs_stop(c); + return 0; +} + +static void kill_btree_node_usage(void) +{ + puts("bcachefs kill_btree_node - make btree nodes unreadable\n" + "Usage: bcachefs kill_btree_node [OPTION]... \n" + "\n" + "Options:\n" + " -b (extents|inodes|dirents|xattrs) Btree to delete from\n" + " -l level Levle to delete from (0 == leaves)\n" + " -i index Index of btree node to kill\n" + " -h Display this help and exit\n" + "Report bugs to "); +} + +int cmd_kill_btree_node(int argc, char *argv[]) +{ + struct bch_opts opts = bch2_opts_empty(); + enum btree_id btree_id = 0; + unsigned level = 0; + u64 node_index = 0; + int opt; + + opt_set(opts, read_only, true); + + while ((opt = getopt(argc, argv, "b:l:i:h")) != -1) + switch (opt) { + case 'b': + btree_id = read_string_list_or_die(optarg, + bch2_btree_ids, "btree id"); + break; + case 'l': + if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH) + die("invalid level"); + break; + case 'i': + if (kstrtoull(optarg, 10, &node_index)) + die("invalid index %s", optarg); + break; + case 'h': + kill_btree_node_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + if (!argc) + die("Please supply device(s)"); + + struct bch_fs *c = bch2_fs_open(argv, argc, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); + + 
struct btree_trans trans; + struct btree_iter iter; + struct btree *b; + int ret; + void *zeroes; + + ret = posix_memalign(&zeroes, c->opts.block_size, c->opts.block_size); + if (ret) + die("error %s from posix_memalign", strerror(ret)); + + bch2_trans_init(&trans, c, 0, 0); + + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) { + if (b->c.level != level) + continue; + + if (!node_index) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); + const struct bch_extent_ptr *ptr; + + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_info(c, "killing btree node %s", buf.buf); + printbuf_exit(&buf); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + ret = pwrite(ca->disk_sb.bdev->bd_fd, zeroes, + c->opts.block_size, ptr->offset << 9); + if (ret != c->opts.block_size) { + bch_err(c, "pwrite error: expected %u got %i %s", + c->opts.block_size, ret, strerror(errno)); + ret = EXIT_FAILURE; + goto done; + } + } + goto done; + } + + node_index--; + } + if (ret) + bch_err(c, "error %i walking btree nodes", ret); + else + bch_err(c, "node at specified index not found"); + ret = EXIT_FAILURE; +done: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + bch2_fs_stop(c); + return ret; +} diff --git a/cmd_migrate.c b/cmd_migrate.c index 4da3ab1..3ba51c0 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -122,7 +122,7 @@ static void update_inode(struct bch_fs *c, struct bkey_inode_buf packed; int ret; - bch2_inode_pack(c, &packed, inode); + bch2_inode_pack(&packed, inode); packed.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, NULL, NULL, 0); @@ -257,7 +257,7 @@ static void write_data(struct bch_fs *c, closure_init_stack(&cl); - bio_init(&op.wbio.bio, bv, ARRAY_SIZE(bv)); + bio_init(&op.wbio.bio, NULL, bv, ARRAY_SIZE(bv), 0); bch2_bio_map(&op.wbio.bio, buf, len); bch2_write_op_init(&op, c, 
bch2_opts_to_inode_opts(c->opts)); @@ -530,7 +530,7 @@ static ranges reserve_new_fs_space(const char *file_path, unsigned block_size, struct fiemap_iter iter; struct fiemap_extent e; - ranges extents = { NULL }; + ranges extents = { 0 }; fiemap_for_each(fd, iter, e) { if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN| @@ -603,7 +603,7 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, update_inode(c, &root_inode); - darray_free(s.extents); + darray_exit(&s.extents); genradix_free(&s.hardlinks); } @@ -613,7 +613,7 @@ static void find_superblock_space(ranges extents, { struct range *i; - darray_foreach(i, extents) { + darray_for_each(extents, i) { u64 start = round_up(max(256ULL << 10, i->start), dev->bucket_size << 9); u64 end = round_down(i->end, @@ -641,7 +641,7 @@ static void migrate_usage(void) " --no_passphrase Don't encrypt master encryption key\n" " -F Force, even if metadata file already exists\n" " -h Display this help and exit\n" - "Report bugs to "); + "Report bugs to "); } static const struct option migrate_opts[] = { @@ -691,7 +691,7 @@ static int migrate_fs(const char *fs_path, u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]); if (format_opts.passphrase) - bch2_add_key(sb, format_opts.passphrase); + bch2_add_key(sb, "user", "user", format_opts.passphrase); free(sb); @@ -799,7 +799,7 @@ static void migrate_superblock_usage(void) " -d device Device to create superblock for\n" " -o offset Offset of existing superblock\n" " -h Display this help and exit\n" - "Report bugs to "); + "Report bugs to "); } int cmd_migrate_superblock(int argc, char *argv[]) diff --git a/cmd_option.c b/cmd_option.c new file mode 100644 index 0000000..86768e5 --- /dev/null +++ b/cmd_option.c @@ -0,0 +1,106 @@ +/* + * Authors: Kent Overstreet + * + * GPLv2 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cmds.h" +#include "libbcachefs.h" +#include 
"libbcachefs/opts.h" +#include "libbcachefs/super-io.h" + +static void set_option_usage(void) +{ + puts("bcachefs set-option \n" + "Usage: bcachefs set-option [OPTION].. device\n" + "\n" + "Options:\n"); + bch2_opts_usage(OPT_MOUNT); + puts(" -h, --help display this help and exit\n" + "Report bugs to "); + exit(EXIT_SUCCESS); +} + +int cmd_set_option(int argc, char *argv[]) +{ + struct bch_opt_strs new_opt_strs = bch2_cmdline_opts_get(&argc, argv, OPT_MOUNT); + struct bch_opts new_opts = bch2_parse_opts(new_opt_strs); + struct bch_opts open_opts = bch2_opts_empty(); + unsigned i; + int opt, ret = 0; + + opt_set(open_opts, nostart, true); + + while ((opt = getopt(argc, argv, "h")) != -1) + switch (opt) { + case 'h': + set_option_usage(); + break; + } + args_shift(optind); + + if (!argc) { + fprintf(stderr, "Please supply device(s)\n"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < argc; i++) + if (dev_mounted(argv[i])) + goto online; + + struct bch_fs *c = bch2_fs_open(argv, argc, open_opts); + if (IS_ERR(c)) { + fprintf(stderr, "error opening %s: %s\n", argv[0], strerror(-PTR_ERR(c))); + exit(EXIT_FAILURE); + } + + for (i = 0; i < bch2_opts_nr; i++) { + u64 v = bch2_opt_get_by_id(&new_opts, i); + + if (!bch2_opt_defined_by_id(&new_opts, i)) + continue; + + ret = bch2_opt_check_may_set(c, i, v); + if (ret < 0) { + fprintf(stderr, "error setting %s: %i\n", + bch2_opt_table[i].attr.name, ret); + break; + } + + bch2_opt_set_sb(c, bch2_opt_table + i, v); + bch2_opt_set_by_id(&c->opts, i, v); + } + + bch2_fs_stop(c); + return ret; +online: + { + unsigned dev_idx; + struct bchfs_handle fs = bchu_fs_open_by_dev(argv[i], &dev_idx); + + for (i = 0; i < bch2_opts_nr; i++) { + if (!new_opt_strs.by_id[i]) + continue; + + char *path = mprintf("options/%s", bch2_opt_table[i].attr.name); + + write_file_str(fs.sysfs_fd, path, new_opt_strs.by_id[i]); + free(path); + } + } + return 0; +} diff --git a/cmds.h b/cmds.h index 52db63f..c18a87f 100644 --- a/cmds.h +++ b/cmds.h @@ -11,6 
+11,7 @@ int cmd_format(int argc, char *argv[]); int cmd_show_super(int argc, char *argv[]); +int cmd_set_option(int argc, char *argv[]); #if 0 int cmd_assemble(int argc, char *argv[]); @@ -45,6 +46,7 @@ int cmd_fsck(int argc, char *argv[]); int cmd_dump(int argc, char *argv[]); int cmd_list(int argc, char *argv[]); int cmd_list_journal(int argc, char *argv[]); +int cmd_kill_btree_node(int argc, char *argv[]); int cmd_migrate(int argc, char *argv[]); int cmd_migrate_superblock(int argc, char *argv[]); diff --git a/crypto.c b/crypto.c index 43753a3..4e4d15a 100644 --- a/crypto.c +++ b/crypto.c @@ -133,10 +133,23 @@ void bch2_passphrase_check(struct bch_sb *sb, const char *passphrase, die("incorrect passphrase"); } -void bch2_add_key(struct bch_sb *sb, const char *passphrase) +void bch2_add_key(struct bch_sb *sb, + const char *type, + const char *keyring_str, + const char *passphrase) { struct bch_key passphrase_key; struct bch_encrypted_key sb_key; + int keyring; + + if (!strcmp(keyring_str, "session")) + keyring = KEY_SPEC_SESSION_KEYRING; + else if (!strcmp(keyring_str, "user")) + keyring = KEY_SPEC_USER_KEYRING; + else if (!strcmp(keyring_str, "user_session")) + keyring = KEY_SPEC_USER_SESSION_KEYRING; + else + die("unknown keyring %s", keyring_str); bch2_passphrase_check(sb, passphrase, &passphrase_key, @@ -147,12 +160,10 @@ void bch2_add_key(struct bch_sb *sb, const char *passphrase) char *description = mprintf("bcachefs:%s", uuid); - if (add_key("logon", description, - &passphrase_key, sizeof(passphrase_key), - KEY_SPEC_USER_KEYRING) < 0 || - add_key("user", description, + if (add_key(type, + description, &passphrase_key, sizeof(passphrase_key), - KEY_SPEC_USER_KEYRING) < 0) + keyring) < 0) die("add_key error: %m"); memzero_explicit(description, strlen(description)); diff --git a/crypto.h b/crypto.h index 7f523c0..baea6d8 100644 --- a/crypto.h +++ b/crypto.h @@ -15,7 +15,7 @@ struct bch_key derive_passphrase(struct bch_sb_field_crypt *, const char *); bool 
bch2_sb_is_encrypted(struct bch_sb *); void bch2_passphrase_check(struct bch_sb *, const char *, struct bch_key *, struct bch_encrypted_key *); -void bch2_add_key(struct bch_sb *, const char *); +void bch2_add_key(struct bch_sb *, const char *, const char *, const char *); void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *, const char *); diff --git a/debian/bcachefs-tools.postinst b/debian/bcachefs-tools.postinst new file mode 100644 index 0000000..483b961 --- /dev/null +++ b/debian/bcachefs-tools.postinst @@ -0,0 +1,12 @@ +#!/bin/sh + +set -e + +case "$1" in + configure) + if which update-initramfs >/dev/null; then + update-initramfs -u + fi + ;; +esac + diff --git a/debian/bcachefs-tools.postrm b/debian/bcachefs-tools.postrm new file mode 100644 index 0000000..6b6fe8a --- /dev/null +++ b/debian/bcachefs-tools.postrm @@ -0,0 +1,12 @@ +#!/bin/sh + +set -e + +case "$1" in + remove) + if which update-initramfs >/dev/null; then + update-initramfs -u + fi + ;; +esac + diff --git a/debian/changelog b/debian/changelog index 3cb0882..a5c17b2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +bcachefs-tools (23-1) unstable; urgency=medium + + * New upstream release + * Update standards version to 4.6.1 + + -- Jonathan Carter Mon, 31 Oct 2022 11:45:25 +0200 + bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium * New upstream snapshot diff --git a/debian/control b/debian/control index 3a9d3aa..0ece553 100644 --- a/debian/control +++ b/debian/control @@ -2,7 +2,7 @@ Source: bcachefs-tools Maintainer: Jonathan Carter Section: utils Priority: optional -Standards-Version: 4.6.0 +Standards-Version: 4.6.1 Rules-Requires-Root: no Build-Depends: debhelper-compat (= 13), pkg-config, diff --git a/debian/files b/debian/files index 2ea4bfc..1af54c8 100644 --- a/debian/files +++ b/debian/files @@ -1 +1 @@ -bcachefs-tools_0.1+git20220216.a1e928a-1_source.buildinfo utils optional +bcachefs-tools_23-1_source.buildinfo utils optional 
diff --git a/default.nix b/default.nix index 48f2aa9..a693194 100644 --- a/default.nix +++ b/default.nix @@ -1,6 +1,5 @@ { lib -, filter - +, doCheck ? true , stdenv , pkg-config , attr @@ -20,8 +19,7 @@ , docutils , nixosTests -, lastModified -, versionString ? lastModified +, versionString ? "0.1" , inShell ? false , debugMode ? inShell @@ -39,20 +37,8 @@ stdenv.mkDerivation { version = "v0.1-flake-${versionString}"; VERSION = "v0.1-flake-${versionString}"; - - src = filter.filter { - name = "bcachefs-tools"; - root = ./.; - exclude = [ - ./rust-src - - ./.git - ./nix - - ./flake.nix - ./flake.lock - ]; - }; + + src = (lib.cleanSource (builtins.path { name = "bcachefs-tools-src"; path = ./. ;} )); postPatch = "patchShebangs --build doc/macro2rst.py"; @@ -95,7 +81,7 @@ stdenv.mkDerivation { "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools" ]; - doCheck = true; # needs bcachefs module loaded on builder + doCheck = doCheck; # needs bcachefs module loaded on builder checkInputs = [ python39Packages.pytest @@ -116,7 +102,7 @@ stdenv.mkDerivation { rm tests/test_fuse.py ''; - dontStrip = debugMode == true; + dontStrip = debugMode; passthru = { bcachefs_revision = let file = builtins.readFile ./.bcachefs_revision; diff --git a/include/linux/bio.h b/include/linux/bio.h index cdbbcb3..0ad5a87 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -212,23 +212,19 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, struct bio_set { unsigned int front_pad; + unsigned int back_pad; + mempool_t bio_pool; + mempool_t bvec_pool; }; -static inline void bioset_exit(struct bio_set *bs) {} static inline void bioset_free(struct bio_set *bs) { kfree(bs); } -static inline int bioset_init(struct bio_set *bs, - unsigned pool_size, - unsigned front_pad, - int flags) -{ - bs->front_pad = front_pad; - return 0; -} +void bioset_exit(struct bio_set *); +int bioset_init(struct bio_set *, unsigned, unsigned, int); extern struct bio_set *bioset_create(unsigned 
int, unsigned int); extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); @@ -237,31 +233,22 @@ enum { BIOSET_NEED_RESCUER = 1 << 1, }; -extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); +struct bio *bio_alloc_bioset(struct block_device *, unsigned, + unsigned, gfp_t, struct bio_set *); extern void bio_put(struct bio *); int bio_add_page(struct bio *, struct page *, unsigned, unsigned); -extern void __bio_clone_fast(struct bio *, struct bio *); -extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); -extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); - -static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -} - -static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, NULL); +struct bio *bio_alloc_clone(struct block_device *, struct bio *, + gfp_t, struct bio_set *); -} +struct bio *bio_kmalloc(unsigned int, gfp_t); extern void bio_endio(struct bio *); extern void bio_advance(struct bio *, unsigned); -extern void bio_reset(struct bio *); +extern void bio_reset(struct bio *, struct block_device *, unsigned); void bio_chain(struct bio *, struct bio *); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, @@ -428,20 +415,15 @@ static inline void bio_inc_remaining(struct bio *bio) atomic_inc(&bio->__bi_remaining); } -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -} - -static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, NULL); -} - -static inline void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs) +static inline void bio_init(struct bio *bio, + struct block_device *bdev, + struct bio_vec *table, + unsigned short max_vecs, + unsigned int opf) { 
memset(bio, 0, sizeof(*bio)); + bio->bi_bdev = bdev; + bio->bi_opf = opf; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2fe736e..62a3f40 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -137,6 +137,11 @@ static inline unsigned long hweight64(u64 w) __builtin_popcount(w >> 32); } +static inline unsigned long hweight32(u32 w) +{ + return __builtin_popcount(w); +} + static inline unsigned long hweight8(unsigned long w) { return __builtin_popcountl(w); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index be736c8..22bae25 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -40,6 +40,7 @@ struct block_device { struct gendisk __bd_disk; int bd_fd; int bd_sync_fd; + int bd_buffered_fd; }; #define bdev_kobj(_bdev) (&((_bdev)->kobj)) @@ -65,6 +66,8 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_AGAIN ((__force blk_status_t)12) +#define BIO_INLINE_VECS 4 + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4ce43b5..01b3d4a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -69,8 +69,7 @@ static inline void submit_bio(struct bio *bio) generic_make_request(bio); } -int blkdev_issue_discard(struct block_device *, sector_t, - sector_t, gfp_t, unsigned long); +int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t); #define bdev_get_queue(bdev) (&((bdev)->queue)) @@ -85,7 +84,7 @@ int blkdev_issue_discard(struct block_device *, sector_t, #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) #define SECTOR_MASK (PAGE_SECTORS - 1) -#define blk_queue_discard(q) ((void) (q), 0) +#define bdev_max_discard_sectors(bdev) ((void) (bdev), 0) #define blk_queue_nonrot(q) ((void) (q), 0) unsigned bdev_logical_block_size(struct block_device *bdev); diff --git a/include/linux/bug.h b/include/linux/bug.h 
index 77260f3..1a10f7e 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -2,6 +2,7 @@ #define __TOOLS_LINUX_BUG_H #include +#include #include #ifdef CONFIG_VALGRIND @@ -17,7 +18,7 @@ #define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)])) -#define BUG() do { assert(0); unreachable(); } while (0) +#define BUG() do { fflush(stdout); assert(0); unreachable(); } while (0) #define BUG_ON(cond) assert(!(cond)) #define WARN(cond, fmt, ...) \ diff --git a/include/linux/errname.h b/include/linux/errname.h new file mode 100644 index 0000000..443d504 --- /dev/null +++ b/include/linux/errname.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_ERRNAME_H +#define _LINUX_ERRNAME_H + +#include + +static inline const char *errname(int err) +{ + return strerror(abs(err)); +} + +#endif /* _LINUX_ERRNAME_H */ diff --git a/include/linux/freezer.h b/include/linux/freezer.h index a29d156..cf485d7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -4,6 +4,7 @@ #define try_to_freeze() #define set_freezable() #define freezing(task) false -#define freezable_schedule_timeout(_t) schedule_timeout(_t); +#define freezable_schedule() schedule() +#define freezable_schedule_timeout(_t) schedule_timeout(_t) #endif /* __TOOLS_LINUX_FREEZER_H */ diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index f09689d..c74b737 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -2,7 +2,7 @@ #define _LINUX_GENERIC_RADIX_TREE_H /** - * DOC: Generic radix trees/sparse arrays: + * DOC: Generic radix trees/sparse arrays * * Very simple and minimalistic, supporting arbitrary size entries up to * PAGE_SIZE. 
@@ -38,13 +38,15 @@ #include #include -#include +#include #include +#include +#include struct genradix_root; struct __genradix { - struct genradix_root __rcu *root; + struct genradix_root *root; }; /* @@ -115,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) +#define __genradix_objs_per_page(_radix) \ + (PAGE_SIZE / sizeof((_radix)->type[0])) +#define __genradix_page_remainder(_radix) \ + (PAGE_SIZE % sizeof((_radix)->type[0])) + #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) @@ -178,14 +185,30 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); #define genradix_iter_peek(_iter, _radix) \ (__genradix_cast(_radix) \ __genradix_iter_peek(_iter, &(_radix)->tree, \ - PAGE_SIZE / __genradix_obj_size(_radix))) + __genradix_objs_per_page(_radix))) + +void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, + size_t, size_t); + +/** + * genradix_iter_peek - get first entry at or below iterator's current + * position + * @_iter: a genradix_iter + * @_radix: genradix being iterated over + * + * If no more entries exist at or below @_iter's current position, returns NULL + */ +#define genradix_iter_peek_prev(_iter, _radix) \ + (__genradix_cast(_radix) \ + __genradix_iter_peek_prev(_iter, &(_radix)->tree, \ + __genradix_objs_per_page(_radix), \ + __genradix_obj_size(_radix) + \ + __genradix_page_remainder(_radix))) static inline void __genradix_iter_advance(struct genradix_iter *iter, size_t obj_size) { - size_t new_offset = iter->offset + obj_size; - - if (new_offset < iter->offset) { + if (iter->offset + obj_size < iter->offset) { iter->offset = SIZE_MAX; iter->pos = SIZE_MAX; return; @@ -203,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, #define genradix_iter_advance(_iter, _radix) \ 
__genradix_iter_advance(_iter, __genradix_obj_size(_radix)) +static inline void __genradix_iter_rewind(struct genradix_iter *iter, + size_t obj_size) +{ + if (iter->offset == 0 || + iter->offset == SIZE_MAX) { + iter->offset = SIZE_MAX; + return; + } + + if ((iter->offset & (PAGE_SIZE - 1)) == 0) + iter->offset -= PAGE_SIZE % obj_size; + + iter->offset -= obj_size; + iter->pos--; +} + +#define genradix_iter_rewind(_iter, _radix) \ + __genradix_iter_rewind(_iter, __genradix_obj_size(_radix)) + #define genradix_for_each_from(_radix, _iter, _p, _start) \ for (_iter = genradix_iter_init(_radix, _start); \ (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \ @@ -220,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, #define genradix_for_each(_radix, _iter, _p) \ genradix_for_each_from(_radix, _iter, _p, 0) +#define genradix_last_pos(_radix) \ + (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) + +/** + * genradix_for_each_reverse - iterate over entry in a genradix, reverse order + * @_radix: genradix to iterate over + * @_iter: a genradix_iter to track current position + * @_p: pointer to genradix entry type + * + * On every iteration, @_p will point to the current entry, and @_iter.pos + * will be the current entry's index. 
+ */ +#define genradix_for_each_reverse(_radix, _iter, _p) \ + for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\ + (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\ + genradix_iter_rewind(&_iter, _radix)) + int __genradix_prealloc(struct __genradix *, size_t, gfp_t); /** diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fe92826..4fd3b68 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -43,6 +43,8 @@ (time_after_eq64(a, b) && \ time_before_eq64(a, c)) +#define time_is_before_jiffies(a) time_after(jiffies, a) + #define HZ 1000 static inline u64 jiffies_to_nsecs(const unsigned long j) @@ -79,6 +81,11 @@ static inline u64 local_clock(void) return sched_clock(); } +static inline u64 ktime_get_ns(void) +{ + return sched_clock(); +} + #define jiffies nsecs_to_jiffies(sched_clock()) #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 30451cb..d31b5f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -228,6 +228,17 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 * return kstrtoint(s, base, res); } +struct printbuf; +extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args); +extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...); + +static const char hex_asc[] = "0123456789abcdef"; +#define hex_asc_lo(x) hex_asc[((x) & 0x0f)] +#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] +static const char hex_asc_upper[] = "0123456789ABCDEF"; +#define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)] +#define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4] + /* The hash is always the low bits of hash_len */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define HASH_LEN_DECLARE u32 hash; u32 len diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h new file mode 100644 index 0000000..6a3cd1b --- /dev/null +++ b/include/linux/kmemleak.h @@ -0,0 +1,121 @@ +/* 
SPDX-License-Identifier: GPL-2.0-only */ +/* + * include/linux/kmemleak.h + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas + */ + +#ifndef __KMEMLEAK_H +#define __KMEMLEAK_H + +#include +#include + +#ifdef CONFIG_DEBUG_KMEMLEAK + +extern void kmemleak_init(void) __init; +extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) __ref; +extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) __ref; +extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) __ref; +extern void kmemleak_free(const void *ptr) __ref; +extern void kmemleak_free_part(const void *ptr, size_t size) __ref; +extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; +extern void kmemleak_update_trace(const void *ptr) __ref; +extern void kmemleak_not_leak(const void *ptr) __ref; +extern void kmemleak_ignore(const void *ptr) __ref; +extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; +extern void kmemleak_no_scan(const void *ptr) __ref; +extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, + gfp_t gfp) __ref; +extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref; +extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; + +static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, + int min_count, slab_flags_t flags, + gfp_t gfp) +{ + if (!(flags & SLAB_NOLEAKTRACE)) + kmemleak_alloc(ptr, size, min_count, gfp); +} + +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) +{ + if (!(flags & SLAB_NOLEAKTRACE)) + kmemleak_free(ptr); +} + +static inline void kmemleak_erase(void **ptr) +{ + *ptr = NULL; +} + +#else + +static inline void kmemleak_init(void) +{ +} +static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) +{ +} +static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, + int min_count, slab_flags_t flags, + gfp_t gfp) +{ +} 
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) +{ +} +static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) +{ +} +static inline void kmemleak_free(const void *ptr) +{ +} +static inline void kmemleak_free_part(const void *ptr, size_t size) +{ +} +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) +{ +} +static inline void kmemleak_free_percpu(const void __percpu *ptr) +{ +} +static inline void kmemleak_update_trace(const void *ptr) +{ +} +static inline void kmemleak_not_leak(const void *ptr) +{ +} +static inline void kmemleak_ignore(const void *ptr) +{ +} +static inline void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) +{ +} +static inline void kmemleak_erase(void **ptr) +{ +} +static inline void kmemleak_no_scan(const void *ptr) +{ +} +static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, + gfp_t gfp) +{ +} +static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) +{ +} +static inline void kmemleak_ignore_phys(phys_addr_t phys) +{ +} + +#endif /* CONFIG_DEBUG_KMEMLEAK */ + +#endif /* __KMEMLEAK_H */ diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c7362d6..c33b212 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -29,7 +29,7 @@ struct kset; struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; - struct attribute **default_attrs; + const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); }; @@ -48,7 +48,7 @@ struct kobj_attribute { struct kobject { struct kobject *parent; struct kset *kset; - struct kobj_type *ktype; + const struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ atomic_t ref; unsigned int state_initialized:1; @@ -64,7 +64,7 @@ struct kset { #define kobject_add(...) 
0 -static inline void kobject_init(struct kobject *kobj, struct kobj_type *ktype) +static inline void kobject_init(struct kobject *kobj, const struct kobj_type *ktype) { memset(kobj, 0, sizeof(*kobj)); @@ -77,7 +77,7 @@ static inline void kobject_del(struct kobject *kobj); static inline void kobject_cleanup(struct kobject *kobj) { - struct kobj_type *t = kobj->ktype; + const struct kobj_type *t = kobj->ktype; /* remove from sysfs if the caller did not do it */ if (kobj->state_in_sysfs) diff --git a/include/linux/list.h b/include/linux/list.h index 3639dc9..dcc4745 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -10,6 +10,7 @@ #define list_add(n, h) cds_list_add(n, h) #define list_add_tail(n, h) cds_list_add_tail(n, h) #define __list_del_entry(l) cds_list_del(l) +#define __list_del(p, n) __cds_list_del(p, n) #define list_del(l) cds_list_del(l) #define list_del_init(l) cds_list_del_init(l) #define list_replace(o, n) cds_list_replace(o, n) diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h new file mode 100644 index 0000000..3d62abe --- /dev/null +++ b/include/linux/mean_and_variance.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MEAN_AND_VARIANCE_H_ +#define MEAN_AND_VARIANCE_H_ + +#include +#include +#include +#include + +#define SQRT_U64_MAX 4294967295ULL + + +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) + +typedef unsigned __int128 u128; + +static inline u128 u64_to_u128(u64 a) +{ + return (u128)a; +} + +static inline u64 u128_to_u64(u128 a) +{ + return (u64)a; +} + +static inline u64 u128_shr64_to_u64(u128 a) +{ + return (u64)(a >> 64); +} + +static inline u128 u128_add(u128 a, u128 b) +{ + return a + b; +} + +static inline u128 u128_sub(u128 a, u128 b) +{ + return a - b; +} + +static inline u128 u128_shl(u128 i, s8 shift) +{ + return i << shift; +} + +static inline u128 u128_shl64_add(u64 a, u64 b) +{ + return ((u128)a << 64) + b; +} + +static inline u128 
u128_square(u64 i) +{ + return i*i; +} + +#else + +typedef struct { + u64 hi, lo; +} u128; + +static inline u128 u64_to_u128(u64 a) +{ + return (u128){ .lo = a }; +} + +static inline u64 u128_to_u64(u128 a) +{ + return a.lo; +} + +static inline u64 u128_shr64_to_u64(u128 a) +{ + return a.hi; +} + +static inline u128 u128_add(u128 a, u128 b) +{ + u128 c; + + c.lo = a.lo + b.lo; + c.hi = a.hi + b.hi + (c.lo < a.lo); + return c; +} + +static inline u128 u128_sub(u128 a, u128 b) +{ + u128 c; + + c.lo = a.lo - b.lo; + c.hi = a.hi - b.hi - (c.lo > a.lo); + return c; +} + +static inline u128 u128_shl(u128 i, s8 shift) +{ + u128 r; + + r.lo = i.lo << shift; + if (shift < 64) + r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); + else { + r.hi = i.lo << (shift - 64); + r.lo = 0; + } + return r; +} + +static inline u128 u128_shl64_add(u64 a, u64 b) +{ + return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b)); +} + +static inline u128 u128_square(u64 i) +{ + u128 r; + u64 h = i >> 32, l = i & (u64)U32_MAX; + + r = u128_shl(u64_to_u128(h*h), 64); + r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); + r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); + r = u128_add(r, u64_to_u128(l*l)); + return r; +} + +#endif + +static inline u128 u128_div(u128 n, u64 d) +{ + u128 r; + u64 rem; + u64 hi = u128_shr64_to_u64(n); + u64 lo = u128_to_u64(n); + u64 h = hi & ((u64)U32_MAX << 32); + u64 l = (hi & (u64)U32_MAX) << 32; + + r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); + r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); + r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); + return r; +} + +struct mean_and_variance { + s64 n; + s64 sum; + u128 sum_squares; +}; + +/* expontentially weighted variant */ +struct mean_and_variance_weighted { + bool init; + u8 w; + s64 mean; + u64 variance; +}; + +inline s64 fast_divpow2(s64 n, u8 d); + +struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1); + 
s64 mean_and_variance_get_mean(struct mean_and_variance s); + u64 mean_and_variance_get_variance(struct mean_and_variance s1); + u32 mean_and_variance_get_stddev(struct mean_and_variance s); + +struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1); + s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); + u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); + u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VAIRANCE_H_ diff --git a/include/linux/mm.h b/include/linux/mm.h new file mode 100644 index 0000000..4bf80ba --- /dev/null +++ b/include/linux/mm.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_MM_H +#define _TOOLS_LINUX_MM_H + +#include + +struct sysinfo { + long uptime; /* Seconds since boot */ + unsigned long loads[3]; /* 1, 5, and 15 minute load averages */ + unsigned long totalram; /* Total usable main memory size */ + unsigned long freeram; /* Available memory size */ + unsigned long sharedram; /* Amount of shared memory */ + unsigned long bufferram; /* Memory used by buffers */ + unsigned long totalswap; /* Total swap space size */ + unsigned long freeswap; /* swap space still available */ + __u16 procs; /* Number of current processes */ + __u16 pad; /* Explicit padding for m68k */ + unsigned long totalhigh; /* Total high memory size */ + unsigned long freehigh; /* Available high memory size */ + __u32 mem_unit; /* Memory unit size in bytes */ +}; + +extern void si_meminfo(struct sysinfo * val); + +#endif /* _TOOLS_LINUX_MM_H */ diff --git a/include/linux/prandom.h b/include/linux/prandom.h new file mode 100644 index 0000000..6f177cd --- /dev/null +++ b/include/linux/prandom.h @@ -0,0 +1,27 @@ +#ifndef _LINUX_PRANDOM_H +#define _LINUX_PRANDOM_H + +#include + +static inline void prandom_bytes(void *buf, int nbytes) +{ + return get_random_bytes(buf, nbytes); +} 
+ +#define prandom_type(type) \ +static inline type prandom_##type(void) \ +{ \ + type v; \ + \ + prandom_bytes(&v, sizeof(v)); \ + return v; \ +} + +prandom_type(int); +prandom_type(long); +prandom_type(u32); +prandom_type(u64); +#undef prandom_type + +#endif /* _LINUX_PRANDOM_H */ + diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h index 13cb826..b14fbe9 100644 --- a/include/linux/prefetch.h +++ b/include/linux/prefetch.h @@ -4,4 +4,7 @@ #define prefetch(p) \ ({ __maybe_unused typeof(p) __var = (p); }) +#define prefetchw(p) \ + ({ __maybe_unused typeof(p) __var = (p); }) + #endif /* _LINUX_PREFETCH_H */ diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h new file mode 100644 index 0000000..f39d8ed --- /dev/null +++ b/include/linux/pretty-printers.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* Copyright (C) 2022 Kent Overstreet */ + +#ifndef _LINUX_PRETTY_PRINTERS_H +#define _LINUX_PRETTY_PRINTERS_H + +void prt_string_option(struct printbuf *, const char * const[], size_t); +void prt_bitflags(struct printbuf *, const char * const[], u64); + +#endif /* _LINUX_PRETTY_PRINTERS_H */ diff --git a/include/linux/printbuf.h b/include/linux/printbuf.h new file mode 100644 index 0000000..24e62e5 --- /dev/null +++ b/include/linux/printbuf.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* Copyright (C) 2022 Kent Overstreet */ + +#ifndef _LINUX_PRINTBUF_H +#define _LINUX_PRINTBUF_H + +/* + * Printbufs: Simple strings for printing to, with optional heap allocation + * + * This code has provisions for use in userspace, to aid in making other code + * portable between kernelspace and userspace. 
+ * + * Basic example: + * struct printbuf buf = PRINTBUF; + * + * prt_printf(&buf, "foo="); + * foo_to_text(&buf, foo); + * printk("%s", buf.buf); + * printbuf_exit(&buf); + * + * Or + * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size); + * + * We can now write pretty printers instead of writing code that dumps + * everything to the kernel log buffer, and then those pretty-printers can be + * used by other code that outputs to kernel log, sysfs, debugfs, etc. + * + * Memory allocation: Outputting to a printbuf may allocate memory. This + * allocation is done with GFP_KERNEL, by default: use the newer + * memalloc_*_(save|restore) functions as needed. + * + * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations + * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. + * + * It's allowed to grab the output buffer and free it later with kfree() instead + * of using printbuf_exit(), if the user just needs a heap allocated string at + * the end. + * + * Memory allocation failures: We don't return errors directly, because on + * memory allocation failure we usually don't want to bail out and unwind - we + * want to print what we've got, on a best-effort basis. But code that does want + * to return -ENOMEM may check printbuf.allocation_failure. + * + * Indenting, tabstops: + * + * To aid in writing multi-line pretty printers spread across multiple + * functions, printbufs track the current indent level. + * + * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent + * level, respectively. + * + * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from + * start of line. Once set, prt_tab() will output spaces up to the next tabstop. + * prt_tab_rjust() will also advance the current line of text up to the next + * tabstop, but it does so by shifting text since the previous tabstop up to the + * next tabstop - right justifying it.
+ * + * Make sure you use prt_newline() instead of \n in the format string for indent + * level and tabstops to work correctly. + * + * Output units: printbuf->units exists to tell pretty-printers how to output + * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as + * human readable bytes. prt_units_u64() and prt_units_s64() obey it. + */ + +#include +#include + +enum printbuf_si { + PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ + PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ +}; + +#define PRINTBUF_INLINE_TABSTOPS 4 + +struct printbuf { + char *buf; + unsigned size; + unsigned pos; + unsigned last_newline; + unsigned last_field; + unsigned indent; + /* + * If nonzero, allocations will be done with GFP_NOWAIT: + */ + u8 atomic; + bool allocation_failure:1; + bool heap_allocated:1; + enum printbuf_si si_units:1; + bool human_readable_units:1; + bool has_indent_or_tabstops:1; + bool suppress_indent_tabstop_handling:1; + u8 nr_tabstops; + + /* + * Do not modify directly: use printbuf_tabstop_push(), + * printbuf_tabstop_pop() + */ + u8 cur_tabstop; + u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; +}; + +int printbuf_make_room(struct printbuf *, unsigned); +const char *printbuf_str(const struct printbuf *); +void printbuf_exit(struct printbuf *); + +void printbuf_tabstops_reset(struct printbuf *); +void printbuf_tabstop_pop(struct printbuf *); +int printbuf_tabstop_push(struct printbuf *, unsigned); + +void printbuf_indent_add(struct printbuf *, unsigned); +void printbuf_indent_sub(struct printbuf *, unsigned); + +void prt_newline(struct printbuf *); +void prt_tab(struct printbuf *); +void prt_tab_rjust(struct printbuf *); + +void prt_bytes_indented(struct printbuf *, const char *, unsigned); +void prt_human_readable_u64(struct printbuf *, u64); +void prt_human_readable_s64(struct printbuf *, s64); +void prt_units_u64(struct printbuf *, u64); +void prt_units_s64(struct printbuf *, s64); + +/* Initializer for a heap allocated printbuf: */ +#define PRINTBUF
((struct printbuf) { .heap_allocated = true }) + +/* Initializer for a printbuf that points to an external buffer: */ +#define PRINTBUF_EXTERN(_buf, _size) \ +((struct printbuf) { \ + .buf = _buf, \ + .size = _size, \ +}) + +/* + * Returns size remaining of output buffer: + */ +static inline unsigned printbuf_remaining_size(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos : 0; +} + +/* + * Returns number of characters we can print to the output buffer - i.e. + * excluding the terminating nul: + */ +static inline unsigned printbuf_remaining(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos - 1 : 0; +} + +static inline unsigned printbuf_written(struct printbuf *out) +{ + return out->size ? min(out->pos, out->size - 1) : 0; +} + +/* + * Returns true if output was truncated: + */ +static inline bool printbuf_overflowed(struct printbuf *out) +{ + return out->pos >= out->size; +} + +static inline void printbuf_nul_terminate(struct printbuf *out) +{ + printbuf_make_room(out, 1); + + if (out->pos < out->size) + out->buf[out->pos] = 0; + else if (out->size) + out->buf[out->size - 1] = 0; +} + +/* Doesn't call printbuf_make_room(), doesn't nul terminate: */ +static inline void __prt_char_reserved(struct printbuf *out, char c) +{ + if (printbuf_remaining(out)) + out->buf[out->pos] = c; + out->pos++; +} + +/* Doesn't nul terminate: */ +static inline void __prt_char(struct printbuf *out, char c) +{ + printbuf_make_room(out, 1); + __prt_char_reserved(out, c); +} + +static inline void prt_char(struct printbuf *out, char c) +{ + __prt_char(out, c); + printbuf_nul_terminate(out); +} + +static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) +{ + unsigned i, can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = c; + out->pos += n - can_print; +} + +static inline void prt_chars(struct printbuf *out, char c, unsigned n) +{ + printbuf_make_room(out, n); +
__prt_chars_reserved(out, c, n); + printbuf_nul_terminate(out); +} + +static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) +{ + unsigned i, can_print; + + printbuf_make_room(out, n); + + can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = ((char *) b)[i]; + out->pos += n - can_print; + + printbuf_nul_terminate(out); +} + +static inline void prt_str(struct printbuf *out, const char *str) +{ + prt_bytes(out, str, strlen(str)); +} + +static inline void prt_str_indented(struct printbuf *out, const char *str) +{ + prt_bytes_indented(out, str, strlen(str)); +} + +static inline void prt_hex_byte(struct printbuf *out, u8 byte) +{ + printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_hi(byte)); + __prt_char_reserved(out, hex_asc_lo(byte)); + printbuf_nul_terminate(out); +} + +static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) +{ + printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_upper_hi(byte)); + __prt_char_reserved(out, hex_asc_upper_lo(byte)); + printbuf_nul_terminate(out); +} + +/** + * printbuf_reset - re-use a printbuf without freeing and re-initializing it: + */ +static inline void printbuf_reset(struct printbuf *buf) +{ + buf->pos = 0; + buf->allocation_failure = 0; + buf->indent = 0; + buf->nr_tabstops = 0; + buf->cur_tabstop = 0; +} + +/** + * printbuf_atomic_inc - mark as entering an atomic section + */ +static inline void printbuf_atomic_inc(struct printbuf *buf) +{ + buf->atomic++; +} + +/** + * printbuf_atomic_dec - mark as leaving an atomic section + */ +static inline void printbuf_atomic_dec(struct printbuf *buf) +{ + buf->atomic--; +} + +/* + * This is used for the %pf(%p) sprintf format extension, where we pass a pretty + * printer and arguments to the pretty-printer to sprintf + * + * Instead of passing a pretty-printer function to sprintf directly, we pass it + * a pointer to a struct call_pp, so that sprintf can check that the magic
+ * number is present, which in turn ensures that the CALL_PP() macro has been + * used in order to typecheck the arguments to the pretty printer function + * + * Example usage: + * sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev)); + */ +struct call_pp { + unsigned long magic; + void *fn; +}; + +#define PP_TYPECHECK(fn, ...) \ + ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); }) + +#define CALL_PP_MAGIC (unsigned long) 0xce0b92d22f6b6be4 + +#define CALL_PP(fn, ...) \ + (PP_TYPECHECK(fn, ##__VA_ARGS__), \ + &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__ + +#endif /* _LINUX_PRINTBUF_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index bc1619f..df9c192 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -5,6 +5,7 @@ #define pr_fmt(fmt) fmt #endif +#include #include #include @@ -169,7 +170,6 @@ static inline int scnprintf(char * buf, size_t size, const char * fmt, ...) * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case */ -#ifdef CONFIG_PRINTK #define printk_ratelimited(fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ @@ -179,10 +179,6 @@ static inline int scnprintf(char * buf, size_t size, const char * fmt, ...) if (__ratelimit(&_rs)) \ printk(fmt, ##__VA_ARGS__); \ }) -#else -#define printk_ratelimited(fmt, ...) \ - no_printk(fmt, ##__VA_ARGS__) -#endif #define pr_emerg_ratelimited(fmt, ...) 
\ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) diff --git a/include/linux/random.h b/include/linux/random.h index 28c595a..ea101d5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -29,11 +29,6 @@ static inline void get_random_bytes(void *buf, int nbytes) BUG_ON(getrandom(buf, nbytes, 0) != nbytes); } -static inline void prandom_bytes(void *buf, int nbytes) -{ - return get_random_bytes(buf, nbytes); -} - #define get_random_type(type) \ static inline type get_random_##type(void) \ { \ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9d70e6e..f851d6a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -19,6 +19,7 @@ static inline void init_rwsem(struct rw_semaphore *lock) } #define down_read(l) pthread_rwlock_rdlock(&(l)->lock) +#define down_read_killable(l) (pthread_rwlock_rdlock(&(l)->lock), 0) #define down_read_trylock(l) (!pthread_rwlock_tryrdlock(&(l)->lock)) #define up_read(l) pthread_rwlock_unlock(&(l)->lock) diff --git a/include/linux/sched.h b/include/linux/sched.h index 48d20e2..ac6d27b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #define TASK_RUNNING 0 @@ -88,6 +89,10 @@ struct task_struct { pid_t pid; struct bio_list *bio_list; + + struct signal_struct { + struct rw_semaphore exec_update_lock; + } *signal, _signal; }; extern __thread struct task_struct *current; @@ -157,4 +162,11 @@ static inline void ktime_get_coarse_real_ts64(struct timespec64 *ts) #define current_kernel_time64() current_kernel_time() #define CURRENT_TIME (current_kernel_time()) +static inline unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + return 0; +} + #endif /* __TOOLS_LINUX_SCHED_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 626b768..ebbab7a 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -11,20 +11,22 @@ 
struct shrink_control { #define SHRINK_STOP (~0UL) +struct printbuf; struct shrinker { unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); + void (*to_text)(struct printbuf *, struct shrinker *); int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ struct list_head list; }; -int register_shrinker(struct shrinker *); +int register_shrinker(struct shrinker *, const char *, ...); void unregister_shrinker(struct shrinker *); -void run_shrinkers(void); +void run_shrinkers(gfp_t gfp_mask, bool); #endif /* __TOOLS_LINUX_SHRINKER_H */ diff --git a/include/linux/six.h b/include/linux/six.h index 477c33e..362a577 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -59,7 +59,6 @@ */ #include -#include #include #include @@ -105,18 +104,25 @@ enum six_lock_type { struct six_lock { union six_lock_state state; - unsigned intent_lock_recurse; struct task_struct *owner; - struct optimistic_spin_queue osq; unsigned __percpu *readers; - + unsigned intent_lock_recurse; + unsigned long ip; raw_spinlock_t wait_lock; - struct list_head wait_list[2]; + struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; + enum six_lock_type lock_want; + bool lock_acquired; + u64 start_time; +}; + typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); static __always_inline void __six_lock_init(struct six_lock *lock, @@ -125,8 +131,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock, { atomic64_set(&lock->state.counter, 0); raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); + INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *) lock, sizeof(*lock)); 
lockdep_init_map(&lock->dep_map, name, key, 0); @@ -146,6 +151,8 @@ do { \ bool six_trylock_##type(struct six_lock *); \ bool six_relock_##type(struct six_lock *, u32); \ int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ +int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \ + six_lock_should_sleep_fn, void *); \ void six_unlock_##type(struct six_lock *); __SIX_LOCK(read) @@ -182,6 +189,13 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); } +static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p); +} + static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) { SIX_LOCK_DISPATCH(type, six_unlock, lock); @@ -196,8 +210,13 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); -void six_lock_pcpu_free_rcu(struct six_lock *); void six_lock_pcpu_free(struct six_lock *); void six_lock_pcpu_alloc(struct six_lock *); +struct six_lock_count { + unsigned n[3]; +}; + +struct six_lock_count six_lock_counts(struct six_lock *); + #endif /* _LINUX_SIX_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index bc99973..17fe235 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -7,10 +7,14 @@ #include #include +#include #include #include #include +#include +#include + #define ARCH_KMALLOC_MINALIGN 16 #define KMALLOC_MAX_SIZE SIZE_MAX @@ -20,7 +24,7 @@ static inline void *kmalloc(size_t size, gfp_t flags) void *p; do { - run_shrinkers(); + run_shrinkers(flags, i != 0); if (size) { size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE); @@ -58,6 +62,16 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) return new; } 
+static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + + return krealloc(p, bytes, flags); +} + #define kzalloc(size, flags) kmalloc(size, flags|__GFP_ZERO) #define kmalloc_array(n, size, flags) \ ((size) != 0 && (n) > SIZE_MAX / (size) \ @@ -83,7 +97,7 @@ static inline struct page *alloc_pages(gfp_t flags, unsigned int order) void *p; do { - run_shrinkers(); + run_shrinkers(flags, i != 0); p = aligned_alloc(PAGE_SIZE, size); if (p && (flags & __GFP_ZERO)) @@ -174,4 +188,53 @@ static inline struct kmem_cache *kmem_cache_create(size_t obj_size) #define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct)) +#define PAGE_KERNEL 0 +#define PAGE_KERNEL_EXEC 1 + +#define vfree(p) free(p) + +static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask) +{ + unsigned i = 0; + void *p; + + size = round_up(size, PAGE_SIZE); + + do { + run_shrinkers(gfp_mask, i != 0); + + p = aligned_alloc(PAGE_SIZE, size); + if (p && gfp_mask & __GFP_ZERO) + memset(p, 0, size); + } while (!p && i++ < 10); + + return p; +} + +static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) +{ + void *p; + + p = __vmalloc(size, gfp_mask); + if (!p) + return NULL; + + if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) { + vfree(p); + return NULL; + } + + return p; +} + +static inline void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL); +} + +static inline void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL|__GFP_ZERO); +} + #endif /* __TOOLS_LINUX_SLAB_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index c9be6b6..6c4a623 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -2,27 +2,32 @@ #define __TOOLS_LINUX_SPINLOCK_H #include +#include typedef struct { - int count; + pthread_mutex_t lock; } raw_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED(name) 
(raw_spinlock_t) { .count = 0 } +#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER } static inline void raw_spin_lock_init(raw_spinlock_t *lock) { - smp_store_release(&lock->count, 0); + pthread_mutex_init(&lock->lock, NULL); +} + +static inline bool raw_spin_trylock(raw_spinlock_t *lock) +{ + return !pthread_mutex_trylock(&lock->lock); } static inline void raw_spin_lock(raw_spinlock_t *lock) { - while (xchg_acquire(&lock->count, 1)) - ; + pthread_mutex_lock(&lock->lock); } static inline void raw_spin_unlock(raw_spinlock_t *lock) { - smp_store_release(&lock->count, 0); + pthread_mutex_unlock(&lock->lock); } #define raw_spin_lock_irq(lock) raw_spin_lock(lock) diff --git a/include/linux/string.h b/include/linux/string.h index b5e00a0..3ceda3a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -6,6 +6,7 @@ #include /* for size_t */ extern size_t strlcpy(char *dest, const char *src, size_t size); +extern ssize_t strscpy(char *dest, const char *src, size_t count); extern char *strim(char *); extern void memzero_explicit(void *, size_t); int match_string(const char * const *, size_t, const char *); diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h new file mode 100644 index 0000000..af58770 --- /dev/null +++ b/include/linux/string_helpers.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_STRING_HELPERS_H_ +#define _LINUX_STRING_HELPERS_H_ + +#include +#include +#include + + +/* Descriptions of the types of units to + * print in */ +enum string_size_units { + STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ + STRING_UNITS_2, /* use binary powers of 2^10 */ +}; + +int string_get_size(u64 size, u64 blk_size, enum string_size_units units, + char *buf, int len); + +#endif diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 3ba2f48..cb75d88 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -10,6 +10,10 @@ struct attribute { umode_t 
mode; }; +struct attribute_group { + struct attribute **attrs; +}; + struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); diff --git a/include/linux/types.h b/include/linux/types.h index 77f9673..fc05e23 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -6,6 +6,7 @@ #include #include +#include #include #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ @@ -24,7 +25,6 @@ typedef unsigned short umode_t; typedef unsigned gfp_t; -#define GFP_KERNEL 0 #define GFP_ATOMIC 0 #define GFP_NOFS 0 #define GFP_NOIO 0 @@ -35,6 +35,7 @@ typedef unsigned gfp_t; #define __GFP_NORETRY 0 #define __GFP_NOFAIL 0 #define __GFP_ZERO 1 +#define GFP_KERNEL 2 #define PAGE_ALLOC_COSTLY_ORDER 6 @@ -78,4 +79,8 @@ typedef u64 sector_t; typedef int (*cmp_func_t)(const void *a, const void *b); +typedef unsigned int __bitwise slab_flags_t; +typedef u64 phys_addr_t; +struct vm_struct; + #endif /* _TOOLS_LINUX_TYPES_H_ */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ccb319e..55fffb5 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -1,59 +1,6 @@ #ifndef __TOOLS_LINUX_VMALLOC_H #define __TOOLS_LINUX_VMALLOC_H -#include -#include - #include "linux/slab.h" -#include "tools-util.h" - -#define PAGE_KERNEL 0 -#define PAGE_KERNEL_EXEC 1 - -#define vfree(p) free(p) - -static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask) -{ - unsigned i = 0; - void *p; - - size = round_up(size, PAGE_SIZE); - - do { - run_shrinkers(); - - p = aligned_alloc(PAGE_SIZE, size); - if (p && gfp_mask & __GFP_ZERO) - memset(p, 0, size); - } while (!p && i++ < 10); - - return p; -} - -static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) -{ - void *p; - - p = __vmalloc(size, gfp_mask); - if (!p) - return NULL; - - if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) { - vfree(p); - return NULL; - } - - return p; -} - -static 
inline void *vmalloc(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL); -} - -static inline void *vzalloc(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL|__GFP_ZERO); -} #endif /* __TOOLS_LINUX_VMALLOC_H */ diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 0dd1b02..b0fa1ed 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,10 +1,447 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd) and + * the GPLv2 (found in the COPYING file in the root directory of + * https://github.com/facebook/zstd). You may select, at your option, one of the + * above-listed licenses. + */ + +#ifndef LINUX_ZSTD_H +#define LINUX_ZSTD_H + +/** + * This is a kernel-style API that wraps the upstream zstd API, which cannot be + * used directly because the symbols aren't exported. It exposes the minimal + * functionality which is currently required by users of zstd in the kernel. + * Expose extra functions from lib/zstd/zstd.h as needed. + */ + +/* ====== Dependency ====== */ +#include #include +#include + +/* ====== Helper Functions ====== */ +/** + * zstd_compress_bound() - maximum compressed size in worst case scenario + * @src_size: The size of the data to compress. + * + * Return: The maximum compressed size in the worst case scenario. + */ +size_t zstd_compress_bound(size_t src_size); + +/** + * zstd_is_error() - tells if a size_t function result is an error code + * @code: The function result to check for error. + * + * Return: Non-zero iff the code is an error. 
+ */ +unsigned int zstd_is_error(size_t code); + +/** + * enum zstd_error_code - zstd error codes + */ +typedef ZSTD_ErrorCode zstd_error_code; + +/** + * zstd_get_error_code() - translates an error function result to an error code + * @code: The function result for which zstd_is_error(code) is true. + * + * Return: A unique error code for this error. + */ +zstd_error_code zstd_get_error_code(size_t code); + +/** + * zstd_get_error_name() - translates an error function result to a string + * @code: The function result for which zstd_is_error(code) is true. + * + * Return: An error string corresponding to the error code. + */ +const char *zstd_get_error_name(size_t code); + +/** + * zstd_min_clevel() - minimum allowed compression level + * + * Return: The minimum allowed compression level. + */ +int zstd_min_clevel(void); + +/** + * zstd_max_clevel() - maximum allowed compression level + * + * Return: The maximum allowed compression level. + */ +int zstd_max_clevel(void); + +/* ====== Parameter Selection ====== */ + +/** + * enum zstd_strategy - zstd compression search strategy + * + * From faster to stronger. See zstd_lib.h. + */ +typedef ZSTD_strategy zstd_strategy; + +/** + * struct zstd_compression_parameters - zstd compression parameters + * @windowLog: Log of the largest match distance. Larger means more + * compression, and more memory needed during decompression. + * @chainLog: Fully searched segment. Larger means more compression, + * slower, and more memory (useless for fast). + * @hashLog: Dispatch table. Larger means more compression, + * slower, and more memory. + * @searchLog: Number of searches. Larger means more compression and slower. + * @searchLength: Match length searched. Larger means faster decompression, + * sometimes less compression. + * @targetLength: Acceptable match size for optimal parser (only). Larger means + * more compression, and slower. + * @strategy: The zstd compression strategy. + * + * See zstd_lib.h. 
+ */ +typedef ZSTD_compressionParameters zstd_compression_parameters; + +/** + * struct zstd_frame_parameters - zstd frame parameters + * @contentSizeFlag: Controls whether content size will be present in the + * frame header (when known). + * @checksumFlag: Controls whether a 32-bit checksum is generated at the + * end of the frame for error detection. + * @noDictIDFlag: Controls whether dictID will be saved into the frame + * header when using dictionary compression. + * + * The default value is all fields set to 0. See zstd_lib.h. + */ +typedef ZSTD_frameParameters zstd_frame_parameters; + +/** + * struct zstd_parameters - zstd parameters + * @cParams: The compression parameters. + * @fParams: The frame parameters. + */ +typedef ZSTD_parameters zstd_parameters; + +/** + * zstd_get_params() - returns zstd_parameters for selected level + * @level: The compression level + * @estimated_src_size: The estimated source size to compress or 0 + * if unknown. + * + * Return: The selected zstd_parameters. + */ +zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); + +/* ====== Single-pass Compression ====== */ + +typedef ZSTD_CCtx zstd_cctx; + +/** + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx + * @parameters: The compression parameters to be used. + * + * If multiple compression parameters might be used, the caller must call + * zstd_cctx_workspace_bound() for each set of parameters and use the maximum + * size. + * + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_cctx(). + */ +size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); + +/** + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to + * determine how large the workspace must be. 
+ * + * Return: A zstd compression context or NULL on error. + */ +zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size); + +/** + * zstd_compress_cctx() - compress src into dst with the initialized parameters + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @dst: The buffer to compress src into. + * @dst_capacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @src_size: The size of the data to compress. + * @parameters: The compression parameters to be used. + * + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size, const zstd_parameters *parameters); + +/* ====== Single-pass Decompression ====== */ + +typedef ZSTD_DCtx zstd_dctx; + +/** + * zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx + * + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_dctx(). + */ +size_t zstd_dctx_workspace_bound(void); + +/** + * zstd_init_dctx() - initialize a zstd decompression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to + * determine how large the workspace must be. + * + * Return: A zstd decompression context or NULL on error. + */ +zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size); + +/** + * zstd_decompress_dctx() - decompress zstd compressed src into dst + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dst_capacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. 
+ * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @src_size: The exact size of the data to decompress. + * + * Return: The decompressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size); + +/* ====== Streaming Buffers ====== */ + +/** + * struct zstd_in_buffer - input buffer for streaming + * @src: Start of the input buffer. + * @size: Size of the input buffer. + * @pos: Position where reading stopped. Will be updated. + * Necessarily 0 <= pos <= size. + * + * See zstd_lib.h. + */ +typedef ZSTD_inBuffer zstd_in_buffer; + +/** + * struct zstd_out_buffer - output buffer for streaming + * @dst: Start of the output buffer. + * @size: Size of the output buffer. + * @pos: Position where writing stopped. Will be updated. + * Necessarily 0 <= pos <= size. + * + * See zstd_lib.h. + */ +typedef ZSTD_outBuffer zstd_out_buffer; + +/* ====== Streaming Compression ====== */ + +typedef ZSTD_CStream zstd_cstream; + +/** + * zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream + * @cparams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_cstream(). + */ +size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); + +/** + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. + * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller + * must pass the source size (zero means empty source). + * Otherwise, the caller may optionally pass the source + * size, or zero if unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. 
+ * Use zstd_cstream_workspace_bound(params->cparams) to + * determine how large the workspace must be. + * + * Return: The zstd streaming compression context or NULL on error. + */ +zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters, + unsigned long long pledged_src_size, void *workspace, size_t workspace_size); + +/** + * zstd_reset_cstream() - reset the context using parameters from creation + * @cstream: The zstd streaming compression context to reset. + * @pledged_src_size: Optionally the source size, or zero if unknown. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. If `pledged_src_size` is non-zero the frame + * content size is always written into the frame header. + * + * Return: Zero or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_reset_cstream(zstd_cstream *cstream, + unsigned long long pledged_src_size); + +/** + * zstd_compress_stream() - streaming compress some of input into output + * @cstream: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * @input: Source buffer. `input->pos` is updated to indicate how much data + * was read. Note that it may not consume the entire input, in which + * case `input->pos < input->size`, and it's up to the caller to + * present remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * + * Return: A hint for the number of bytes to use as the input for the next + * function call or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output, + zstd_in_buffer *input); + +/** + * zstd_flush_stream() - flush internal buffers into output + * @cstream: The zstd streaming compression context. 
+ * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * zstd_flush_stream() must be called until it returns 0, meaning all the data + * has been flushed. Since zstd_flush_stream() causes a block to be ended, + * calling it too often will degrade the compression ratio. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using zstd_is_error(). + */ +size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output); + +/** + * zstd_end_stream() - flush internal buffers into output and end the frame + * @cstream: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * zstd_end_stream() must be called until it returns 0, meaning all the data has + * been flushed and the frame epilogue has been written. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using zstd_is_error(). + */ +size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output); + +/* ====== Streaming Decompression ====== */ + +typedef ZSTD_DStream zstd_dstream; + +/** + * zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream + * @max_window_size: The maximum window size allowed for compressed frames. + * + * Return: A lower bound on the size of the workspace that is passed + * to zstd_init_dstream(). + */ +size_t zstd_dstream_workspace_bound(size_t max_window_size); + +/** + * zstd_init_dstream() - initialize a zstd streaming decompression context + * @max_window_size: The maximum window size allowed for compressed frames. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. + * Use zstd_dstream_workspace_bound(max_window_size) to + * determine how large the workspace must be.
+ * + * Return: The zstd streaming decompression context. + */ +zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace, + size_t workspace_size); + +/** + * zstd_reset_dstream() - reset the context using parameters from creation + * @dstream: The zstd streaming decompression context to reset. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. + * + * Return: Zero or an error, which can be checked using zstd_is_error(). + */ +size_t zstd_reset_dstream(zstd_dstream *dstream); + +/** + * zstd_decompress_stream() - streaming decompress some of input into output + * @dstream: The zstd streaming decompression context. + * @output: Destination buffer. `output.pos` is updated to indicate how much + * decompressed data was written. + * @input: Source buffer. `input.pos` is updated to indicate how much data was + * read. Note that it may not consume the entire input, in which case + * `input.pos < input.size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * zstd_decompress_stream() will not consume the last byte of the frame until + * the entire frame is flushed. + * + * Return: Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise returns a hint for the number of bytes to use as the + * input for the next function call or an error, which can be checked + * using zstd_is_error(). The size hint will never load more than the + * frame. + */ +size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + zstd_in_buffer *input); + +/* ====== Frame Inspection Functions ====== */ + +/** + * zstd_find_frame_compressed_size() - returns the size of a compressed frame + * @src: Source buffer. It should point to the start of a zstd encoded + * frame or a skippable frame. 
+ * @src_size: The size of the source buffer. It must be at least as large as the + * size of the frame. + * + * Return: The compressed size of the frame pointed to by `src` or an error, + * which can be checked with zstd_is_error(). + * Suitable to pass to ZSTD_decompress() or similar functions. + */ +size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); -#define ZSTD_initDCtx(w, s) ZSTD_initStaticDCtx(w, s) -#define ZSTD_initCCtx(w, s) ZSTD_initStaticCCtx(w, s) +/** + * struct zstd_frame_header - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not + * present. + * @windowSize: The window size, or 0 if the frame is a skippable frame. + * @blockSizeMax: The maximum block size. + * @frameType: The frame type (zstd or skippable). + * @headerSize: The size of the frame header. + * @dictID: The dictionary id, or 0 if not present. + * @checksumFlag: Whether a checksum was used. + * + * See zstd_lib.h. + */ +typedef ZSTD_frameHeader zstd_frame_header; -#define ZSTD_compressCCtx(w, dst, d_len, src, src_len, params) \ - ZSTD_compressCCtx(w, dst, d_len, src, src_len, 0) +/** + * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame + * @params: On success the frame parameters are written here. + * @src: The source buffer. It must point to a zstd or skippable frame. + * @src_size: The size of the source buffer. + * + * Return: 0 on success. If more data is required it returns how many bytes + * must be provided to make forward progress. Otherwise it returns + * an error, which can be checked using zstd_is_error().
+ */ +size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); -#define ZSTD_CCtxWorkspaceBound(p) ZSTD_estimateCCtxSize(0) -#define ZSTD_DCtxWorkspaceBound() ZSTD_estimateDCtxSize() +#endif /* LINUX_ZSTD_H */ diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h new file mode 100644 index 0000000..58b6dd4 --- /dev/null +++ b/include/linux/zstd_errors.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_ERRORS_H_398273423 +#define ZSTD_ERRORS_H_398273423 + + +/*===== dependency =====*/ +#include <linux/types.h> /* size_t */ + + +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#define ZSTDERRORLIB_VISIBILITY +#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY + +/*-********************************************* + * Error codes list + *-********************************************* + * Error codes _values_ are pinned down since v1.3.1 only. + * Therefore, don't rely on values if you may link to any version < v1.3.1. + * + * Only values < 100 are considered stable. + * + * note 1 : this API shall be used with static linking only. + * dynamic linking is not yet officially supported. + * note 2 : Prefer relying on the enum than on its value whenever possible + * This is the only supported way to use the error list < v1.3.1 + * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/ +typedef enum { + ZSTD_error_no_error = 0, + ZSTD_error_GENERIC = 1, + ZSTD_error_prefix_unknown = 10, + ZSTD_error_version_unsupported = 12, + ZSTD_error_frameParameter_unsupported = 14, + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, + ZSTD_error_workSpace_tooSmall= 66, + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ +} ZSTD_ErrorCode; + +/*! 
ZSTD_getErrorCode() : + convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, + which can be used to compare with enum list published above */ +ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + + +#endif /* ZSTD_ERRORS_H_398273423 */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index a21a392..d3d9e96 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -7,21 +7,29 @@ #include <linux/tracepoint.h> +#define TRACE_BPOS_entries(name) \ + __field(u64, name##_inode ) \ + __field(u64, name##_offset ) \ + __field(u32, name##_snapshot ) + +#define TRACE_BPOS_assign(dst, src) \ + __entry->dst##_inode = (src).inode; \ + __entry->dst##_offset = (src).offset; \ + __entry->dst##_snapshot = (src).snapshot + DECLARE_EVENT_CLASS(bpos, - TP_PROTO(struct bpos *p), + TP_PROTO(const struct bpos *p), TP_ARGS(p), TP_STRUCT__entry( - __field(u64, inode ) - __field(u64, offset ) + TRACE_BPOS_entries(p) ), TP_fast_assign( - __entry->inode = p->inode; - __entry->offset = p->offset; + TRACE_BPOS_assign(p, *p); ), - TP_printk("%llu:%llu", __entry->inode, __entry->offset) + TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) ); DECLARE_EVENT_CLASS(bkey, @@ -44,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, +
bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -82,9 +115,29 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* super-io.c: */ +TRACE_EVENT(write_super, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->ip = ip; + ), + + TP_printk("%d,%d for %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + (void *) __entry->ip) +); + /* io.c: */ -DEFINE_EVENT(bio, read_split, +DEFINE_EVENT(bio, read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -94,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce, TP_ARGS(bio) ); +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + DEFINE_EVENT(bio, read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, promote, +DEFINE_EVENT(bio, read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -122,17 +180,21 @@ DEFINE_EVENT(bio, journal_write, ); TRACE_EVENT(journal_reclaim_start, - TP_PROTO(struct bch_fs *c, u64 min_nr, + TP_PROTO(struct bch_fs *c, bool direct, bool kicked, + u64 min_nr, u64 min_key_cache, u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, min_nr, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), TP_STRUCT__entry( __field(dev_t, dev ) + __field(bool, direct ) + __field(bool, kicked ) __field(u64, min_nr ) + __field(u64, min_key_cache ) __field(u64, prereserved ) __field(u64, prereserved_total ) __field(u64, btree_cache_dirty ) @@ -143,7 +205,10 @@ TRACE_EVENT(journal_reclaim_start, TP_fast_assign( __entry->dev 
= c->dev; + __entry->direct = direct; + __entry->kicked = kicked; __entry->min_nr = min_nr; + __entry->min_key_cache = min_key_cache; __entry->prereserved = prereserved; __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; @@ -152,9 +217,12 @@ TRACE_EVENT(journal_reclaim_start, __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->direct, + __entry->kicked, __entry->min_nr, + __entry->min_key_cache, __entry->prereserved, __entry->prereserved_total, __entry->btree_cache_dirty, @@ -177,7 +245,7 @@ TRACE_EVENT(journal_reclaim_finish, __entry->nr_flushed = nr_flushed; ), - TP_printk("%d%d flushed %llu", + TP_printk("%d,%d flushed %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_flushed) ); @@ -185,44 +253,65 @@ TRACE_EVENT(journal_reclaim_finish, /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, - TP_PROTO(struct bpos *p), + TP_PROTO(const struct bpos *p), TP_ARGS(p) ); -/* Btree */ +/* Btree cache: */ -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), +TRACE_EVENT(btree_cache_scan, + TP_PROTO(long nr_to_scan, long can_free, long ret), + TP_ARGS(nr_to_scan, can_free, ret), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, id ) - __field(u64, inode ) - __field(u64, offset ) + __field(long, nr_to_scan ) + __field(long, can_free ) + __field(long, ret ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->id = b->c.btree_id; - __entry->inode = b->key.k.p.inode; - __entry->offset = b->key.k.p.offset; + __entry->nr_to_scan = nr_to_scan; + __entry->can_free = can_free; + __entry->ret = ret; ), - TP_printk("%d,%d %u id %u %llu:%llu", - 
MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, __entry->id, - __entry->inode, __entry->offset) + TP_printk("scanned for %li nodes, can free %li, ret %li", + __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_read, +DEFINE_EVENT(btree_node, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(btree_write, +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Btree */ + +DEFINE_EVENT(btree_node, btree_node_read, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_node_write, TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), TP_ARGS(b, bytes, sectors), @@ -252,268 +341,340 @@ DEFINE_EVENT(btree_node, btree_node_free, TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_node_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - TRACE_EVENT(btree_reserve_get_fail, - TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), - TP_ARGS(c, required, cl), + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + size_t required), + TP_ARGS(trans_fn, caller_ip, required), TP_STRUCT__entry( - __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(unsigned long, 
caller_ip ) __field(size_t, required ) - __field(struct closure *, cl ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->required = required; - __entry->cl = cl; + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->required = required; ), - TP_printk("%d,%d required %zu by %p", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->required, __entry->cl) + TP_printk("%s %pS required %zu", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->required) ); -DEFINE_EVENT(btree_node, btree_split, +DEFINE_EVENT(btree_node, btree_node_compact, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_compact, +DEFINE_EVENT(btree_node, btree_node_merge, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_merge, +DEFINE_EVENT(btree_node, btree_node_split, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_set_root, +DEFINE_EVENT(btree_node, btree_node_rewrite, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(btree_cache_scan, - TP_PROTO(unsigned long nr_to_scan_pages, - unsigned long nr_to_scan_nodes, - unsigned long can_free_nodes, - long ret), - TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), +DEFINE_EVENT(btree_node, btree_node_set_root, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_path_relock_fail, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( - __field(unsigned long, nr_to_scan_pages ) - __field(unsigned long, nr_to_scan_nodes ) - __field(unsigned long, can_free_nodes ) - __field(long, ret ) + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __array(char, node, 24 ) + 
__field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) ), TP_fast_assign( - __entry->nr_to_scan_pages = nr_to_scan_pages; - __entry->nr_to_scan_nodes = nr_to_scan_nodes; - __entry->can_free_nodes = can_free_nodes; - __entry->ret = ret; + struct btree *b = btree_path_node(path, level); + + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); + if (IS_ERR(b)) + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); + else + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", - __entry->nr_to_scan_pages, - __entry->nr_to_scan_nodes, - __entry->can_free_nodes, - __entry->ret) + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->node, + __entry->iter_lock_seq, + __entry->node_lock_seq) ); -TRACE_EVENT(btree_node_relock_fail, - TP_PROTO(const char *trans_fn, +TRACE_EVENT(btree_path_upgrade_fail, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - unsigned long node, - u32 iter_lock_seq, - u32 node_lock_seq), - TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, 
pos_snapshot ) - __field(unsigned long, node ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __field(u8, locked ) + __field(u8, self_read_count ) + __field(u8, self_intent_count) + __field(u8, read_count ) + __field(u8, intent_count ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + struct six_lock_count c; + + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->node = node; - __entry->iter_lock_seq = iter_lock_seq; - __entry->node_lock_seq = node_lock_seq; + __entry->btree_id = path->btree_id; + __entry->level = level; + TRACE_BPOS_assign(pos, path->pos); + __entry->locked = btree_node_locked(path, level); + + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) ?
path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->btree_id, + bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, - __entry->node, + __entry->level, + __entry->locked, + __entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count, __entry->iter_lock_seq, __entry->node_lock_seq) ); /* Garbage collection */ -DEFINE_EVENT(btree_node, btree_gc_rewrite_node, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(bch_fs, gc_start, +DEFINE_EVENT(bch_fs, gc_gens_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, gc_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, +DEFINE_EVENT(bch_fs, gc_gens_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); /* Allocator */ -TRACE_EVENT(alloc_scan, - TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped), - TP_ARGS(ca, found, inc_gen, inc_gen_skipped), +TRACE_EVENT(bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, u64 bucket), + TP_ARGS(ca, alloc_reserve, user, bucket), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, found ) - __field(u64, inc_gen ) - __field(u64, inc_gen_skipped ) + __field(dev_t, dev ) + __array(char, reserve, 16 ) + __field(bool, user ) + __field(u64, bucket ) ), TP_fast_assign( __entry->dev = ca->dev; - __entry->found = found; - __entry->inc_gen = inc_gen; - __entry->inc_gen_skipped = inc_gen_skipped; + strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->user = user; + __entry->bucket 
= bucket; ), - TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu", + TP_printk("%d,%d reserve %s user %u bucket %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->found, __entry->inc_gen, __entry->inc_gen_skipped) -); + __entry->reserve, + __entry->user, + __entry->bucket) +); + +TRACE_EVENT(bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + seen, open, need_journal_commit, nouse, nonblocking, err), -TRACE_EVENT(invalidate, - TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), - TP_ARGS(ca, offset, sectors), + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, reserve, 16 ) + __field(u64, free ) + __field(u64, avail ) + __field(u64, copygc_wait_amount ) + __field(s64, copygc_waiting_for ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) + __array(char, err, 32 ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; + strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->free = free; + __entry->avail = avail; + __entry->copygc_wait_amount = copygc_wait_amount; + __entry->copygc_waiting_for = copygc_waiting_for; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; + strlcpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve, + __entry->free, + __entry->avail, + __entry->copygc_wait_amount, + __entry->copygc_waiting_for, + 
__entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nonblocking, + __entry->err) +); + +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err), TP_STRUCT__entry( - __field(unsigned, sectors ) __field(dev_t, dev ) - __field(__u64, offset ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) + __array(char, err, 16 ) ), TP_fast_assign( - __entry->dev = ca->dev; - __entry->offset = offset, - __entry->sectors = sectors; + __entry->dev = c->dev; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; + strlcpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("invalidated %u sectors at %d,%d sector=%llu", - __entry->sectors, - MAJOR(__entry->dev), - MINOR(__entry->dev), - __entry->offset) + TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, + __entry->err) ); -DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve), +TRACE_EVENT(bucket_invalidate, + TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), + TP_ARGS(c, dev, bucket, sectors), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(enum alloc_reserve, reserve ) + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u32, sectors ) + __field(u64, bucket ) ), TP_fast_assign( - __entry->dev = ca->dev; - __entry->reserve = reserve; + __entry->dev = c->dev; + __entry->dev_idx = dev; + __entry->sectors = sectors; + __entry->bucket = bucket; ), - TP_printk("%d:%d invalidated %u:%llu cached sectors %u", MAJOR(__entry->dev),
MINOR(__entry->dev), - __entry->reserve) + __entry->dev_idx, __entry->bucket, + __entry->sectors) ); -DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) -); +/* Moving IO */ -DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) +DEFINE_EVENT(bkey, move_extent_read, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) ); -DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) +DEFINE_EVENT(bkey, move_extent_write, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) ); -/* Moving IO */ - -DEFINE_EVENT(bkey, move_extent, +DEFINE_EVENT(bkey, move_extent_finish, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_alloc_fail, +DEFINE_EVENT(bkey, move_extent_race, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_race, +DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); @@ -592,314 +753,300 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -DECLARE_EVENT_CLASS(transaction_restart, - TP_PROTO(const char *trans_fn, +/* btree transactions: */ + +DECLARE_EVENT_CLASS(transaction_event, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip), + TP_ARGS(trans, caller_ip), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ); -DEFINE_EVENT(transaction_restart, transaction_restart_ip, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, transaction_commit, + TP_PROTO(struct btree_trans *trans, unsigned 
long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_restart_injected, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, - TP_PROTO(const char *trans_fn, + +TRACE_EVENT(trans_restart_journal_preres_get, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + unsigned flags), + TP_ARGS(trans, caller_ip, flags), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned, flags ) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + + TP_printk("%s %pS %x", __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->flags) +); + +DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_restart_fault_inject, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - 
TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_traverse_all, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_traverse_all, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, - TP_PROTO(const char *trans_fn, +DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DECLARE_EVENT_CLASS(transaction_restart_iter, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos), + struct btree_path *path), + TP_ARGS(trans, caller_ip, path), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) + TRACE_BPOS_entries(pos) ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; + __entry->btree_id = path->btree_id; + 
TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %u pos %llu:%llu:%u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->btree_id, + bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, - TP_PROTO(const char *trans_fn, +TRACE_EVENT(trans_restart_upgrade, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) -); + struct btree_path *path, + unsigned old_locks_want, + unsigned new_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), -DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) -); + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, old_locks_want ) + __field(u8, new_locks_want ) + TRACE_BPOS_entries(pos) + ), -DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - 
TP_ARGS(trans_fn, caller_ip, btree_id, pos) + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = new_locks_want; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_locks_want, + __entry->new_locks_want) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, + 
TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_would_deadlock, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - bool in_traverse_all, - unsigned reason, - enum btree_id have_btree_id, - unsigned have_iter_type, - struct bpos *have_pos, - enum btree_id want_btree_id, - unsigned want_iter_type, - struct bpos *want_pos), - TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, - have_btree_id, have_iter_type, have_pos, - want_btree_id, want_iter_type, want_pos), - - TP_STRUCT__entry( - __array(char, trans_fn, 24 ) - __field(unsigned long, caller_ip ) - __field(u8, in_traverse_all ) - __field(u8, reason ) - 
__field(u8, have_btree_id ) - __field(u8, have_iter_type ) - __field(u8, want_btree_id ) - __field(u8, want_iter_type ) - - __field(u64, have_pos_inode ) - __field(u64, have_pos_offset ) - __field(u32, have_pos_snapshot) - __field(u32, want_pos_snapshot) - __field(u64, want_pos_inode ) - __field(u64, want_pos_offset ) - ), - - TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->in_traverse_all = in_traverse_all; - __entry->reason = reason; - __entry->have_btree_id = have_btree_id; - __entry->have_iter_type = have_iter_type; - __entry->want_btree_id = want_btree_id; - __entry->want_iter_type = want_iter_type; - - __entry->have_pos_inode = have_pos->inode; - __entry->have_pos_offset = have_pos->offset; - __entry->have_pos_snapshot = have_pos->snapshot; +DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); - __entry->want_pos_inode = want_pos->inode; - __entry->want_pos_offset = want_pos->offset; - __entry->want_pos_snapshot = want_pos->snapshot; - ), +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); - TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->in_traverse_all, - __entry->reason, - __entry->have_btree_id, - __entry->have_iter_type, - __entry->have_pos_inode, - __entry->have_pos_offset, - __entry->have_pos_snapshot, - __entry->want_btree_id, - __entry->want_iter_type, - __entry->want_pos_inode, - __entry->want_pos_offset, - __entry->want_pos_snapshot) +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, 
caller_ip) ); TRACE_EVENT(trans_restart_would_deadlock_write, - TP_PROTO(const char *trans_fn), - TP_ARGS(trans_fn), + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ), TP_printk("%s", __entry->trans_fn) ); TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, unsigned long bytes), - TP_ARGS(trans_fn, caller_ip, bytes), + TP_ARGS(trans, caller_ip, bytes), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(unsigned long, bytes ) ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->bytes = bytes; ), @@ -910,6 +1057,44 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); +TRACE_EVENT(trans_restart_key_cache_key_realloced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_u64s, + unsigned new_u64s), + TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(enum btree_id, btree_id ) + TRACE_BPOS_entries(pos) + __field(u32, old_u64s ) + __field(u32, new_u64s ) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->old_u64s = old_u64s; + __entry->new_u64s = new_u64s; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + 
bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_u64s, + __entry->new_u64s) +); + #endif /* _TRACE_BCACHE_H */ /* This part must be outside protection */ diff --git a/libbcachefs.c b/libbcachefs.c index f78ebf0..4fe2c3d 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -365,503 +365,6 @@ struct bch_sb *__bch2_super_read(int fd, u64 sector) return ret; } -static unsigned get_dev_has_data(struct bch_sb *sb, unsigned dev) -{ - struct bch_sb_field_replicas *replicas; - struct bch_replicas_entry *r; - unsigned i, data_has = 0; - - replicas = bch2_sb_get_replicas(sb); - - if (replicas) - for_each_replicas_entry(replicas, r) - for (i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - - return data_has; -} - -static int bch2_sb_get_target(struct bch_sb *sb, char *buf, size_t len, u64 v) -{ - struct target t = target_decode(v); - int ret; - - switch (t.type) { - case TARGET_NULL: - return scnprintf(buf, len, "none"); - case TARGET_DEV: { - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_member *m = mi->members + t.dev; - - if (bch2_dev_exists(sb, mi, t.dev)) { - char uuid_str[40]; - - uuid_unparse(m->uuid.b, uuid_str); - - ret = scnprintf(buf, len, "Device %u (%s)", t.dev, - uuid_str); - } else { - ret = scnprintf(buf, len, "Bad device %u", t.dev); - } - - break; - } - case TARGET_GROUP: { - struct bch_sb_field_disk_groups *gi; - gi = bch2_sb_get_disk_groups(sb); - - struct bch_disk_group *g = gi->entries + t.group; - - if (t.group < disk_groups_nr(gi) && !BCH_GROUP_DELETED(g)) { - ret = scnprintf(buf, len, "Label %u (%.*s)", t.group, - BCH_SB_LABEL_SIZE, g->label); - } else { - ret = scnprintf(buf, len, "Bad label %u", t.group); - } - break; - } - default: - BUG(); - } - - return ret; -} - -/* superblock printing: */ - -static void bch2_sb_print_layout(struct bch_sb *sb, enum units units) -{ - struct bch_sb_layout *l = &sb->layout; - unsigned i; 
- - printf(" type: %u\n" - " superblock max size: %s\n" - " nr superblocks: %u\n" - " Offsets: ", - l->layout_type, - pr_units(1 << l->sb_max_size_bits, units), - l->nr_superblocks); - - for (i = 0; i < l->nr_superblocks; i++) { - if (i) - printf(", "); - printf("%llu", le64_to_cpu(l->sb_offset[i])); - } - putchar('\n'); -} - -static void bch2_sb_print_journal(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - unsigned i, nr = bch2_nr_journal_buckets(journal); - - printf(" Buckets: "); - for (i = 0; i < nr; i++) { - if (i) - putchar(' '); - printf("%llu", le64_to_cpu(journal->buckets[i])); - } - putchar('\n'); -} - -static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); - unsigned i; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - time_t last_mount = le64_to_cpu(m->last_mount); - char member_uuid_str[40]; - char data_allowed_str[100]; - char data_has_str[100]; - char label [BCH_SB_LABEL_SIZE+10]; - char time_str[64]; - - if (!bch2_member_exists(m)) - continue; - - uuid_unparse(m->uuid.b, member_uuid_str); - - if (BCH_MEMBER_GROUP(m)) { - unsigned idx = BCH_MEMBER_GROUP(m) - 1; - - if (idx < disk_groups_nr(gi)) { - scnprintf(label, sizeof(label), "%.*s (%u)", - BCH_SB_LABEL_SIZE, - gi->entries[idx].label, idx); - } else { - strcpy(label, "(bad disk labels section)"); - } - } else { - strcpy(label, "(none)"); - } - - bch2_flags_to_text(&PBUF(data_allowed_str), - bch2_data_types, - BCH_MEMBER_DATA_ALLOWED(m)); - if (!data_allowed_str[0]) - strcpy(data_allowed_str, "(none)"); - - bch2_flags_to_text(&PBUF(data_has_str), - bch2_data_types, - get_dev_has_data(sb, i)); - if (!data_has_str[0]) - strcpy(data_has_str, "(none)"); - - if (last_mount) { - struct tm *tm = 
localtime(&last_mount); - size_t err = strftime(time_str, sizeof(time_str), "%c", tm); - if (!err) - strcpy(time_str, "(formatting error)"); - } else { - strcpy(time_str, "(never)"); - } - - printf(" Device %u:\n" - " UUID: %s\n" - " Size: %s\n" - " Bucket size: %s\n" - " First bucket: %u\n" - " Buckets: %llu\n" - " Last mount: %s\n" - " State: %s\n" - " Group: %s\n" - " Data allowed: %s\n" - - " Has data: %s\n" - - " Discard: %llu\n", - i, member_uuid_str, - pr_units(le16_to_cpu(m->bucket_size) * - le64_to_cpu(m->nbuckets), units), - pr_units(le16_to_cpu(m->bucket_size), units), - le16_to_cpu(m->first_bucket), - le64_to_cpu(m->nbuckets), - time_str, - - BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR - ? bch2_member_states[BCH_MEMBER_STATE(m)] - : "unknown", - - label, - data_allowed_str, - data_has_str, - - BCH_MEMBER_DISCARD(m)); - } -} - -static void bch2_sb_print_crypt(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - printf(" KFD: %llu\n" - " scrypt n: %llu\n" - " scrypt r: %llu\n" - " scrypt p: %llu\n", - BCH_CRYPT_KDF_TYPE(crypt), - BCH_KDF_SCRYPT_N(crypt), - BCH_KDF_SCRYPT_R(crypt), - BCH_KDF_SCRYPT_P(crypt)); -} - -static void bch2_sb_print_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_replicas_v0 *replicas = field_to_type(f, replicas_v0); - struct bch_replicas_entry_v0 *e; - unsigned i; - - for_each_replicas_entry(replicas, e) { - printf_pad(32, " %s:", bch2_data_types[e->data_type]); - - putchar('['); - for (i = 0; i < e->nr_devs; i++) { - if (i) - putchar(' '); - printf("%u", e->devs[i]); - } - printf("]\n"); - } -} - -static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_replicas *replicas = field_to_type(f, replicas); - struct bch_replicas_entry *e; - unsigned i; - - for_each_replicas_entry(replicas, e) { - printf_pad(32, " %s: %u/%u", - 
bch2_data_types[e->data_type], - e->nr_required, - e->nr_devs); - - putchar('['); - for (i = 0; i < e->nr_devs; i++) { - if (i) - putchar(' '); - printf("%u", e->devs[i]); - } - printf("]\n"); - } -} - -static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ -} - -static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ -} - -static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - - printf(" flags: %x", le32_to_cpu(clean->flags)); - printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq)); -} - -static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) -{ - struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); - unsigned i, nr = blacklist_nr_entries(bl); - - for (i = 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - printf(" %llu-%llu\n", - le64_to_cpu(e->start), - le64_to_cpu(e->end)); - } -} - -typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units); - -struct bch_sb_field_toolops { - sb_field_print_fn print; -}; - -static const struct bch_sb_field_toolops bch2_sb_field_ops[] = { -#define x(f, nr) \ - [BCH_SB_FIELD_##f] = { \ - .print = bch2_sb_print_##f, \ - }, - BCH_SB_FIELDS() -#undef x -}; - -static inline void bch2_sb_field_print(struct bch_sb *sb, - struct bch_sb_field *f, - enum units units) -{ - unsigned type = le32_to_cpu(f->type); - - if (type < BCH_SB_FIELD_NR) - bch2_sb_field_ops[type].print(sb, f, units); - else - printf("(unknown field %u)\n", type); -} - -void bch2_sb_print(struct bch_sb *sb, bool print_layout, - unsigned fields, enum units units) -{ - struct bch_sb_field_members *mi; - char user_uuid_str[40], internal_uuid_str[40]; - char features_str[500]; - char compat_features_str[500]; - char 
fields_have_str[200]; - char label[BCH_SB_LABEL_SIZE + 1]; - char time_str[64]; - char foreground_str[64]; - char background_str[64]; - char promote_str[64]; - char metadata_str[64]; - struct bch_sb_field *f; - u64 fields_have = 0; - unsigned nr_devices = 0; - time_t time_base = le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC; - - memcpy(label, sb->label, BCH_SB_LABEL_SIZE); - label[BCH_SB_LABEL_SIZE] = '\0'; - - uuid_unparse(sb->user_uuid.b, user_uuid_str); - uuid_unparse(sb->uuid.b, internal_uuid_str); - - if (time_base) { - struct tm *tm = localtime(&time_base); - size_t err = strftime(time_str, sizeof(time_str), "%c", tm); - if (!err) - strcpy(time_str, "(formatting error)"); - } else { - strcpy(time_str, "(not set)"); - } - - mi = bch2_sb_get_members(sb); - if (mi) { - struct bch_member *m; - - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) - nr_devices += bch2_member_exists(m); - } - - bch2_sb_get_target(sb, foreground_str, sizeof(foreground_str), - BCH_SB_FOREGROUND_TARGET(sb)); - - bch2_sb_get_target(sb, background_str, sizeof(background_str), - BCH_SB_BACKGROUND_TARGET(sb)); - - bch2_sb_get_target(sb, promote_str, sizeof(promote_str), - BCH_SB_PROMOTE_TARGET(sb)); - - bch2_sb_get_target(sb, metadata_str, sizeof(metadata_str), - BCH_SB_METADATA_TARGET(sb)); - - bch2_flags_to_text(&PBUF(features_str), - bch2_sb_features, - le64_to_cpu(sb->features[0])); - - bch2_flags_to_text(&PBUF(compat_features_str), - bch2_sb_compat, - le64_to_cpu(sb->compat[0])); - - vstruct_for_each(sb, f) - fields_have |= 1 << le32_to_cpu(f->type); - bch2_flags_to_text(&PBUF(fields_have_str), - bch2_sb_fields, fields_have); - - printf("External UUID: %s\n" - "Internal UUID: %s\n" - "Device index: %u\n" - "Label: %s\n" - "Version: %u\n" - "Oldest version on disk: %u\n" - "Created: %s\n" - "Squence number: %llu\n" - "Block_size: %s\n" - "Btree node size: %s\n" - "Error action: %s\n" - "Clean: %llu\n" - "Features: %s\n" - "Compat features: %s\n" - - "Metadata replicas: 
%llu\n" - "Data replicas: %llu\n" - - "Metadata checksum type: %s (%llu)\n" - "Data checksum type: %s (%llu)\n" - "Compression type: %s (%llu)\n" - - "Foreground write target: %s\n" - "Background write target: %s\n" - "Promote target: %s\n" - "Metadata target: %s\n" - - "String hash type: %s (%llu)\n" - "32 bit inodes: %llu\n" - "GC reserve percentage: %llu%%\n" - "Root reserve percentage: %llu%%\n" - - "Devices: %u live, %u total\n" - "Sections: %s\n" - "Superblock size: %llu\n", - user_uuid_str, - internal_uuid_str, - sb->dev_idx, - label, - le16_to_cpu(sb->version), - le16_to_cpu(sb->version_min), - time_str, - le64_to_cpu(sb->seq), - pr_units(le16_to_cpu(sb->block_size), units), - pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units), - - BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR - ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] - : "unknown", - - BCH_SB_CLEAN(sb), - features_str, - compat_features_str, - - BCH_SB_META_REPLICAS_WANT(sb), - BCH_SB_DATA_REPLICAS_WANT(sb), - - BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR - ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)] - : "unknown", - BCH_SB_META_CSUM_TYPE(sb), - - BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR - ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)] - : "unknown", - BCH_SB_DATA_CSUM_TYPE(sb), - - BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR - ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)] - : "unknown", - BCH_SB_COMPRESSION_TYPE(sb), - - foreground_str, - background_str, - promote_str, - metadata_str, - - BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR - ? 
bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] - : "unknown", - BCH_SB_STR_HASH_TYPE(sb), - - BCH_SB_INODE_32BIT(sb), - BCH_SB_GC_RESERVE(sb), - BCH_SB_ROOT_RESERVE(sb), - - nr_devices, sb->nr_devices, - fields_have_str, - vstruct_bytes(sb)); - - if (print_layout) { - printf("\n" - "Layout:\n"); - bch2_sb_print_layout(sb, units); - } - - vstruct_for_each(sb, f) { - unsigned type = le32_to_cpu(f->type); - char name[60]; - - if (!(fields & (1 << type))) - continue; - - if (type < BCH_SB_FIELD_NR) { - scnprintf(name, sizeof(name), "%s", bch2_sb_fields[type]); - name[0] = toupper(name[0]); - } else { - scnprintf(name, sizeof(name), "(unknown field %u)", type); - } - - printf("\n%s (size %llu):\n", name, vstruct_bytes(f)); - if (type < BCH_SB_FIELD_NR) - bch2_sb_field_print(sb, f, units); - } -} - /* ioctl interface: */ /* Global control device: */ @@ -1094,6 +597,7 @@ next: struct bch_opts bch2_parse_opts(struct bch_opt_strs strs) { struct bch_opts opts = bch2_opts_empty(); + struct printbuf err = PRINTBUF; unsigned i; int ret; u64 v; @@ -1103,17 +607,16 @@ struct bch_opts bch2_parse_opts(struct bch_opt_strs strs) bch2_opt_table[i].type == BCH_OPT_FN) continue; - ret = bch2_opt_parse(NULL, "option", + ret = bch2_opt_parse(NULL, &bch2_opt_table[i], - strs.by_id[i], &v); + strs.by_id[i], &v, &err); if (ret < 0) - die("Invalid %s: %s", - bch2_opt_table[i].attr.name, - strerror(-ret)); + die("Invalid option %s", err.buf); bch2_opt_set_by_id(&opts, i, v); } + printbuf_exit(&err); return opts; } @@ -1186,7 +689,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs) struct dirent *d; dev_names devs; - darray_init(devs); + darray_init(&devs); while ((errno = 0), (d = readdir(dir))) { struct dev_name n = { 0, NULL, NULL }; @@ -1210,7 +713,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs) n.label = read_file_str(fs.sysfs_fd, label_attr); free(label_attr); - darray_append(devs, n); + darray_push(&devs, n); } closedir(dir); diff --git a/libbcachefs.h b/libbcachefs.h 
index ab4f0cd..17e8eef 100644 --- a/libbcachefs.h +++ b/libbcachefs.h @@ -79,8 +79,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs, void bch2_super_write(int, struct bch_sb *); struct bch_sb *__bch2_super_read(int, u64); -void bch2_sb_print(struct bch_sb *, bool, unsigned, enum units); - /* ioctl interface: */ int bcachectl_open(void); @@ -239,7 +237,7 @@ struct dev_name { char *label; uuid_le uuid; }; -typedef darray(struct dev_name) dev_names; +typedef DARRAY(struct dev_name) dev_names; dev_names bchu_fs_get_devices(struct bchfs_handle); diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 5070caf..9592541 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -173,7 +173,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, bkey_xattr_init(&xattr->k_i); xattr->k.u64s = u64s; xattr->v.x_type = acl_to_xattr_type(type); - xattr->v.x_name_len = 0, + xattr->v.x_name_len = 0; xattr->v.x_val_len = cpu_to_le16(acl_len); acl_header = xattr_val(&xattr->v); @@ -236,7 +236,7 @@ retry: &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (ret) { - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (ret != -ENOENT) acl = ERR_PTR(ret); @@ -335,7 +335,7 @@ retry: btree_err: bch2_trans_iter_exit(&trans, &inode_iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err; diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 023db62..796b9f5 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -14,6 +15,7 @@ #include "debug.h" #include "ec.h" #include "error.h" +#include "lru.h" #include "recovery.h" #include "varint.h" @@ -26,12 +28,7 @@ #include #include -const char * const bch2_allocator_states[] = { -#define 
x(n) #n, - ALLOC_THREAD_STATES() -#undef x - NULL -}; +/* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, @@ -39,7 +36,17 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -/* Persistent alloc info: */ +struct bkey_alloc_unpacked { + u64 journal_seq; + u8 gen; + u8 oldest_gen; + u8 data_type; + bool need_discard:1; + bool need_inc_gen:1; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS_V2() +#undef x +}; static inline u64 alloc_field_v1_get(const struct bch_alloc *a, const void **p, unsigned field) @@ -161,6 +168,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; + out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); + out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); out->journal_seq = le64_to_cpu(a.v->journal_seq); #define x(_name, _bits) \ @@ -182,53 +191,9 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, return 0; } -static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { - struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - u8 *out = a->v.data; - u8 *end = (void *) &dst[1]; - u8 *last_nonzero_field = out; - unsigned bytes; - - a->k.p = POS(src.dev, src.bucket); - a->v.gen = src.gen; - a->v.oldest_gen = src.oldest_gen; - a->v.data_type = src.data_type; - a->v.journal_seq = cpu_to_le64(src.journal_seq); - -#define x(_name, _bits) \ - nr_fields++; \ - \ - if (src._name) { \ - out += bch2_varint_encode_fast(out, src._name); \ - \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } else { \ - *out++ = 0; \ - } - - BCH_ALLOC_FIELDS_V2() -#undef x - BUG_ON(out > end); - - out = last_nonzero_field; - a->v.nr_fields = 
last_nonzero_fieldnr; - - bytes = (u8 *) out - (u8 *) &a->v; - set_bkey_val_bytes(&a->k, bytes); - memset_u64s_tail(&a->v, 0, bytes); -} - -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -{ - struct bkey_alloc_unpacked ret = { - .dev = k.k->p.inode, - .bucket = k.k->p.offset, - .gen = 0, - }; + struct bkey_alloc_unpacked ret = { .gen = 0 }; switch (k.k->type) { case KEY_TYPE_alloc: @@ -245,653 +210,1189 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, - const struct bkey_alloc_unpacked src) +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { - struct bkey_alloc_buf *dst; + unsigned i, bytes = offsetof(struct bch_alloc, data); - dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (!IS_ERR(dst)) - bch2_alloc_pack_v3(dst, src); + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) + if (a->fields & (1 << i)) + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; - return dst; + return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_alloc_unpacked *u, unsigned trigger_flags) +int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + /* allow for unknown fields */ + if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { + prt_printf(err, "incorrect value size (%zu < %u)", + bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); + return -EINVAL; + } - return PTR_ERR_OR_ZERO(a) ?: - bch2_trans_update(trans, iter, &a->k, trigger_flags); + return 0; } -static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - unsigned i, bytes = offsetof(struct bch_alloc, data); + struct bkey_alloc_unpacked u; - for (i 
= 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) - if (a->fields & (1 << i)) - bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; + if (bch2_alloc_unpack_v2(&u, k)) { + prt_printf(err, "unpack error"); + return -EINVAL; + } - return DIV_ROUND_UP(bytes, sizeof(u64)); + return 0; } -const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + struct bkey_alloc_unpacked u; - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; + if (bch2_alloc_unpack_v3(&u, k)) { + prt_printf(err, "unpack error"); + return -EINVAL; + } - /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) - return "incorrect value size"; + return 0; +} + +int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + + if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%lu != %u)", + bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); + return -EINVAL; + } + + if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { + prt_printf(err, "invalid backpointers_start"); + return -EINVAL; + } + + if (rw == WRITE) { + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + prt_printf(err, "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + return -EINVAL; + } - return NULL; + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + if (a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe) { + prt_printf(err, "empty data type free but have data"); + return -EINVAL; + } + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + if (!a.v->dirty_sectors) { 
+ prt_printf(err, "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + return -EINVAL; + } + break; + case BCH_DATA_cached: + if (!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe) { + prt_printf(err, "data type inconsistency"); + return -EINVAL; + } + + if (!a.v->io_time[READ] && + test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { + prt_printf(err, "cached bucket with read_time == 0"); + return -EINVAL; + } + break; + case BCH_DATA_stripe: + if (!a.v->stripe) { + prt_printf(err, "data_type %s but stripe==0", + bch2_data_types[a.v->data_type]); + return -EINVAL; + } + break; + } + } + + return 0; } -const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +static inline u64 swab40(u64 x) { - struct bkey_alloc_unpacked u; + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + +void bch2_alloc_v4_swab(struct bkey_s k) +{ + struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + struct bch_backpointer *bp, *bps; + + a->journal_seq = swab64(a->journal_seq); + a->flags = swab32(a->flags); + a->dirty_sectors = swab32(a->dirty_sectors); + a->cached_sectors = swab32(a->cached_sectors); + a->io_time[0] = swab64(a->io_time[0]); + a->io_time[1] = swab64(a->io_time[1]); + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); + + bps = alloc_v4_backpointers(a); + for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { + bp->bucket_offset = swab40(bp->bucket_offset); + bp->bucket_len = swab32(bp->bucket_len); + bch2_bpos_swab(&bp->pos); + } +} - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = &_a; + const struct bch_backpointer *bps; + unsigned i; - if 
(bch2_alloc_unpack_v2(&u, k)) - return "unpack error"; + if (k.k->type == KEY_TYPE_alloc_v4) + a = bkey_s_c_to_alloc_v4(k).v; + else + bch2_alloc_to_v4(k, &_a); + + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "gen %u oldest_gen %u data_type %s", + a->gen, a->oldest_gen, bch2_data_types[a->data_type]); + prt_newline(out); + prt_printf(out, "journal_seq %llu", a->journal_seq); + prt_newline(out); + prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_newline(out); + prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_newline(out); + prt_printf(out, "dirty_sectors %u", a->dirty_sectors); + prt_newline(out); + prt_printf(out, "cached_sectors %u", a->cached_sectors); + prt_newline(out); + prt_printf(out, "stripe %u", a->stripe); + prt_newline(out); + prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); + prt_newline(out); + prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); + prt_newline(out); + prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); + prt_newline(out); + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); + printbuf_indent_add(out, 2); + + bps = alloc_v4_backpointers_c(a); + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) { + prt_newline(out); + bch2_backpointer_to_text(out, &bps[i]); + } - return NULL; + printbuf_indent_sub(out, 4); } -const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) +void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { - struct bkey_alloc_unpacked u; + if (k.k->type == KEY_TYPE_alloc_v4) { + int d; + + *out = *bkey_s_c_to_alloc_v4(k).v; + + d = (int) BCH_ALLOC_V4_U64s - + (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0); + if (unlikely(d > 0)) { + memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out), + 0, + d * sizeof(u64)); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); 
+ + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } +} + +static noinline struct bkey_i_alloc_v4 * +__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i_alloc_v4 *ret; + unsigned bytes = k.k->type == KEY_TYPE_alloc_v4 + ? bkey_bytes(k.k) + : sizeof(struct bkey_i_alloc_v4); + + /* + * Reserve space for one more backpointer here: + * Not sketchy at doing it this way, nope... + */ + ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); + if (IS_ERR(ret)) + return ret; - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; + if (k.k->type == KEY_TYPE_alloc_v4) { + struct bch_backpointer *src, *dst; - if (bch2_alloc_unpack_v3(&u, k)) - return "unpack error"; + bkey_reassemble(&ret->k_i, k); - return NULL; + src = alloc_v4_backpointers(&ret->v); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(&ret->v); + + memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * + sizeof(struct bch_backpointer)); + memset(src, 0, dst - src); + set_alloc_v4_u64s(ret); + } else { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + return ret; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + if (likely(k.k->type == KEY_TYPE_alloc_v4) && + BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == 
BCH_ALLOC_V4_U64s) { + /* + * Reserve space for one more backpointer here: + * Not sketchy at doing it this way, nope... + */ + struct bkey_i_alloc_v4 *ret = + bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); + if (!IS_ERR(ret)) + bkey_reassemble(&ret->k_i, k); + return ret; + } - pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", - u.gen, u.oldest_gen, bch2_data_types[u.data_type], - u.journal_seq); -#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); - BCH_ALLOC_FIELDS_V2() -#undef x + return __bch2_alloc_to_v4_mut(trans, k); +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + return bch2_alloc_to_v4_mut_inlined(trans, k); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) +{ + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + int ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + + a = bch2_alloc_to_v4_mut_inlined(trans, k); + if (IS_ERR(a)) + bch2_trans_iter_exit(trans, iter); + return a; } -int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) +int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bch_alloc_v4 a; struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked u; int ret; bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, gc); - u = 
bch2_alloc_unpack(k); - - if (!gc) - *bucket_gen(ca, k.k->p.offset) = u.gen; - - g->_mark.gen = u.gen; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = !gc ? u.oldest_gen : u.gen; - g->gen_valid = 1; - - if (!gc || - (metadata_only && - (u.data_type == BCH_DATA_user || - u.data_type == BCH_DATA_cached || - u.data_type == BCH_DATA_parity))) { - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; - } + bch2_alloc_to_v4(k, &a); + *bucket_gen(ca, k.k->p.offset) = a.gen; } bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading alloc info: %i", ret); + bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); return ret; } -/* Bucket IO clocks: */ +/* Free space/discard btree: */ -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) +static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, + const struct bch_alloc_v4 *a, + bool set) { struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); struct btree_iter iter; - struct bkey_s_c k; - struct bkey_alloc_unpacked u; - u64 *time, now; - int ret = 0; + struct bkey_s_c old; + struct bkey_i *k; + enum btree_id btree; + enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; + enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; + struct printbuf buf = PRINTBUF; + int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; + if (a->data_type != BCH_DATA_free && + a->data_type != BCH_DATA_need_discard) + return 0; - u = bch2_alloc_unpack(k); + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); - time = rw == READ ? &u.read_time : &u.write_time; - now = atomic64_read(&c->io_clock[rw].now); - if (*time == now) - goto out; + bkey_init(&k->k); + k->k.type = new_type; - *time = now; + switch (a->data_type) { + case BCH_DATA_free: + btree = BTREE_ID_freespace; + k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); + bch2_key_resize(&k->k, 1); + break; + case BCH_DATA_need_discard: + btree = BTREE_ID_need_discard; + k->k.p = alloc_k.k->p; + break; + default: + return 0; + } - ret = bch2_alloc_write(trans, &iter, &u, 0) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -out: + bch2_trans_iter_init(trans, &iter, btree, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + old = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(old); + if (ret) + goto err; + + if (ca->mi.freespace_initialized && + bch2_trans_inconsistent_on(old.k->type != old_type, trans, + "incorrect key when %s %s btree (got %s should be %s)\n" + " for %s", + set ? 
"setting" : "clearing", + bch2_btree_ids[btree], + bch2_bkey_types[old.k->type], + bch2_bkey_types[old_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = -EIO; + goto err; + } + + ret = bch2_trans_update(trans, &iter, k, 0); +err: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } -/* Background allocator thread: */ - -/* - * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens - * (marking them as invalidated on disk), then optionally issues discard - * commands to the newly free buckets, then puts them on the various freelists. - */ - -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, - struct bucket_mark m) +int bch2_trans_mark_alloc(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { - u8 gc_gen; - - if (!is_available_bucket(m)) - return false; + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a, *new_a; + u64 old_lru, new_lru; + int ret = 0; - if (m.owned_by_allocator) - return false; + /* + * Deletion only happens in the device removal path, with + * BTREE_TRIGGER_NORUN: + */ + BUG_ON(new->k.type != KEY_TYPE_alloc_v4); - if (ca->buckets_nouse && - test_bit(b, ca->buckets_nouse)) - return false; + bch2_alloc_to_v4(old, &old_a); + new_a = &bkey_i_to_alloc_v4(new)->v; - if (ca->new_fs_bucket_idx) { - /* - * Device or filesystem is still being initialized, and we - * haven't fully marked superblocks & journal: - */ - if (is_superblock_bucket(ca, b)) - return false; + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); - if (b < ca->new_fs_bucket_idx) - return false; + if (new_a->dirty_sectors > old_a.dirty_sectors || + new_a->cached_sectors > old_a.cached_sectors) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + 
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } - gc_gen = bucket_gc_gen(bucket(ca, b)); - - ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; - ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } - return gc_gen < BUCKET_GC_GEN_MAX; -} + if (old_a.data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && + alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, &old_a, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); + if (ret) + return ret; + } -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - */ + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); -static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, - u64 now, u64 last_seq_ondisk) -{ - unsigned used = m.cached_sectors; + old_lru = alloc_lru_idx(old_a); + new_lru = alloc_lru_idx(*new_a); - if (used) { - /* - * Prefer to keep buckets that have been read more recently, and - * buckets that have more data in them: - */ - u64 last_read = max_t(s64, 0, now - g->io_time[READ]); - u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, + old_lru, &new_lru, old); + if (ret) + return ret; - return -last_read_scaled; - } else { - /* - * Prefer to use buckets with smaller gc_gen so that we don't - * have to walk the btree and recalculate oldest_gen - but shift - * off the low bits so that buckets will still have equal sort - * keys when there's only a small difference, so that we can - * keep sequential buckets together: - */ - return 
bucket_gc_gen(g) >> 4; + if (new_a->data_type == BCH_DATA_cached) + new_a->io_time[READ] = new_lru; } -} -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) -{ - return cmp_int(l.key, r.key) ?: - cmp_int(r.nr, l.nr) ?: - cmp_int(l.bucket, r.bucket); + return 0; } -static inline int bucket_idx_cmp(const void *_l, const void *_r) +static int bch2_check_alloc_key(struct btree_trans *trans, + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter) { - const struct alloc_heap_entry *l = _l, *r = _r; + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bch_alloc_v4 a; + unsigned discard_key_type, freespace_key_type; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; + int ret; - return cmp_int(l->bucket, r->bucket); -} + alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos) + ? bch2_btree_iter_peek_slot(alloc_iter) + : bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 1; -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - u64 now, last_seq_ondisk; - size_t b, i, nr = 0; + ret = bkey_err(alloc_k); + if (ret) + return ret; - down_read(&ca->bucket_lock); + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, + "alloc key for invalid device:bucket %llu:%llu", + alloc_k.k->p.inode, alloc_k.k->p.offset)) + return bch2_btree_delete_at(trans, alloc_iter, 0); - buckets = bucket_array(ca); - ca->alloc_heap.used = 0; - now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.flushed_seq_ondisk; + ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + if (!ca->mi.freespace_initialized) + return 0; - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. 
- */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket *g = &buckets->b[b]; - struct bucket_mark m = READ_ONCE(g->mark); - unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); + bch2_alloc_to_v4(alloc_k, &a); - cond_resched(); + discard_key_type = a.data_type == BCH_DATA_need_discard + ? KEY_TYPE_set : 0; + freespace_key_type = a.data_type == BCH_DATA_free + ? KEY_TYPE_set : 0; - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; + bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a)); - if (!m.data_type && - bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - last_seq_ondisk, - ca->dev_idx, b)) { - ca->buckets_waiting_on_journal++; - continue; - } + k = bch2_btree_iter_peek_slot(discard_iter); + ret = bkey_err(k); + if (ret) + goto err; - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } + if (k.k->type != discard_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = discard_key_type; + update->k.p = discard_iter->pos; + + ret = bch2_trans_update(trans, discard_iter, update, 0); + if (ret) + goto err; } - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != freespace_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect key in 
freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; + bkey_init(&update->k); + update->k.type = freespace_key_type; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, 1); - while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { - nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k; + struct bch_alloc_v4 a; + u64 genbits; + struct bpos pos; + enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard + ? 
BCH_DATA_need_discard + : BCH_DATA_free; + struct printbuf buf = PRINTBUF; + int ret; + + pos = iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = iter->pos.offset & (~0ULL << 56); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); - up_read(&ca->bucket_lock); + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) + goto delete; + + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto err; + + bch2_alloc_to_v4(alloc_k, &a); + + if (fsck_err_on(a.data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(a)), c, + "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_ids[iter->btree_id], + a.data_type == state, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) + goto delete; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +delete: + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0); + goto out; } -static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +int bch2_check_alloc_info(struct bch_fs *c) { - size_t i, nr = 0; + struct btree_trans trans; + struct btree_iter iter, discard_iter, freespace_iter; + struct bkey_s_c k; + int ret = 0; - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; - ca->buckets_waiting_on_journal = 0; + bch2_trans_init(&trans, c, 0, 0); - find_reclaimable_buckets_lru(c, ca); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_check_alloc_key(&trans, &iter, + &discard_iter, + &freespace_iter)); + if (ret) + break; - heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(&trans, &freespace_iter); + bch2_trans_iter_exit(&trans, &discard_iter); + bch2_trans_iter_exit(&trans, &iter); - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; + if (ret < 0) + goto err; - return nr; + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_discard_freespace_key(&trans, &iter)) ?: + for_each_btree_key_commit(&trans, iter, + BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_discard_freespace_key(&trans, &iter)); +err: + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; } -static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - struct bkey_alloc_unpacked *u) +static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + struct btree_iter *alloc_iter) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter lru_iter; + struct bch_alloc_v4 a; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; - k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(alloc_k); + if (ret) + return ret; + + bch2_alloc_to_v4(alloc_k, &a); + + if (a.data_type != BCH_DATA_cached) + return 0; + + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + POS(alloc_k.k->p.inode, a.io_time[READ]), 0); + + k = bch2_btree_iter_peek_slot(&lru_iter); ret = bkey_err(k); if (ret) goto err; - *u = bch2_alloc_unpack(k); - u->gen++; - u->data_type = 0; - u->dirty_sectors = 0; - u->cached_sectors = 0; - u->read_time = atomic64_read(&c->io_clock[READ].now); - u->write_time = atomic64_read(&c->io_clock[WRITE].now); + if (fsck_err_on(!a.io_time[READ], c, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || + fsck_err_on(k.k->type != KEY_TYPE_lru || + le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, + "incorrect/missing lru entry\n" + " %s\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + u64 read_time = a.io_time[READ]; + + if (!a.io_time[READ]) + a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + + ret = bch2_lru_set(trans, + alloc_k.k->p.inode, + alloc_k.k->p.offset, + &a.io_time[READ]); + if (ret) + goto err; + + if (a.io_time[READ] != 
read_time) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; - ret = bch2_alloc_write(trans, &iter, u, - BTREE_TRIGGER_BUCKET_INVALIDATE); + a_mut->v.io_time[READ] = a.io_time[READ]; + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } + } err: - bch2_trans_iter_exit(trans, &iter); +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf2); + printbuf_exit(&buf); return ret; } -static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, unsigned flags) +int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct bkey_alloc_unpacked u; - size_t b; - u64 commit_seq = 0; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; int ret = 0; - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) - return 1; + bch2_trans_init(&trans, c, 0, 0); - BUG_ON(!ca->alloc_heap.used || - !ca->alloc_heap.data[0].nr); - b = ca->alloc_heap.data[0].bucket; + for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter)); - /* first, put on free_inc and mark as owned by allocator: */ - percpu_down_read(&c->mark_lock); + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} + +static int bch2_discard_one_bucket(struct btree_trans *trans, + struct btree_iter *need_discard_iter, + struct bpos *discard_pos_done, + u64 *seen, + u64 *open, + u64 *need_journal_commit, + u64 *discarded) +{ + struct bch_fs *c = trans->c; + struct bpos pos = need_discard_iter->pos; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct bch_dev *ca; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; + bool did_discard = false; + int ret = 0; - bch2_mark_alloc_bucket(c, ca, b, true); + ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); + return 0; + } - spin_lock(&c->freelist_lock); - verify_not_on_freelist(c, ca, b); - BUG_ON(!fifo_push(&ca->free_inc, b)); - spin_unlock(&c->freelist_lock); + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { + (*open)++; + goto out; + } - percpu_up_read(&c->mark_lock); + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + pos.inode, pos.offset)) { + (*need_journal_commit)++; + goto out; + } - ret = bch2_trans_do(c, NULL, &commit_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - flags, - bucket_invalidate_btree(&trans, ca, b, &u)); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; - if (!ret) { - /* remove from alloc_heap: */ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; - top->bucket++; - top->nr--; + if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; + } - if (!top->nr) - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + if (bch2_trans_inconsistent_on(a->v.journal_seq > 
c->journal.flushed_seq_ondisk, trans, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto out; + } - /* - * If we invalidating cached data then we need to wait on the - * journal commit: - */ - if (u.data_type) - *journal_seq = max(*journal_seq, commit_seq); + if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans, + "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto out; + } + if (bkey_cmp(*discard_pos_done, iter.pos) && + ca->mi.discard && !c->opts.nochanges) { /* - * We already waiting on u.alloc_seq when we filtered out - * buckets that need journal commit: + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree */ - BUG_ON(*journal_seq > u.journal_seq); - } else { - size_t b2; + bch2_trans_unlock(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); - /* remove from free_inc: */ - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); + ret = bch2_trans_relock(trans); + if (ret) + goto out; + } - bch2_mark_alloc_bucket(c, ca, b, false); + *discard_pos_done = iter.pos; + did_discard = true; - BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); - BUG_ON(b != b2); + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); +write: + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + if (ret) + goto out; - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->mark_lock); + if (did_discard) { + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; } +out: + bch2_trans_iter_exit(trans, &iter); + percpu_ref_put(&ca->io_ref); + 
printbuf_exit(&buf); + return ret; +} - return ret < 0 ? ret : 0; +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct bpos discard_pos_done = POS_MAX; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + /* + * We're doing the commit in bch2_discard_one_bucket instead of using + * for_each_btree_key_commit() so that we can increment counters after + * successful commit: + */ + ret = for_each_btree_key2(&trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded)); + + bch2_trans_exit(&trans); + + if (need_journal_commit * 2 > seen) + bch2_journal_flush_async(&c->journal, NULL); + + percpu_ref_put(&c->writes); + + trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + bch2_err_str(ret)); +} + +void bch2_do_discards(struct bch_fs *c) +{ + if (percpu_ref_tryget_live(&c->writes) && + !queue_work(system_long_wq, &c->discard_work)) + percpu_ref_put(&c->writes); } -/* - * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: - */ -static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +static int invalidate_one_bucket(struct btree_trans *trans, + struct btree_iter *lru_iter, struct bkey_s_c k, + unsigned dev_idx, s64 *nr_to_invalidate) { - u64 journal_seq = 0; + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bkey_i_alloc_v4 *a; + struct bpos bucket; + struct printbuf buf = PRINTBUF; + unsigned cached_sectors; int ret = 0; - /* Only use nowait if we've already invalidated at least one bucket: */ - while (!ret && - !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) { - if (kthread_should_stop()) { - ret = 1; - break; + if (*nr_to_invalidate <= 0 || 
k.k->p.inode != dev_idx) + return 1; + + if (k.k->type != KEY_TYPE_lru) { + prt_printf(&buf, "non lru key in lru btree:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; } - ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, - (!fifo_empty(&ca->free_inc) - ? BTREE_INSERT_NOWAIT : 0)); - /* - * We only want to batch up invalidates when they're going to - * require flushing the journal: - */ - if (!journal_seq) - break; + goto out; } - /* If we used NOWAIT, don't return the error: */ - if (!fifo_empty(&ca->free_inc)) - ret = 0; - if (ret < 0) - bch_err(ca, "error invalidating buckets: %i", ret); + bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); + + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + ret = PTR_ERR_OR_ZERO(a); if (ret) - return ret; + goto out; - if (journal_seq) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) { - bch_err(ca, "journal error: %i", ret); - return ret; - } + if (k.k->p.offset != alloc_lru_idx(a->v)) { + prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); - return 0; -} + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; + } -static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -{ - if (ca->allocator_state != new_state) { - ca->allocator_state = new_state; - closure_wake_up(&ca->fs->freelist_wait); + goto out; } + + if (!a->v.cached_sectors) + bch_err(c, "invalidating empty bucket, confused"); + + cached_sectors = a->v.cached_sectors; + + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; + a->v.data_type = 0; + a->v.dirty_sectors = 0; + 
a->v.cached_sectors = 0; + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + if (ret) + goto out; + + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); + --*nr_to_invalidate; +out: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; } -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +static void bch2_do_invalidates_work(struct work_struct *work) { + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; unsigned i; int ret = 0; - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (i == RESERVE_MOVINGGC && - !test_bit(BCH_FS_STARTED, &c->flags)) - continue; + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + + ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, + POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, + invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); - if (fifo_push(&ca->free[i], b)) { - fifo_pop(&ca->free_inc, b); - ret = 1; + if (ret < 0) { + percpu_ref_put(&ca->ref); break; } } - spin_unlock(&c->freelist_lock); - ca->allocator_state = ret - ? 
ALLOCATOR_running - : ALLOCATOR_blocked_full; - closure_wake_up(&c->freelist_wait); - return ret; + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); } -static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +void bch2_do_invalidates(struct bch_fs *c) { - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), - ca->mi.bucket_size, GFP_NOFS, 0); + if (percpu_ref_tryget_live(&c->writes) && + !queue_work(system_long_wq, &c->invalidate_work)) + percpu_ref_put(&c->writes); } -static bool allocator_thread_running(struct bch_dev *ca) +static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_dev *ca) { - unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) - ? ALLOCATOR_running - : ALLOCATOR_stopped; - alloc_thread_set_state(ca, state); - return state == ALLOCATOR_running; -} + struct bch_alloc_v4 a; -static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -{ - s64 available = dev_buckets_reclaimable(ca) - - (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); - bool ret = available > 0; + if (iter->pos.offset >= ca->mi.nbuckets) + return 1; - alloc_thread_set_state(ca, ret - ? ALLOCATOR_running - : ALLOCATOR_blocked); - return ret; + bch2_alloc_to_v4(k, &a); + return bch2_bucket_do_index(trans, k, &a, true); } -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. 
- */ -static int bch2_allocator_thread(void *arg) +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) { - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - unsigned long gc_count = c->gc_count; - size_t nr; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_member *m; int ret; - set_freezable(); + bch2_trans_init(&trans, c, 0, 0); - while (1) { - ret = kthread_wait_freezable(allocator_thread_running(ca)); - if (ret) - goto stop; + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW, + bucket_freespace_init(&trans, &iter, k, ca)); - while (!ca->alloc_heap.used) { - cond_resched(); + bch2_trans_exit(&trans); - ret = kthread_wait_freezable(buckets_available(ca, gc_count)); - if (ret) - goto stop; - - gc_count = c->gc_count; - nr = find_reclaimable_buckets(c, ca); - - if (!nr && ca->buckets_waiting_on_journal) { - ret = bch2_journal_flush(&c->journal); - if (ret) - goto stop; - } else if (nr < (ca->mi.nbuckets >> 6) && - ca->buckets_waiting_on_journal >= nr / 2) { - bch2_journal_flush_async(&c->journal, NULL); - } + if (ret < 0) { + bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + return ret; + } - if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || - ca->inc_gen_really_needs_gc) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } + mutex_lock(&c->sb_lock); + m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); - } + return 0; +} - ret = bch2_invalidate_buckets(c, ca); - if (ret) - goto stop; +int bch2_fs_freespace_init(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + bool doing_init = false; - while (!fifo_empty(&ca->free_inc)) { - 
u64 b = fifo_peek(&ca->free_inc); + /* + * We can crash during the device add path, so we need to check this on + * every mount: + */ - discard_one_bucket(c, ca, b); + for_each_member_device(ca, c, i) { + if (ca->mi.freespace_initialized) + continue; - ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); - if (ret) - goto stop; + if (!doing_init) { + bch_info(c, "initializing freespace"); + doing_init = true; + } + + ret = bch2_dev_freespace_init(c, ca); + if (ret) { + percpu_ref_put(&ca->ref); + return ret; } } -stop: - alloc_thread_set_state(ca, ALLOCATOR_stopped); - return 0; + + if (doing_init) { + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch_verbose(c, "done initializing freespace"); + } + + return ret; +} + +/* Bucket IO clocks: */ + +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + u64 now; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + now = atomic64_read(&c->io_clock[rw].now); + if (a->v.io_time[rw] == now) + goto out; + + a->v.io_time[rw] = now; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; } /* Startup/shutdown (ro/rw): */ @@ -902,7 +1403,7 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i, j; + unsigned i; lockdep_assert_held(&c->state_lock); @@ -933,8 +1434,9 @@ void bch2_recalc_capacity(struct bch_fs *c) * allocations for foreground writes must wait - * not -ENOSPC calculations. 
*/ - for (j = 0; j < RESERVE_NONE; j++) - dev_reserve += ca->free[j].size; + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ @@ -990,8 +1492,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { unsigned i; - BUG_ON(ca->alloc_thread); - /* First, remove device from allocation groups: */ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -1065,62 +1565,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -{ - if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_running); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? 
- */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch-alloc/%s", ca->name); - if (IS_ERR(p)) { - bch_err(ca->fs, "error creating allocator thread: %li", - PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 98c7866..044bc72 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -8,73 +8,128 @@ #include "debug.h" #include "super.h" -extern const char * const bch2_allocator_states[]; - -struct bkey_alloc_unpacked { - u64 journal_seq; - u64 bucket; - u8 dev; - u8 gen; - u8 oldest_gen; - u8 data_type; -#define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS_V2() -#undef x -}; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U -/* returns true if not equal */ -static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, - struct bkey_alloc_unpacked r) +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, pos.inode)) + return false; + + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; +} + +static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) +{ + return a.gen - a.oldest_gen; +} + +static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, + u32 cached_sectors, + u32 stripe, + struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (dirty_sectors) + return data_type; + if (stripe) + return BCH_DATA_stripe; + if (cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return 
BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + return __alloc_data_type(a.dirty_sectors, a.cached_sectors, + a.stripe, a, data_type); +} + +static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; +} + +static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) +{ + return ((u64) alloc_gc_gen(a) >> 4) << 56; +} + +static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) +{ + pos.offset |= alloc_freespace_genbits(a); + return pos; +} + +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0) + + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * + (sizeof(struct bch_backpointer) / sizeof(u64)); + + BUG_ON(ret > U8_MAX - BKEY_U64s); + return ret; +} + +static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) { - return l.gen != r.gen || - l.oldest_gen != r.oldest_gen || - l.data_type != r.data_type -#define x(_name, ...) 
|| l._name != r._name - BCH_ALLOC_FIELDS_V2() -#undef x - ; -} - -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); - -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, - const struct bkey_alloc_unpacked); -int bch2_alloc_write(struct btree_trans *, struct btree_iter *, - struct bkey_alloc_unpacked *, unsigned); + set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); + +void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ .key_invalid = 
bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ +} + +#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ } static inline bool bkey_is_alloc(const struct bkey *k) @@ -84,44 +139,45 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *, bool, bool); +int bch2_alloc_read(struct bch_fs *); + +int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_check_alloc_info(struct bch_fs *); +int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_do_discards(struct bch_fs *); -static inline void bch2_wake_allocator(struct bch_dev *ca) +static inline u64 should_invalidate_buckets(struct bch_dev *ca, + struct bch_dev_usage u) { - struct task_struct *p; + u64 want_free = ca->mi.nbuckets >> 7; + u64 free = max_t(s64, 0, + u.d[BCH_DATA_free].buckets + + u.d[BCH_DATA_need_discard].buckets + - bch2_dev_buckets_reserved(ca, RESERVE_none)); - rcu_read_lock(); - p = rcu_dereference(ca->alloc_thread); - if (p) - wake_up_process(p); - rcu_read_unlock(); + return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } -static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) +void bch2_do_invalidates(struct bch_fs *); + +static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) { - if (bch2_expensive_debug_checks) { - size_t iter; - long i; - unsigned j; + return 
(void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } +static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); } +int bch2_fs_freespace_init(struct bch_fs *); + void bch2_recalc_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_stop(struct bch_dev *); -int bch2_dev_allocator_start(struct bch_dev *); - void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 9b81ed2..a9e0c73 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -14,19 +14,32 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" #include "btree_gc.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" +#include "error.h" #include "io.h" +#include "journal.h" +#include "movinggc.h" #include #include #include #include +const char * const bch2_alloc_reserves[] = { +#define x(t) #t, + BCH_ALLOC_RESERVES() +#undef x + NULL +}; + /* * Open buckets represent a bucket that's currently being allocated from. 
They * serve two purposes: @@ -78,7 +91,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; ob->data_type = 0; @@ -168,49 +180,45 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_BTREE: - case RESERVE_BTREE_MOVINGGC: + case RESERVE_btree: + case RESERVE_btree_movinggc: return 0; - case RESERVE_MOVINGGC: + case RESERVE_movinggc: return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; } } -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, + enum alloc_reserve reserve, + struct bch_alloc_v4 *a, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) { struct open_bucket *ob; - long b = 0; - spin_lock(&c->freelist_lock); + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + (*skipped_nouse)++; + return NULL; + } - if (may_alloc_partial) { - int i; - - for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; - - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + (*skipped_open)++; + return NULL; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, 
ca->dev_idx, bucket)) { + (*skipped_need_journal_commit)++; + return NULL; } + spin_lock(&c->freelist_lock); + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -219,36 +227,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, c->blocked_allocate_open_bucket = local_clock(); spin_unlock(&c->freelist_lock); - trace_open_bucket_alloc_fail(ca, reserve); - return ERR_PTR(-OPEN_BUCKETS_EMPTY); + return ERR_PTR(-BCH_ERR_open_buckets_empty); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) - goto out; - - switch (reserve) { - case RESERVE_BTREE_MOVINGGC: - case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) - goto out; - break; - default: - break; + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + spin_unlock(&c->freelist_lock); + (*skipped_open)++; + return NULL; } - if (cl) - closure_wait(&c->freelist_wait, cl); - - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); - - spin_unlock(&c->freelist_lock); - - trace_bucket_alloc_fail(ca, reserve); - return ERR_PTR(-FREELIST_EMPTY); -out: - verify_not_on_freelist(c, ca, b); - ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); @@ -257,8 +245,8 @@ out: ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; - ob->gen = *bucket_gen(ca, b); - ob->bucket = b; + ob->gen = a->gen; + ob->bucket = bucket; spin_unlock(&ob->lock); ca->nr_open_buckets++; @@ -280,9 +268,343 @@ out: spin_unlock(&c->freelist_lock); - bch2_wake_allocator(ca); + return ob; +} + +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum alloc_reserve reserve, u64 free_entry, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct bkey_s_c freespace_k, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct 
open_bucket *ob; + struct bch_alloc_v4 a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; + int ret; + + if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { + prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" + " freespace key ", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + bch2_alloc_to_v4(k, &a); + + if (genbits != (alloc_freespace_genbits(a) >> 56)) { + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(a) >> 56); + bch2_bkey_val_to_text(&buf, c, freespace_k); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + + } + + if (a.data_type != BCH_DATA_free) { + prt_printf(&buf, "non free bucket in freespace btree\n" + " freespace key "); + bch2_bkey_val_to_text(&buf, c, freespace_k); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + ob = __try_alloc_bucket(c, ca, b, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + if (!ob) + iter.path->preserve = false; +err: + set_btree_iter_dontneed(&iter); + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; +} + +static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve) +{ + struct open_bucket *ob; + int i; + + spin_lock(&c->freelist_lock); + + for (i = ca->open_buckets_partial_nr - 1; i >= 0; 
--i) { + ob = c->open_buckets + ca->open_buckets_partial[i]; + + if (reserve <= ob->alloc_reserve) { + array_remove_item(ca->open_buckets_partial, + ca->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + ob->alloc_reserve = reserve; + spin_unlock(&c->freelist_lock); + return ob; + } + } + + spin_unlock(&c->freelist_lock); + return NULL; +} + +/* + * This path is for before the freespace btree is initialized: + * + * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * +bch2_bucket_alloc_early(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *cur_bucket, + u64 *buckets_seen, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + int ret; + + *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); + *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); + + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a; + + if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; + + if (ca->new_fs_bucket_idx && + is_superblock_bucket(ca, k.k->p.offset)) + continue; + + bch2_alloc_to_v4(k, &a); + + if (a.data_type != BCH_DATA_free) + continue; + + (*buckets_seen)++; + + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + *cur_bucket = iter.pos.offset; + + return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); +} + +static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *cur_bucket, + u64 *buckets_seen, + u64 *skipped_open, + 
u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + int ret; + + BUG_ON(ca->new_fs_bucket_idx); + + /* + * XXX: + * On transaction restart, we'd like to restart from the bucket we were + * at previously + */ + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); + *cur_bucket < k.k->p.offset; + (*cur_bucket)++) { + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + + (*buckets_seen)++; + + ob = try_alloc_bucket(trans, ca, reserve, + *cur_bucket, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + k, cl); + if (ob) + break; + } + + if (ob || ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ob ?: ERR_PTR(ret); +} + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + */ +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl, + struct bch_dev_usage *usage) +{ + struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; + bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); + u64 start = freespace_initialized ? 
0 : ca->bucket_alloc_trans_early_cursor; + u64 avail; + u64 cur_bucket = start; + u64 buckets_seen = 0; + u64 skipped_open = 0; + u64 skipped_need_journal_commit = 0; + u64 skipped_nouse = 0; + bool waiting = false; +again: + bch2_dev_usage_read_fast(ca, usage); + avail = dev_buckets_free(ca, *usage, reserve); + + if (usage->d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); + + if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + bch2_do_gc_gens(c); + + if (should_invalidate_buckets(ca, *usage)) + bch2_do_invalidates(c); + + if (!avail) { + if (cl && !waiting) { + closure_wait(&c->freelist_wait, cl); + waiting = true; + goto again; + } + + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + + ob = ERR_PTR(-BCH_ERR_freelist_empty); + goto err; + } + + if (waiting) + closure_wake_up(&c->freelist_wait); + + if (may_alloc_partial) { + ob = try_alloc_partial_bucket(c, ca, reserve); + if (ob) + return ob; + } + + ob = likely(ca->mi.freespace_initialized) + ? bch2_bucket_alloc_freelist(trans, ca, reserve, + &cur_bucket, + &buckets_seen, + &skipped_open, + &skipped_need_journal_commit, + &skipped_nouse, + cl) + : bch2_bucket_alloc_early(trans, ca, reserve, + &cur_bucket, + &buckets_seen, + &skipped_open, + &skipped_need_journal_commit, + &skipped_nouse, + cl); + + if (skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); + + if (!ob && !freespace_initialized && start) { + start = cur_bucket = 0; + goto again; + } + + if (!freespace_initialized) + ca->bucket_alloc_trans_early_cursor = cur_bucket; +err: + if (!ob) + ob = ERR_PTR(-BCH_ERR_no_buckets_found); + + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], + may_alloc_partial, ob->bucket); + else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) + trace_and_count(c, bucket_alloc_fail, + ca, bch2_alloc_reserves[reserve], + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + 
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, + bch2_err_str(PTR_ERR(ob))); - trace_bucket_alloc(ca, reserve); + return ob; +} + +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct bch_dev_usage usage; + struct open_bucket *ob; + + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, + may_alloc_partial, cl, &usage))); return ob; } @@ -309,11 +631,12 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, return ret; } -void bch2_dev_stripe_increment(struct bch_dev *ca, - struct dev_stripe_state *stripe) +static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, + struct dev_stripe_state *stripe, + struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca); + u64 free_space = dev_buckets_available(ca, RESERVE_none); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -329,6 +652,15 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, *v = *v < scale ? 
0 : *v - scale; } +void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) +{ + struct bch_dev_usage usage; + + bch2_dev_usage_read_fast(ca, &usage); + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); +} + #define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) #define BUCKET_ALLOC_USE_DURABILITY (1 << 1) @@ -351,7 +683,7 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -int bch2_bucket_alloc_set(struct bch_fs *c, +static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, @@ -362,46 +694,79 @@ int bch2_bucket_alloc_set(struct bch_fs *c, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; struct bch_dev *ca; - int ret = -INSUFFICIENT_DEVICES; + int ret = -BCH_ERR_insufficient_devices; unsigned i; BUG_ON(*nr_effective >= nr_replicas); for (i = 0; i < devs_sorted.nr; i++) { + struct bch_dev_usage usage; struct open_bucket *ob; - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + dev = devs_sorted.devs[i]; + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + if (!ca) continue; - if (!ca->mi.durability && *have_cache) + if (!ca->mi.durability && *have_cache) { + percpu_ref_put(&ca->ref); continue; + } + + ob = bch2_bucket_alloc_trans(trans, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + percpu_ref_put(&ca->ref); - ob = bch2_bucket_alloc(c, ca, reserve, - flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { ret = PTR_ERR(ob); - - if (cl) - return ret; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) + break; continue; } add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - bch2_dev_stripe_increment(ca, 
stripe); - - if (*nr_effective >= nr_replicas) - return 0; + if (*nr_effective >= nr_replicas) { + ret = 0; + break; + } } return ret; } +int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, + devs_may_alloc, nr_replicas, + nr_effective, have_cache, reserve, + flags, cl)); +} + /* Allocate from stripes: */ /* @@ -506,7 +871,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static int open_bucket_add_buckets(struct bch_fs *c, +static int open_bucket_add_buckets(struct btree_trans *trans, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, @@ -519,6 +884,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, unsigned flags, struct closure *_cl) { + struct bch_fs *c = trans->c; struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; @@ -550,8 +916,9 @@ static int open_bucket_add_buckets(struct bch_fs *c, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); - if (ret == -FREELIST_EMPTY || - ret == -OPEN_BUCKETS_EMPTY) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; if (*nr_effective >= nr_replicas) return 0; @@ -564,25 +931,22 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (*nr_effective >= nr_replicas) return 0; - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, 
&devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && + !cl && _cl) { cl = _cl; goto retry_blocking; } - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; } @@ -696,15 +1060,25 @@ static bool try_decrease_writepoints(struct bch_fs *c, return true; } -static struct write_point *writepoint_find(struct bch_fs *c, +static void bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) +{ + if (!mutex_trylock(lock)) { + bch2_trans_unlock(trans); + mutex_lock(lock); + } +} + +static struct write_point *writepoint_find(struct btree_trans *trans, unsigned long write_point) { + struct bch_fs *c = trans->c; struct write_point *wp, *oldest; struct hlist_head *head; if (!(write_point & 1UL)) { wp = (struct write_point *) write_point; - mutex_lock(&wp->lock); + bch2_trans_mutex_lock(trans, &wp->lock); return wp; } @@ -713,7 +1087,7 @@ restart_find: wp = __writepoint_find(head, write_point); if (wp) { lock_wp: - mutex_lock(&wp->lock); + bch2_trans_mutex_lock(trans, &wp->lock); if (wp->write_point == write_point) goto out; mutex_unlock(&wp->lock); @@ -726,8 +1100,8 @@ restart_find_oldest: if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; - mutex_lock(&oldest->lock); - mutex_lock(&c->write_points_hash_lock); + bch2_trans_mutex_lock(trans, &oldest->lock); + bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); if (oldest >= c->write_points + c->write_points_nr || try_increase_writepoints(c)) { mutex_unlock(&c->write_points_hash_lock); @@ -748,14 +1122,14 @@ restart_find_oldest: hlist_add_head_rcu(&wp->node, head); mutex_unlock(&c->write_points_hash_lock); out: - wp->last_used = sched_clock(); + wp->last_used = local_clock(); return wp; } /* * Get us an open_bucket we can allocate from, return with it locked: */ -struct 
write_point *bch2_alloc_sectors_start(struct bch_fs *c, +struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, unsigned target, unsigned erasure_code, struct write_point_specifier write_point, @@ -766,6 +1140,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; struct write_point *wp; struct open_bucket *ob; struct open_buckets ptrs; @@ -785,7 +1160,7 @@ retry: write_points_nr = c->write_points_nr; have_cache = false; - wp = writepoint_find(c, write_point.v); + wp = writepoint_find(trans, write_point.v); if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -795,21 +1170,22 @@ retry: have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, ob_flags, cl); } else { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, ob_flags, NULL); - if (!ret) + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, @@ -821,7 +1197,7 @@ alloc_done: if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == -INSUFFICIENT_DEVICES && + if (ret == -BCH_ERR_insufficient_devices && nr_effective >= nr_replicas_required) ret = 0; @@ -852,19 +1228,46 @@ err: mutex_unlock(&wp->lock); - if (ret == -FREELIST_EMPTY && + if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(c, write_points_nr)) goto retry; - switch (ret) { - case 
-OPEN_BUCKETS_EMPTY: - case -FREELIST_EMPTY: - return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); - case -INSUFFICIENT_DEVICES: + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl + ? ERR_PTR(-EAGAIN) + : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc); + + if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) return ERR_PTR(-EROFS); - default: - BUG(); - } + + return ERR_PTR(ret); +} + +struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + struct write_point *wp; + + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, + erasure_code, + write_point, + devs_have, + nr_replicas, + nr_replicas_required, + reserve, + flags, cl))); + return wp; + } struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) @@ -965,7 +1368,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) wp < c->write_points + c->write_points_nr; wp++) { writepoint_init(wp, BCH_DATA_user); - wp->last_used = sched_clock(); + wp->last_used = local_clock(); wp->write_point = (unsigned long) wp; hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); @@ -981,12 +1384,12 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list) { - pr_buf(out, "%zu ref %u type %s\n", + prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n", ob - c->open_buckets, atomic_read(&ob->pin), - bch2_data_types[ob->data_type]); + bch2_data_types[ob->data_type], + ob->dev, ob->bucket, ob->gen); } spin_unlock(&ob->lock); } - } diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index d466bda..6de63a3 100644 --- 
a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -12,6 +12,8 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +extern const char * const bch2_alloc_reserves[]; + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -115,11 +117,33 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke return false; } +static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) +{ + bool ret; + + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); + + return ret; +} + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); +struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *); struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, unsigned, struct write_point_specifier, diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 409232e..e078584 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -10,28 +10,18 @@ struct ec_bucket_buf; -#define ALLOC_THREAD_STATES() \ - x(stopped) \ - x(running) \ - x(blocked) \ - x(blocked_full) - -enum allocator_states { -#define x(n) ALLOCATOR_##n, - ALLOC_THREAD_STATES() -#undef x -}; +#define BCH_ALLOC_RESERVES() \ + x(btree_movinggc) \ + x(btree) \ + x(movinggc) \ + x(none) enum alloc_reserve { - RESERVE_BTREE_MOVINGGC = -2, - RESERVE_BTREE = -1, - RESERVE_MOVINGGC = 0, - RESERVE_NONE = 1, - RESERVE_NR = 2, +#define x(name) RESERVE_##name, + BCH_ALLOC_RESERVES() +#undef x }; -typedef FIFO(long) alloc_fifo; - #define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 
32 @@ -53,14 +43,14 @@ struct open_bucket { * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - enum bch_data_type data_type:3; + enum bch_data_type data_type:8; unsigned valid:1; unsigned on_partial_list:1; - int alloc_reserve:3; + unsigned alloc_reserve:3; - unsigned sectors_free; u8 dev; u8 gen; + u32 sectors_free; u64 bucket; struct ec_stripe_new *ec; }; @@ -94,12 +84,4 @@ struct write_point_specifier { unsigned long v; }; -struct alloc_heap_entry { - size_t bucket; - size_t nr; - unsigned long key; -}; - -typedef HEAP(struct alloc_heap_entry) alloc_heap; - #endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c new file mode 100644 index 0000000..d74de1d --- /dev/null +++ b/libbcachefs/backpointers.c @@ -0,0 +1,1128 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "error.h" + +#include + +#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc + * btree: + */ +static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, + struct bpos bp_pos) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); +} + +/* + * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: + */ +static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, + struct bpos bucket, + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + struct bpos ret; + + ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + + BUG_ON(bkey_cmp(bucket, bp_pos_to_bucket(c, ret))); + + return ret; +} + +void bch2_extent_ptr_to_bp(struct 
bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + struct bpos *bucket_pos, struct bch_backpointer *bp) +{ + enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; + s64 sectors = level ? btree_sectors(c) : k.k->size; + u32 bucket_offset; + + *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = data_type, + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = ptr_disk_sectors(sectors, p), + .pos = k.k->p, + }; +} + +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bpos bucket, + struct bch_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bch_backpointer bp2; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, + &bucket2, &bp2); + if (!bpos_cmp(bucket, bucket2) && + !memcmp(&bp, &bp2, sizeof(bp))) + return true; + } + + return false; +} + +int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + + if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { + prt_str(err, "incorrect value size"); + return -EINVAL; + } + + if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { + prt_str(err, "backpointer at wrong pos"); + return -EINVAL; + } + + return 0; +} + +void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +{ + prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", + bch2_btree_ids[bp->btree_id], + bp->level, + (u64) (bp->bucket_offset >> 
MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); + bch2_bpos_to_text(out, bp->pos); +} + +void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); +} + +void bch2_backpointer_swab(struct bkey_s k) +{ + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + + bp.v->bucket_offset = swab32(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); +} + +#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) + +static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) +{ + return cmp_int(l.bucket_offset, r.bucket_offset); +} + +static int bch2_backpointer_del_by_offset(struct btree_trans *trans, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (bp_offset < BACKPOINTER_OFFSET_MAX) { + struct bch_backpointer *bps; + struct bkey_i_alloc_v4 *a; + unsigned i, nr; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_alloc_v4) { + ret = -ENOENT; + goto err; + } + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + bps = alloc_v4_backpointers(&a->v); + nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + + for (i = 0; i < nr; i++) { + if (bps[i].bucket_offset == bp_offset) + goto found; + if (bps[i].bucket_offset > bp_offset) + break; + } + + ret = -ENOENT; + goto err; +found: + if (memcmp(&bps[i], &bp, sizeof(bp))) { + ret = -ENOENT; + goto err; + } + array_remove_item(bps, nr, i); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + ret = bch2_trans_update(trans, &iter, &a->k_i, 
0); + } else { + bp_offset -= BACKPOINTER_OFFSET_MAX; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp_offset), + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { + ret = -ENOENT; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_bucket_backpointer_del(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp, + struct bkey_s_c orig_k) +{ + struct bch_fs *c = trans->c; + struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp) ?: + memcmp(&bps[i], &bp, sizeof(bp)); + if (!cmp) + goto found; + if (cmp >= 0) + break; + } + + goto btree; +found: + array_remove_item(bps, nr, i); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + return 0; +btree: + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "backpointer not found when deleting"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "searching for "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + + prt_printf(&buf, "got "); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_str(&buf, "alloc "); + 
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + } else { + ret = -EIO; + bch2_trans_inconsistent(trans, "%s", buf.buf); + } + printbuf_exit(&buf); + goto err; + } + + ret = bch2_btree_delete_at(trans, &bp_iter, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +int bch2_bucket_backpointer_add(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp, + struct bkey_s_c orig_k) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + struct bkey_i_backpointer *bp_k; + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + /* Check for duplicates: */ + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp); + if (cmp >= 0) + break; + } + + if ((i && + (bps[i - 1].bucket_offset + + bps[i - 1].bucket_len > bp.bucket_offset)) || + (i < nr && + (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "overlapping backpointer found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "into "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err(c, "%s", buf.buf); + else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + printbuf_exit(&buf); + return -EIO; + } + } + + if (nr < BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) { + array_insert_item(bps, nr, i, bp); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + return 0; + } + + /* Overflow: use backpointer btree */ + bp_k = 
bch2_trans_kmalloc(trans, sizeof(*bp_k)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + ca = bch_dev_bkey_exists(c, a->k.p.inode); + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->v = bp; + + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "existing btree backpointer key found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "found "); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err(c, "%s", buf.buf); + else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + printbuf_exit(&buf); + ret = -EIO; + goto err; + } + } + + ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +/* + * Find the next backpointer >= *bp_offset: + */ +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, + struct bch_backpointer *dst, + unsigned iter_flags) +{ + struct bch_fs *c = trans->c; + struct bpos bp_pos, bp_end_pos; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bkey_s_c k; + struct bkey_s_c_alloc_v4 a; + size_t i; + int ret; + + if (*bp_offset == U64_MAX) + return 0; + + bp_pos = bucket_pos_to_bp(c, bucket, + max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); + bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = 
bkey_err(k); + if (ret) + goto out; + + if (k.k->type != KEY_TYPE_alloc_v4) + goto done; + + a = bkey_s_c_to_alloc_v4(k); + if (gen >= 0 && a.v->gen != gen) + goto done; + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { + if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) + continue; + + *dst = alloc_v4_backpointers_c(a.v)[i]; + *bp_offset = dst->bucket_offset; + goto out; + } + + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, + bp_pos, 0, k, ret) { + if (bpos_cmp(k.k->p, bp_end_pos) >= 0) + break; + + if (k.k->type != KEY_TYPE_backpointer) + continue; + + *dst = *bkey_s_c_to_backpointer(k).v; + *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; + goto out; + } +done: + *bp_offset = U64_MAX; +out: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +static void backpointer_not_found(struct btree_trans *trans, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp, + struct bkey_s_c k, + const char *thing_it_points_to) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", + thing_it_points_to); + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, "\n "); + + if (bp_offset >= BACKPOINTER_OFFSET_MAX) { + struct bpos bp_pos = + bucket_pos_to_bp(c, bucket, + bp_offset - BACKPOINTER_OFFSET_MAX); + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); + } + + bch2_backpointer_to_text(&buf, &bp); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err_ratelimited(c, "%s", buf.buf); + else + bch2_trans_inconsistent(trans, "%s", buf.buf); + + printbuf_exit(&buf); +} + +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket, + u64 bp_offset, + struct 
bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + min(bp.level, c->btree_roots[bp.btree_id].level), + 0); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bp.level == c->btree_roots[bp.btree_id].level + 1) + k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + + if (bp.level) { + struct btree *b; + + /* + * If a backpointer for a btree node wasn't found, it may be + * because it was overwritten by a new btree node that hasn't + * been written out yet - backpointer_get_node() checks for + * this: + */ + b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); + if (!IS_ERR_OR_NULL(b)) + return bkey_i_to_s_c(&b->key); + + bch2_trans_iter_exit(trans, iter); + + if (IS_ERR(b)) + return bkey_s_c_err(PTR_ERR(b)); + return bkey_s_c_null; + } + + backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); + return bkey_s_c_null; +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct btree *b; + + BUG_ON(!bp.level); + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + bp.level - 1, + 0); + b = bch2_btree_iter_peek_node(iter); + if (IS_ERR(b)) + goto err; + + if (b && extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) + return b; + + if (b && btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + backpointer_not_found(trans, bucket, bp_offset, bp, + bkey_i_to_s_c(&b->key), "btree node"); + b = NULL; + } +err: + bch2_trans_iter_exit(trans, iter); + return b; +} + +static int 
bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bch_dev *ca; + struct bkey_s_c alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "backpointer for mising device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); + + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto out; + + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +} + +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_backpointers, POS_MIN, 0, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + bch2_check_btree_backpointer(&trans, &iter, k))); +} + +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket_pos, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + struct bpos bucket_start, + struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c alloc_k, bp_k; + int ret; + + if (bpos_cmp(bucket_pos, bucket_start) < 0 || + bpos_cmp(bucket_pos, bucket_end) > 0) + return 0; + + 
bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto err; + + if (alloc_k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); + + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp) ?: + memcmp(&bps[i], &bp, sizeof(bp)); + if (!cmp) + goto out; + if (cmp >= 0) + break; + } + } else { + goto missing; + } + + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), + 0); + bp_k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) + goto missing; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", + bch2_btree_ids[bp.btree_id], bp.level); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_printf(&buf, "\nin alloc key "); + bch2_bkey_val_to_text(&buf, c, alloc_k); + + if (c->sb.version < bcachefs_metadata_version_backpointers || + c->opts.reconstruct_alloc || + fsck_err(c, "%s", buf.buf)) { + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); + + ret = PTR_ERR_OR_ZERO(a) ?: + bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?: + bch2_trans_update(trans, &alloc_iter, &a->k_i, 0); + } + + goto out; +} + +static int check_extent_to_backpointers(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket_start, + struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bkey_s_c k; + int ret; + + 
k = bch2_btree_iter_peek_all_levels(iter); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + if (ret) + return ret; + } + + return 0; +} + +static int check_btree_root_to_backpointers(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos bucket_start, + struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + int ret; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + c->btree_roots[btree_id].level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + BUG_ON(b != btree_node_root(c, b)); + + k = bkey_i_to_s_c(&b->key); + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + if (ret) + goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + struct sysinfo i; + u64 mem_bytes; + + si_meminfo(&i); + mem_bytes = i.totalram * i.mem_unit; + return (mem_bytes >> 1) / btree_bytes(c); +} + +int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + unsigned btree_leaf_mask, 
+ unsigned btree_interior_mask, + struct bbpos start, struct bbpos *end) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + enum btree_id btree; + int ret = 0; + + for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + + if (!((1U << btree) & btree_leaf_mask) && + !((1U << btree) & btree_interior_mask)) + continue; + + bch2_trans_node_iter_init(trans, &iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, 0); + /* + * for_each_btree_key_continue() doesn't check the return value + * from bch2_btree_iter_advance(), which is needed when + * iterating over interior nodes where we'll see keys at + * SPOS_MAX: + */ + do { + k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); + ret = bkey_err(k); + if (!k.k || ret) + break; + + --btree_nodes; + if (!btree_nodes) { + *end = BBPOS(btree, k.k->p); + bch2_trans_iter_exit(trans, &iter); + return 0; + } + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(trans, &iter); + } + + *end = BBPOS_MAX; + return ret; +} + +static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct bpos bucket_start, + struct bpos bucket_end) +{ + struct btree_iter iter; + enum btree_id btree_id; + int ret = 0; + + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + unsigned depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + depth, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(trans, &iter, + bucket_start, bucket_end)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + + bch2_trans_iter_exit(trans, &iter); + + if (ret) + break; + + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(trans, btree_id, + bucket_start, bucket_end)); + if (ret) + break; + } + return ret; +} + +int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct btree_iter alloc_iter; + struct btree_iter bp_iter; + struct bkey_s_c alloc_k, bp_k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + bool alloc_end = false, bp_end = false; + int ret = 0; + + bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + start, 0, 1, 0); + bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(trans->c, start, 0), 0, 1, 0); + while (1) { + alloc_k = !alloc_end + ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) + : bkey_s_c_null; + bp_k = !bp_end + ? 
__bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) + : bkey_s_c_null; + + ret = bkey_err(alloc_k) ?: bkey_err(bp_k); + if ((!alloc_k.k && !bp_k.k) || ret) { + *end = SPOS_MAX; + break; + } + + --btree_nodes; + if (!btree_nodes) { + *end = alloc_k.k->p; + break; + } + + if (bpos_cmp(alloc_iter.pos, SPOS_MAX) && + bpos_cmp(bucket_pos_to_bp(trans->c, alloc_iter.pos, 0), bp_iter.pos) < 0) { + if (!bch2_btree_iter_advance(&alloc_iter)) + alloc_end = true; + } else { + if (!bch2_btree_iter_advance(&bp_iter)) + bp_end = true; + } + } + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_extents_to_backpointers(struct bch_fs *c) +{ + struct btree_trans trans; + struct bpos start = POS_MIN, end; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + while (1) { + ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); + if (ret) + break; + + if (!bpos_cmp(start, POS_MIN) && bpos_cmp(end, SPOS_MAX)) + bch_verbose(c, "check_extents_to_backpointers(): alloc info does not fit in ram," + "running in multiple passes with %zu nodes per pass", + btree_nodes_fit_in_ram(c)); + + if (bpos_cmp(start, POS_MIN) || bpos_cmp(end, SPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_extents_to_backpointers(): "); + bch2_bpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); + if (ret || !bpos_cmp(end, SPOS_MAX)) + break; + + start = bpos_successor(end); + } + bch2_trans_exit(&trans); + + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bpos bucket, + u64 *bp_offset, + struct bbpos start, + struct bbpos end) +{ + struct btree_iter iter; + struct bch_backpointer bp; + struct bbpos pos; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, 
&bp, 0); + if (ret || *bp_offset == U64_MAX) + return ret; + + pos = bp_to_bbpos(bp); + if (bbpos_cmp(pos, start) < 0 || + bbpos_cmp(pos, end) > 0) + return 0; + + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (fsck_err_on(!k.k, trans->c, + "%s backpointer points to missing extent\n%s", + *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree", + (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { + ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); + if (ret == -ENOENT) + bch_err(trans->c, "backpointer at %llu not found", *bp_offset); + } + + bch2_trans_iter_exit(trans, &iter); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + struct bbpos start, + struct bbpos end) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + + while (!(ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && + bp_offset < U64_MAX) + bp_offset++; + + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + return ret < 0 ? 
ret : 0; +} + +int bch2_check_backpointers_to_extents(struct bch_fs *c) +{ + struct btree_trans trans; + struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + while (1) { + ret = bch2_get_btree_in_memory_pos(&trans, + (1U << BTREE_ID_extents)| + (1U << BTREE_ID_reflink), + ~0, + start, &end); + if (ret) + break; + + if (!bbpos_cmp(start, BBPOS_MIN) && + bbpos_cmp(end, BBPOS_MAX)) + bch_verbose(c, "check_backpointers_to_extents(): extents do not fit in ram," + "running in multiple passes with %zu nodes per pass", + btree_nodes_fit_in_ram(c)); + + if (bbpos_cmp(start, BBPOS_MIN) || + bbpos_cmp(end, BBPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_backpointers_to_extents(): "); + bch2_bbpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bbpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); + if (ret || !bbpos_cmp(end, BBPOS_MAX)) + break; + + start = bbpos_successor(end); + } + bch2_trans_exit(&trans); + + return ret; +} diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h new file mode 100644 index 0000000..1c97e36 --- /dev/null +++ b/libbcachefs/backpointers.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, + int, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + +#define bch2_bkey_ops_backpointer (struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, \ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ +} + +void 
bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, + struct bkey_s_c, struct extent_ptr_decoded, + struct bpos *, struct bch_backpointer *); + +int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, + u64 *, struct bch_backpointer *, unsigned); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); + +int bch2_check_btree_backpointers(struct bch_fs *); +int bch2_check_extents_to_backpointers(struct bch_fs *); +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/libbcachefs/bbpos.h b/libbcachefs/bbpos.h new file mode 100644 index 0000000..1fbed1f --- /dev/null +++ b/libbcachefs/bbpos.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_H +#define _BCACHEFS_BBPOS_H + +#include "bkey_methods.h" + +struct bbpos { + enum btree_id btree; + struct bpos pos; +}; + +static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) +{ + return (struct bbpos) { btree, pos }; +} + +#define BBPOS_MIN BBPOS(0, POS_MIN) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) + +static inline int bbpos_cmp(struct bbpos l, struct bbpos r) +{ + return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); +} + +static inline struct bbpos bbpos_successor(struct bbpos pos) +{ + if (bpos_cmp(pos.pos, SPOS_MAX)) { + pos.pos = bpos_successor(pos.pos); + return pos; + } + + if (pos.btree != BTREE_ID_NR) { + pos.btree++; + pos.pos = POS_MIN; + return pos; + } + + BUG(); +} + +static inline void bch2_bbpos_to_text(struct 
printbuf *out, struct bbpos pos) +{ + prt_str(out, bch2_btree_ids[pos.btree]); + prt_char(out, ':'); + bch2_bpos_to_text(out, pos.pos); +} + +#endif /* _BCACHEFS_BBPOS_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 0e9689f..33186fa 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -107,7 +107,7 @@ * * BTREE NODES: * - * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. * * (If buckets are really big we'll only use part of the bucket for a btree node @@ -212,6 +212,12 @@ #define dynamic_fault(...) 0 #define race_fault(...) 0 +#define trace_and_count(_c, _name, ...) \ +do { \ + this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + trace_##_name(__VA_ARGS__); \ +} while (0) + #define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ @@ -220,9 +226,11 @@ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ -#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) +#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else +#define bch2_log_msg(_c, fmt) fmt #define bch2_fmt(_c, fmt) fmt "\n" #define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif @@ -329,9 +337,6 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ - x(btree_lock_contended_read) \ - x(btree_lock_contended_intent) \ - x(btree_lock_contended_write) \ x(data_write) \ x(data_read) \ x(data_promote) \ @@ -391,6 +396,10 @@ enum gc_phase { GC_PHASE_BTREE_reflink, GC_PHASE_BTREE_subvolumes, GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + 
GC_PHASE_BTREE_need_discard, + GC_PHASE_BTREE_backpointers, GC_PHASE_PENDING_DELETE, }; @@ -447,7 +456,7 @@ struct bch_dev { * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets[2]; + struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; @@ -459,34 +468,18 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - struct task_struct __rcu *alloc_thread; + u64 bucket_alloc_trans_early_cursor; - /* - * free: Buckets that are ready to be used - * - * free_inc: Incoming buckets - these are buckets that currently have - * cached data in them, and we can't reuse them until after we write - * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) - */ - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; unsigned nr_open_buckets; + unsigned nr_btree_reserve; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial_nr; - size_t fifo_last_bucket; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; - enum allocator_states allocator_state; - - alloc_heap alloc_heap; - atomic64_t rebalance_work; struct journal_device journal; @@ -507,16 +500,8 @@ struct bch_dev { enum { /* startup: */ - BCH_FS_INITIALIZED, - BCH_FS_ALLOC_READ_DONE, - BCH_FS_ALLOC_CLEAN, - BCH_FS_ALLOCATOR_RUNNING, - BCH_FS_ALLOCATOR_STOPPING, - BCH_FS_INITIAL_GC_DONE, - BCH_FS_INITIAL_GC_UNFIXED, - BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_FSCK_DONE, BCH_FS_STARTED, + BCH_FS_MAY_GO_RW, BCH_FS_RW, BCH_FS_WAS_RW, @@ -524,25 +509,39 @@ enum { BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, + BCH_FS_CLEAN_SHUTDOWN, + + /* fsck passes: */ + BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ + 
BCH_FS_CHECK_LRUS_DONE, + BCH_FS_CHECK_BACKPOINTERS_DONE, + BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, + + BCH_FS_HAVE_DELETED_SNAPSHOTS, /* errors: */ BCH_FS_ERROR, BCH_FS_TOPOLOGY_ERROR, BCH_FS_ERRORS_FIXED, BCH_FS_ERRORS_NOT_FIXED, - - /* misc: */ - BCH_FS_NEED_ANOTHER_GC, - BCH_FS_DELETED_NODES, - BCH_FS_REBUILD_REPLICAS, - BCH_FS_HOLD_BTREE_WRITES, }; struct btree_debug { unsigned id; - struct dentry *btree; - struct dentry *btree_format; - struct dentry *failed; +}; + +#define BCH_TRANSACTIONS_NR 128 + +struct btree_transaction_stats { + struct mutex lock; + struct time_stats lock_hold_times; + unsigned nr_max_paths; + unsigned max_mem; + char *max_paths_text; }; struct bch_fs_pcpu { @@ -560,17 +559,22 @@ struct journal_seq_blacklist_table { struct journal_keys { struct journal_key { + u64 journal_seq; + u32 journal_offset; enum btree_id btree_id:8; unsigned level:8; bool allocated; bool overwritten; struct bkey_i *k; - u32 journal_seq; - u32 journal_offset; } *d; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). 
+ */ + size_t gap; size_t nr; size_t size; - u64 journal_seq_base; }; struct btree_path_buf { @@ -599,6 +603,7 @@ struct bch_fs { struct list_head list; struct kobject kobj; + struct kobject counters_kobj; struct kobject internal; struct kobject opts_dir; struct kobject time_stats; @@ -670,7 +675,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; - struct snapshot_id_list snapshots_unlinked; + snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; /* BTREE CACHE */ @@ -778,6 +783,8 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; + struct work_struct invalidate_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; @@ -807,7 +814,6 @@ struct bch_fs { struct mutex gc_gens_lock; /* IO PATH */ - struct semaphore io_in_flight; struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; @@ -836,6 +842,8 @@ struct bch_fs { copygc_heap copygc_heap; struct write_point copygc_write_point; s64 copygc_wait; + bool copygc_running; + wait_queue_head_t copygc_running_wq; /* DATA PROGRESS STATS */ struct list_head data_progress_list; @@ -887,7 +895,8 @@ struct bch_fs { struct bch_memquota_type quotas[QTYP_NR]; /* DEBUG JUNK */ - struct dentry *debug; + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; @@ -905,22 +914,23 @@ struct bch_fs { mempool_t btree_bounce_pool; struct journal journal; - struct list_head journal_entries; + GENRADIX(struct journal_replay *) journal_entries; + u64 journal_entries_base_seq; struct journal_keys journal_keys; struct list_head journal_iters; u64 last_bucket_seq_cleanup; - /* The rest of this all shows up in sysfs */ - atomic_long_t read_realloc_races; - atomic_long_t 
extent_migrate_done; - atomic_long_t extent_migrate_raced; + u64 counters_on_mount[BCH_COUNTER_NR]; + u64 __percpu *counters; unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; bool promote_whole_extents; struct time_stats times[BCH_TIME_STAT_NR]; + + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; }; static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 5153f0e..bfcb75a 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -78,6 +78,21 @@ #include #include "vstructs.h" +#define BITMASK(name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (k->field >> offset) & ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << (end - offset)) << offset); \ + k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ +} + #define LE_BITMASK(_bits, name, type, field, offset, end) \ static const unsigned name##_OFFSET = offset; \ static const unsigned name##_BITS = (end - offset); \ @@ -321,7 +336,7 @@ static inline void bkey_init(struct bkey *k) * number. 
* * - WHITEOUT: for hash table btrees -*/ + */ #define BCH_BKEY_TYPES() \ x(deleted, 0) \ x(whiteout, 1) \ @@ -347,7 +362,12 @@ static inline void bkey_init(struct bkey *k) x(subvolume, 21) \ x(snapshot, 22) \ x(inode_v2, 23) \ - x(alloc_v3, 24) + x(alloc_v3, 24) \ + x(set, 25) \ + x(lru, 26) \ + x(alloc_v4, 27) \ + x(backpointer, 28) \ + x(inode_v3, 29) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -377,6 +397,10 @@ struct bch_hash_whiteout { struct bch_val v; }; +struct bch_set { + struct bch_val v; +}; + /* Extents */ /* @@ -617,8 +641,8 @@ union bch_extent_entry { struct bch_btree_ptr { struct bch_val v; - struct bch_extent_ptr start[0]; __u64 _data[0]; + struct bch_extent_ptr start[]; } __attribute__((packed, aligned(8))); struct bch_btree_ptr_v2 { @@ -629,8 +653,8 @@ struct bch_btree_ptr_v2 { __le16 sectors_written; __le16 flags; struct bpos min_key; - struct bch_extent_ptr start[0]; __u64 _data[0]; + struct bch_extent_ptr start[]; } __attribute__((packed, aligned(8))); LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); @@ -638,8 +662,8 @@ LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); struct bch_extent { struct bch_val v; - union bch_extent_entry start[0]; __u64 _data[0]; + union bch_extent_entry start[]; } __attribute__((packed, aligned(8))); struct bch_reservation { @@ -694,6 +718,21 @@ struct bch_inode_v2 { __u8 fields[0]; } __attribute__((packed, aligned(8))); +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[0]; +} __attribute__((packed, aligned(8))); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) + struct bch_inode_generation { struct bch_val v; @@ -705,7 +744,7 @@ struct bch_inode_generation { * bi_subvol and bi_parent_subvol are only set for subvolume roots: */ 
-#define BCH_INODE_FIELDS() \ +#define BCH_INODE_FIELDS_v2() \ x(bi_atime, 96) \ x(bi_ctime, 96) \ x(bi_mtime, 96) \ @@ -732,6 +771,31 @@ struct bch_inode_generation { x(bi_subvol, 32) \ x(bi_parent_subvol, 32) +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ x(data_checksum, 8) \ @@ -757,16 +821,16 @@ enum { * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL * flags) */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, + __BCH_INODE_SYNC = 0, + __BCH_INODE_IMMUTABLE = 1, + __BCH_INODE_APPEND = 2, + __BCH_INODE_NODUMP = 3, + __BCH_INODE_NOATIME = 4, - __BCH_INODE_I_SIZE_DIRTY= 5, - __BCH_INODE_I_SECTORS_DIRTY= 6, - __BCH_INODE_UNLINKED = 7, - __BCH_INODE_BACKPTR_UNTRUSTED = 8, + __BCH_INODE_I_SIZE_DIRTY = 5, + __BCH_INODE_I_SECTORS_DIRTY = 6, + __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, /* bits 20+ reserved for packed fields below: */ }; @@ -788,6 +852,13 @@ LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct 
bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + /* Dirents */ /* @@ -825,10 +896,9 @@ struct bch_dirent { #define DT_SUBVOL 16 #define BCH_DT_MAX 17 -#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ +#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ - offsetof(struct bch_dirent, d_name)) - + offsetof(struct bch_dirent, d_name))) /* Xattrs */ @@ -865,6 +935,12 @@ struct bch_alloc { x(stripe, 32) \ x(stripe_redundancy, 8) +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + struct bch_alloc_v2 { struct bch_val v; __u8 nr_fields; @@ -877,8 +953,8 @@ struct bch_alloc_v2 { #define BCH_ALLOC_FIELDS_V2() \ x(read_time, 64) \ x(write_time, 64) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ x(stripe, 32) \ x(stripe_redundancy, 8) @@ -893,12 +969,43 @@ struct bch_alloc_v3 { __u8 data[]; } __attribute__((packed, aligned(8))); -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x - BCH_ALLOC_FIELD_NR -}; +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; +} __attribute__((packed, aligned(8))); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + 
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 + +struct bch_backpointer { + struct bch_val v; + __u8 btree_id; + __u8 level; + __u8 data_type; + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; +} __attribute__((packed, aligned(8))); /* Quotas: */ @@ -938,7 +1045,7 @@ struct bch_stripe { __u8 csum_type; __u8 pad; - struct bch_extent_ptr ptrs[0]; + struct bch_extent_ptr ptrs[]; } __attribute__((packed, aligned(8))); /* Reflink: */ @@ -1015,6 +1122,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +/* LRU btree: */ + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __attribute__((packed, aligned(8))); + +#define LRU_ID_STRIPES (1U << 16) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1023,16 +1139,18 @@ struct bch_sb_field { __le32 type; }; -#define BCH_SB_FIELDS() \ - x(journal, 0) \ - x(members, 1) \ - x(crypt, 2) \ - x(replicas_v0, 3) \ - x(quota, 4) \ - x(disk_groups, 5) \ - x(clean, 6) \ - x(replicas, 7) \ - x(journal_seq_blacklist, 8) +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) \ + x(counters, 10) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1041,6 +1159,14 @@ enum bch_sb_field_type { BCH_SB_FIELD_NR }; +/* + * Most superblock fields are replicated in all device's superblocks - a few are + * not: + */ +#define BCH_SINGLE_DEVICE_SB_FIELDS \ + ((1U << BCH_SB_FIELD_journal)| \ + (1U << BCH_SB_FIELD_journal_v2)) + /* BCH_SB_FIELD_journal: */ struct bch_sb_field_journal { @@ -1048,6 +1174,15 @@ struct bch_sb_field_journal { __le64 buckets[0]; }; +struct bch_sb_field_journal_v2 { + struct bch_sb_field field; + + struct bch_sb_field_journal_v2_entry { + __le64 start; + 
__le64 nr; + } d[0]; +}; + /* BCH_SB_FIELD_members: */ #define BCH_MIN_NR_NBUCKETS (1 << 6) @@ -1069,6 +1204,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags[0], 30, 31) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -1144,13 +1281,16 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); /* BCH_SB_FIELD_replicas: */ #define BCH_DATA_TYPES() \ - x(none, 0) \ + x(free, 0) \ x(sb, 1) \ x(journal, 2) \ x(btree, 3) \ x(user, 4) \ x(cached, 5) \ - x(parity, 6) + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) enum bch_data_type { #define x(t, n) BCH_DATA_##t, @@ -1159,22 +1299,45 @@ enum bch_data_type { BCH_DATA_NR }; +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[0]; + __u8 devs[]; } __attribute__((packed)); struct bch_sb_field_replicas_v0 { struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[0]; + struct bch_replicas_entry_v0 entries[]; } __attribute__((packed, aligned(8))); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[0]; + __u8 devs[]; } __attribute__((packed)); #define replicas_entry_bytes(_i) \ @@ -1220,6 +1383,97 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[0]; } 
__attribute__((packed, aligned(8))); +/* BCH_SB_FIELD_counters */ + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_race, 38) \ + x(move_extent_alloc_mem_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + 
x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[0]; +}; + /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -1275,19 +1529,28 @@ struct bch_sb_field_journal_seq_blacklist { #define BCH_JSET_VERSION_OLD 2 #define BCH_BSET_VERSION_OLD 3 +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, 10) \ + x(inode_btree_change, 11) \ + x(snapshot, 12) \ + x(inode_backpointers, 13) \ + x(btree_ptr_sectors_written, 14) \ + x(snapshot_2, 15) \ + x(reflink_p_fix, 16) \ + x(subvol_dirent, 17) \ + x(inode_v2, 18) \ + x(freespace, 19) \ + x(alloc_v4, 20) \ + x(new_data_types, 21) \ + x(backpointers, 22) \ + x(inode_v3, 23) + enum bcachefs_metadata_version { - bcachefs_metadata_version_min = 9, - bcachefs_metadata_version_new_versioning = 10, - bcachefs_metadata_version_bkey_renumber = 10, - bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_snapshot = 12, - bcachefs_metadata_version_inode_backpointers = 13, - bcachefs_metadata_version_btree_ptr_sectors_written = 14, - bcachefs_metadata_version_snapshot_2 = 15, - bcachefs_metadata_version_reflink_p_fix = 16, - bcachefs_metadata_version_subvol_dirent = 17, - 
bcachefs_metadata_version_inode_v2 = 18, - bcachefs_metadata_version_max = 19, + bcachefs_metadata_version_min = 9, +#define x(t, n) bcachefs_metadata_version_##t = n, + BCH_METADATA_VERSIONS() +#undef x + bcachefs_metadata_version_max }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1427,6 +1690,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +/* Obsolete, always enabled: */ LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); /* @@ -1663,7 +1927,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(data_usage, 6) \ x(clock, 7) \ x(dev_usage, 8) \ - x(log, 9) + x(log, 9) \ + x(overwrite, 10) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1735,7 +2000,7 @@ struct jset_entry_dev_usage { __u32 pad; __le64 buckets_ec; - __le64 buckets_unavailable; + __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; } __attribute__((packed)); @@ -1804,7 +2069,11 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(stripes, 6) \ x(reflink, 7) \ x(subvolumes, 8) \ - x(snapshots, 9) + x(snapshots, 9) \ + x(lru, 10) \ + x(freespace, 11) \ + x(need_discard, 12) \ + x(backpointers, 13) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 930981a..b2edabf 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -285,13 +285,14 @@ struct bch_ioctl_dev_usage { __u32 bucket_size; __u64 nr_buckets; - __u64 available_buckets; - __u64 buckets[BCH_DATA_NR]; - __u64 sectors[BCH_DATA_NR]; + __u64 buckets_ec; - __u64 ec_buckets; - __u64 ec_sectors; + struct bch_ioctl_dev_usage_type { + __u64 buckets; + __u64 sectors; + __u64 
fragmented; + } d[BCH_DATA_NR]; }; /* diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 946dd27..f7e5d0c 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey.h" +#include "bkey_cmp.h" #include "bkey_methods.h" #include "bset.h" #include "util.h" @@ -19,33 +20,49 @@ const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, const struct bkey_packed *); -void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) +void bch2_bkey_packed_to_binary_text(struct printbuf *out, + const struct bkey_format *f, + const struct bkey_packed *k) { - unsigned bit = high_bit_offset, done = 0; + const u64 *p = high_word(f, k); + unsigned word_bits = 64 - high_bit_offset; + unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; + u64 v = *p & (~0ULL >> high_bit_offset); + + if (!nr_key_bits) { + prt_str(out, "(empty)"); + return; + } while (1) { - while (bit < 64) { - if (done && !(done % 8)) - *out++ = ' '; - *out++ = *p & (1ULL << (63 - bit)) ? 
'1' : '0'; - bit++; - done++; - if (done == nr_bits) { - *out++ = '\0'; - return; - } + unsigned next_key_bits = nr_key_bits; + + if (nr_key_bits < 64) { + v >>= 64 - nr_key_bits; + next_key_bits = 0; + } else { + next_key_bits -= 64; } + bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + + if (!next_key_bits) + break; + + prt_char(out, ' '); + p = next_word(p); - bit = 0; + v = *p; + word_bits = 64; + nr_key_bits = next_key_bits; } } #ifdef CONFIG_BCACHEFS_DEBUG static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) + const struct bkey *unpacked, + const struct bkey_format *format) { struct bkey tmp; @@ -57,22 +74,35 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, tmp = __bch2_bkey_unpack_key(format, packed); if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - char buf1[160], buf2[160]; - char buf3[160], buf4[160]; - - bch2_bkey_to_text(&PBUF(buf1), unpacked); - bch2_bkey_to_text(&PBUF(buf2), &tmp); - bch2_to_binary(buf3, (void *) unpacked, 80); - bch2_to_binary(buf4, high_word(format, packed), 80); + struct printbuf buf = PRINTBUF; - panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", + prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", format->key_u64s, format->bits_per_field[0], format->bits_per_field[1], format->bits_per_field[2], format->bits_per_field[3], - format->bits_per_field[4], - buf1, buf2, buf3, buf4); + format->bits_per_field[4]); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_to_text(&buf, unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_to_text(&buf, &tmp); + prt_newline(&buf); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct 
bkey_packed *) &tmp); + prt_newline(&buf); + + panic("%s", buf.buf); } } @@ -201,9 +231,10 @@ static bool bch2_bkey_transform_key(const struct bkey_format *out_f, { struct pack_state out_s = pack_state_init(out_f, out); struct unpack_state in_s = unpack_state_init(in_f, in); + u64 *w = out->_data; unsigned i; - out->_data[0] = 0; + *w = 0; for (i = 0; i < BKEY_NR_FIELDS; i++) if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) @@ -292,12 +323,13 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, const struct bkey_format *format) { struct pack_state state = pack_state_init(format, out); + u64 *w = out->_data; EBUG_ON((void *) in == (void *) out); EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); EBUG_ON(in->format != KEY_FORMAT_CURRENT); - out->_data[0] = 0; + *w = 0; #define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; bkey_fields() @@ -439,6 +471,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, { const struct bkey_format *f = &b->format; struct pack_state state = pack_state_init(f, out); + u64 *w = out->_data; #ifdef CONFIG_BCACHEFS_DEBUG struct bpos orig = in; #endif @@ -451,7 +484,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, * enough - we need to make sure to zero them out: */ for (i = 0; i < f->key_u64s; i++) - out->_data[i] = 0; + w[i] = 0; if (unlikely(in.snapshot < le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { @@ -731,50 +764,6 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) #ifdef CONFIG_X86_64 -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - long d0, d1, d2, d3; - int cmp; - - /* we shouldn't need asm for this, but gcc is being retarded: */ - - asm(".intel_syntax noprefix;" - "xor eax, eax;" - "xor edx, edx;" - "1:;" - "mov r8, [rdi];" - "mov r9, [rsi];" - "sub ecx, 64;" - "jl 2f;" - - "cmp r8, r9;" - "jnz 3f;" - - "lea rdi, [rdi - 8];" - "lea rsi, [rsi - 8];" - "jmp 
1b;" - - "2:;" - "not ecx;" - "shr r8, 1;" - "shr r9, 1;" - "shr r8, cl;" - "shr r9, cl;" - "cmp r8, r9;" - - "3:\n" - "seta al;" - "setb dl;" - "sub eax, edx;" - ".att_syntax prefix;" - : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) - : "0" (l), "1" (r), "3" (nr_key_bits) - : "r8", "r9", "cc", "memory"); - - return cmp; -} - #define I(_x) (*(out)++ = (_x)) #define I1(i0) I(i0) #define I2(i0, i1) (I1(i0), I(i1)) @@ -1005,40 +994,6 @@ int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) } #else -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - u64 l_v, r_v; - - if (!nr_key_bits) - return 0; - - /* for big endian, skip past header */ - nr_key_bits += high_bit_offset; - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (1) { - if (nr_key_bits < 64) { - l_v >>= 64 - nr_key_bits; - r_v >>= 64 - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= 64; - } - - if (!nr_key_bits || l_v != r_v) - break; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - } - - return cmp_int(l_v, r_v); -} #endif __pure @@ -1046,19 +1001,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, const struct bkey_packed *r, const struct btree *b) { - const struct bkey_format *f = &b->format; - int ret; - - EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - ret = __bkey_cmp_bits(high_word(f, l), - high_word(f, r), - b->nr_key_bits); - - EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), - bkey_unpack_pos(b, r))); - return ret; + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); } __pure __flatten @@ -1074,20 +1017,7 @@ int bch2_bkey_cmp_packed(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { - struct bkey unpacked; - - if (likely(bkey_packed(l) && bkey_packed(r))) - return __bch2_bkey_cmp_packed_format_checked(l, r, b); - - if 
(bkey_packed(l)) { - __bkey_unpack_key_format_checked(b, &unpacked, l); - l = (void*) &unpacked; - } else if (bkey_packed(r)) { - __bkey_unpack_key_format_checked(b, &unpacked, r); - r = (void*) &unpacked; - } - - return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + return bch2_bkey_cmp_packed_inlined(b, l, r); } __pure __flatten diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 7dee3d8..19b59ff 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -5,6 +5,7 @@ #include #include "bcachefs_format.h" +#include "btree_types.h" #include "util.h" #include "vstructs.h" @@ -12,7 +13,9 @@ #define HAVE_BCACHEFS_COMPILED_UNPACK 1 #endif -void bch2_to_binary(char *, const u64 *, unsigned); +void bch2_bkey_packed_to_binary_text(struct printbuf *, + const struct bkey_format *, + const struct bkey_packed *); /* bkey with split value, const */ struct bkey_s_c { @@ -42,12 +45,15 @@ static inline size_t bkey_val_bytes(const struct bkey *k) static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) { - k->u64s = BKEY_U64s + val_u64s; + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; } static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) { - k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); } #define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) @@ -129,8 +135,9 @@ int bkey_cmp_left_packed(const struct btree *b, } /* - * we prefer to pass bpos by ref, but it's often enough terribly convenient to - * pass it by by val... as much as I hate c++, const ref would be nice here: + * The compiler generates better code when we pass bpos by ref, but it's often + * enough terribly convenient to pass it by val... 
as much as I hate c++, const + * ref would be nice here: */ __pure __flatten static inline int bkey_cmp_left_packed_byval(const struct btree *b, @@ -351,6 +358,99 @@ void bch2_bkey_unpack(const struct btree *, struct bkey_i *, bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, const struct bkey_format *); +typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + +static inline void +__bkey_unpack_key_format_checked(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + bch2_expensive_debug_checks) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); + } + } else { + *dst = __bch2_bkey_unpack_key(&b->format, src); + } +} + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + + __bkey_unpack_key_format_checked(b, &dst, src); + return dst; +} + +static inline void __bkey_unpack_key(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (likely(bkey_packed(src))) + __bkey_unpack_key_format_checked(b, dst, src); + else + *dst = *packed_to_bkey_c(src); +} + +/** + * bkey_unpack_key -- unpack just the key, not the value + */ +static inline struct bkey bkey_unpack_key(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? 
bkey_unpack_key_format_checked(b, src) + : *packed_to_bkey_c(src); +} + +static inline struct bpos +bkey_unpack_pos_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + return bkey_unpack_key_format_checked(b, src).p; +#else + return __bkey_unpack_pos(&b->format, src); +#endif +} + +static inline struct bpos bkey_unpack_pos(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_pos_format_checked(b, src) + : packed_to_bkey_c(src)->p; +} + +/* Disassembled bkeys */ + +static inline struct bkey_s_c bkey_disassemble(struct btree *b, + const struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; +} + +/* non const version: */ +static inline struct bkey_s __bkey_disassemble(struct btree *b, + struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; +} + static inline u64 bkey_field_max(const struct bkey_format *f, enum bch_bkey_fields nr) { diff --git a/libbcachefs/bkey_buf.h b/libbcachefs/bkey_buf.h index 0d7c67a..a30c4ae 100644 --- a/libbcachefs/bkey_buf.h +++ b/libbcachefs/bkey_buf.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BKEY_BUF_H #include "bcachefs.h" +#include "bkey.h" struct bkey_buf { struct bkey_i *k; diff --git a/libbcachefs/bkey_cmp.h b/libbcachefs/bkey_cmp.h new file mode 100644 index 0000000..5f42a6e --- /dev/null +++ b/libbcachefs/bkey_cmp.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_CMP_H +#define _BCACHEFS_BKEY_CMP_H + +#include "bkey.h" + +#ifdef CONFIG_X86_64 +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + long d0, d1, d2, d3; + int cmp; + + /* we shouldn't need asm for this, but gcc is being retarded: */ + + asm(".intel_syntax noprefix;" + "xor eax, eax;" + "xor edx, edx;" + "1:;" + "mov r8, 
[rdi];" + "mov r9, [rsi];" + "sub ecx, 64;" + "jl 2f;" + + "cmp r8, r9;" + "jnz 3f;" + + "lea rdi, [rdi - 8];" + "lea rsi, [rsi - 8];" + "jmp 1b;" + + "2:;" + "not ecx;" + "shr r8, 1;" + "shr r9, 1;" + "shr r8, cl;" + "shr r9, cl;" + "cmp r8, r9;" + + "3:\n" + "seta al;" + "setb dl;" + "sub eax, edx;" + ".att_syntax prefix;" + : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) + : "0" (l), "1" (r), "3" (nr_key_bits) + : "r8", "r9", "cc", "memory"); + + return cmp; +} +#else +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + u64 l_v, r_v; + + if (!nr_key_bits) + return 0; + + /* for big endian, skip past header */ + nr_key_bits += high_bit_offset; + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (1) { + if (nr_key_bits < 64) { + l_v >>= 64 - nr_key_bits; + r_v >>= 64 - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= 64; + } + + if (!nr_key_bits || l_v != r_v) + break; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + } + + return cmp_int(l_v, r_v); +} +#endif + +static inline __pure __flatten +int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + int ret; + + EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + ret = __bkey_cmp_bits(high_word(f, l), + high_word(f, r), + b->nr_key_bits); + + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); + return ret; +} + +static inline __pure __flatten +int bch2_bkey_cmp_packed_inlined(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + struct bkey unpacked; + + if (likely(bkey_packed(l) && bkey_packed(r))) + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); + + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void *) 
&unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void *) &unpacked; + } + + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); +} + +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index e83aeb6..14d910a 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "backpointers.h" #include "bkey_methods.h" #include "btree_types.h" #include "alloc_background.h" @@ -9,6 +10,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "lru.h" #include "quota.h" #include "reflink.h" #include "subvolume.h" @@ -21,10 +23,10 @@ const char * const bch2_bkey_types[] = { NULL }; -static const char *deleted_key_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - return NULL; + return 0; } #define bch2_bkey_ops_deleted (struct bkey_ops) { \ @@ -35,25 +37,32 @@ static const char *deleted_key_invalid(const struct bch_fs *c, .key_invalid = deleted_key_invalid, \ } -static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (bkey_val_bytes(k.k)) - return "value size should be zero"; + if (bkey_val_bytes(k.k)) { + prt_printf(err, "incorrect value size (%zu != 0)", + bkey_val_bytes(k.k)); + return -EINVAL; + } - return NULL; + return 0; } #define bch2_bkey_ops_error (struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ } -static const char *key_type_cookie_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) - return "incorrect value 
size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_cookie)); + return -EINVAL; + } - return NULL; + return 0; } #define bch2_bkey_ops_cookie (struct bkey_ops) { \ @@ -64,10 +73,10 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = empty_val_key_invalid, \ } -static const char *key_type_inline_data_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - return NULL; + return 0; } static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, @@ -76,7 +85,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); unsigned datalen = bkey_inline_data_bytes(k.k); - pr_buf(out, "datalen %u: %*phN", + prt_printf(out, "datalen %u: %*phN", datalen, min(datalen, 32U), d.v->data); } @@ -85,18 +94,44 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } +static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + if (bkey_val_bytes(k.k)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_cookie)); + return -EINVAL; + } + + return 0; +} + +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +#define bch2_bkey_ops_set (struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ +} + const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x }; -const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) +int 
bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (k.k->type >= KEY_TYPE_MAX) - return "invalid type"; + if (k.k->type >= KEY_TYPE_MAX) { + prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); + return -EINVAL; + } - return bch2_bkey_ops[k.k->type].key_invalid(c, k); + return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); } static unsigned bch2_key_types_allowed[] = { @@ -114,6 +149,7 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| (1U << KEY_TYPE_inode_v2)| + (1U << KEY_TYPE_inode_v3)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = (1U << KEY_TYPE_deleted)| @@ -130,7 +166,8 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| (1U << KEY_TYPE_alloc_v2)| - (1U << KEY_TYPE_alloc_v3), + (1U << KEY_TYPE_alloc_v3)| + (1U << KEY_TYPE_alloc_v4), [BKEY_TYPE_quotas] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), @@ -147,112 +184,145 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_snapshots] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_lru] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_lru), + [BKEY_TYPE_freespace] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_need_discard] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_backpointers] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_backpointer), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| (1U << KEY_TYPE_btree_ptr_v2), }; -const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type) +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + int rw, struct printbuf *err) { - if (k.k->u64s < BKEY_U64s) - return "u64s too small"; - - if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) - return "invalid key type for this btree"; + if (k.k->u64s < BKEY_U64s) { + 
prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); + return -EINVAL; + } - if (type == BKEY_TYPE_btree && - bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[type]); + return -EINVAL; + } if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - if (k.k->size == 0) - return "bad size field"; + if (k.k->size == 0) { + prt_printf(err, "size == 0"); + return -EINVAL; + } - if (k.k->size > k.k->p.offset) - return "size greater than offset"; + if (k.k->size > k.k->p.offset) { + prt_printf(err, "size greater than offset (%u > %llu)", + k.k->size, k.k->p.offset); + return -EINVAL; + } } else { - if (k.k->size) - return "nonzero size field"; + if (k.k->size) { + prt_printf(err, "size != 0"); + return -EINVAL; + } } if (type != BKEY_TYPE_btree && !btree_type_has_snapshots(type) && - k.k->p.snapshot) - return "nonzero snapshot"; + k.k->p.snapshot) { + prt_printf(err, "nonzero snapshot"); + return -EINVAL; + } if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && - !k.k->p.snapshot) - return "invalid snapshot field"; + !k.k->p.snapshot) { + prt_printf(err, "snapshot == 0"); + return -EINVAL; + } if (type != BKEY_TYPE_btree && - !bkey_cmp(k.k->p, POS_MAX)) - return "POS_MAX key"; + !bkey_cmp(k.k->p, POS_MAX)) { + prt_printf(err, "key at POS_MAX"); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type) +int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + int rw, struct printbuf *err) { - return __bch2_bkey_invalid(c, k, type) ?: - bch2_bkey_val_invalid(c, k); + return __bch2_bkey_invalid(c, k, type, rw, err) ?: + bch2_bkey_val_invalid(c, k, rw, err); } -const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) +int 
bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, + struct printbuf *err) { - if (bpos_cmp(k.k->p, b->data->min_key) < 0) - return "key before start of btree node"; + if (bpos_cmp(k.k->p, b->data->min_key) < 0) { + prt_printf(err, "key before start of btree node"); + return -EINVAL; + } - if (bpos_cmp(k.k->p, b->data->max_key) > 0) - return "key past end of btree node"; + if (bpos_cmp(k.k->p, b->data->max_key) > 0) { + prt_printf(err, "key past end of btree node"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { if (!bpos_cmp(pos, POS_MIN)) - pr_buf(out, "POS_MIN"); + prt_printf(out, "POS_MIN"); else if (!bpos_cmp(pos, POS_MAX)) - pr_buf(out, "POS_MAX"); + prt_printf(out, "POS_MAX"); else if (!bpos_cmp(pos, SPOS_MAX)) - pr_buf(out, "SPOS_MAX"); + prt_printf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) - pr_buf(out, "U64_MAX"); + prt_printf(out, "U64_MAX"); else - pr_buf(out, "%llu", pos.inode); - pr_buf(out, ":"); + prt_printf(out, "%llu", pos.inode); + prt_printf(out, ":"); if (pos.offset == U64_MAX) - pr_buf(out, "U64_MAX"); + prt_printf(out, "U64_MAX"); else - pr_buf(out, "%llu", pos.offset); - pr_buf(out, ":"); + prt_printf(out, "%llu", pos.offset); + prt_printf(out, ":"); if (pos.snapshot == U32_MAX) - pr_buf(out, "U32_MAX"); + prt_printf(out, "U32_MAX"); else - pr_buf(out, "%u", pos.snapshot); + prt_printf(out, "%u", pos.snapshot); } } void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { if (k) { - pr_buf(out, "u64s %u type ", k->u64s); + prt_printf(out, "u64s %u type ", k->u64s); if (k->type < KEY_TYPE_MAX) - pr_buf(out, "%s ", bch2_bkey_types[k->type]); + prt_printf(out, "%s ", bch2_bkey_types[k->type]); else - pr_buf(out, "%u ", k->type); + prt_printf(out, "%u ", k->type); bch2_bpos_to_text(out, k->p); - pr_buf(out, " len %u ver %llu", k->size, k->version.lo); + prt_printf(out, " len %u ver %llu", k->size, k->version.lo); } else { - pr_buf(out, "(null)"); + 
prt_printf(out, "(null)"); } } @@ -265,7 +335,7 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, if (likely(ops->val_to_text)) ops->val_to_text(out, c, k); } else { - pr_buf(out, "(invalid type %u)", k.k->type); + prt_printf(out, "(invalid type %u)", k.k->type); } } @@ -275,7 +345,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_to_text(out, k.k); if (bkey_val_bytes(k.k)) { - pr_buf(out, ": "); + prt_printf(out, ": "); bch2_val_to_text(out, c, k); } } diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 4fdac54..db894b4 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -6,20 +6,31 @@ struct bch_fs; struct btree; +struct btree_trans; struct bkey; enum btree_node_type; extern const char * const bch2_bkey_types[]; +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * invalid, entire key will be deleted. + * + * When invalid, error string is returned via @err. @rw indicates whether key is + * being read or written; more aggressive checks can be enabled when rw == WRITE. 
+*/ struct bkey_ops { - /* Returns reason for being invalid if invalid, else NULL: */ - const char * (*key_invalid)(const struct bch_fs *, - struct bkey_s_c); + int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); + int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -27,12 +38,12 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; -const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); -const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type); -const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type); -const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type, int, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type, int, struct printbuf *); +int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); @@ -57,6 +68,92 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +static inline int bch2_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) +{ + 
const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; + + return ops->atomic_trigger + ? ops->atomic_trigger(trans, old, new, flags) + : 0; +} + +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + +#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) + +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; + + return ops->trans_trigger + ? 
ops->trans_trigger(trans, btree_id, level, old, new, flags) + : 0; +} + +static inline int bch2_trans_mark_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index b1385a7..8518054 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_buf.h" +#include "bkey_cmp.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -155,7 +156,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bch2_bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed_inlined(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 6000a87..0942353 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -70,7 +70,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, struct bkey_packed *_k, *_n; struct bkey uk, n; struct bkey_s_c k; - char buf[200]; + struct printbuf buf = PRINTBUF; if (!i->u64s) return; @@ -81,12 +81,14 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, _n = bkey_next(_k); k = 
bkey_disassemble(b, _k, &uk); + + printbuf_reset(&buf); if (c) - bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_bkey_val_to_text(&buf, c, k); else - bch2_bkey_to_text(&PBUF(buf), k.k); + bch2_bkey_to_text(&buf, k.k); printk(KERN_ERR "block %u key %5zu: %s\n", set, - _k->_data - i->_data, buf); + _k->_data - i->_data, buf.buf); if (_n == vstruct_last(i)) continue; @@ -102,6 +104,8 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } + + printbuf_exit(&buf); } void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) @@ -118,6 +122,7 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_node_iter *iter) { struct btree_node_iter_set *set; + struct printbuf buf = PRINTBUF; printk(KERN_ERR "btree node iter with %u/%u sets:\n", __btree_node_iter_used(iter), b->nsets); @@ -126,12 +131,14 @@ void bch2_dump_btree_node_iter(struct btree *b, struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); struct bset_tree *t = bch2_bkey_to_bset(b, k); struct bkey uk = bkey_unpack_key(b, k); - char buf[100]; - bch2_bkey_to_text(&PBUF(buf), &uk); + printbuf_reset(&buf); + bch2_bkey_to_text(&buf, &uk); printk(KERN_ERR "set %zu key %u: %s\n", - t - b->set, set->k, buf); + t - b->set, set->k, buf.buf); } + + printbuf_exit(&buf); } #ifdef CONFIG_BCACHEFS_DEBUG @@ -167,13 +174,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); - char buf1[80], buf2[80]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &ku); - bch2_bkey_to_text(&PBUF(buf2), &nu); + bch2_bkey_to_text(&buf1, &ku); + bch2_bkey_to_text(&buf2, &nu); printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", - buf1, buf2); + buf1.buf, buf2.buf); printk(KERN_ERR "iter was:"); btree_node_iter_for_each(_iter, set) { @@ -238,6 
+246,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; #if 0 BUG_ON(prev && bkey_iter_cmp(b, prev, insert) > 0); @@ -246,17 +256,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bkey_iter_cmp(b, prev, insert) > 0) { struct bkey k1 = bkey_unpack_key(b, prev); struct bkey k2 = bkey_unpack_key(b, insert); - char buf1[100]; - char buf2[100]; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &k1); - bch2_bkey_to_text(&PBUF(buf2), &k2); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); panic("prev > insert:\n" "prev key %s\n" "insert key %s\n", - buf1, buf2); + buf1.buf, buf2.buf); } #endif #if 0 @@ -267,17 +275,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bkey_iter_cmp(b, insert, next) > 0) { struct bkey k1 = bkey_unpack_key(b, insert); struct bkey k2 = bkey_unpack_key(b, next); - char buf1[100]; - char buf2[100]; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &k1); - bch2_bkey_to_text(&PBUF(buf2), &k2); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); panic("insert > next:\n" "insert key %s\n" "next key %s\n", - buf1, buf2); + buf1.buf, buf2.buf); } #endif } @@ -959,7 +965,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, t->size -= j - l; for (j = l; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; + rw_aux_tree(b, t)[j].offset += shift; EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset == @@ -1260,7 +1266,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter, bch2_btree_node_iter_sort(iter, b); } -noinline __flatten __attribute__((cold)) +noinline __flatten __cold static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, 
struct btree *b, struct bpos *search) { @@ -1435,7 +1441,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); if (unlikely(__btree_node_iter_set_end(iter, 0))) { - bch2_btree_node_iter_set_drop(iter, iter->data); + /* avoid an expensive memmove call: */ + iter->data[0] = iter->data[1]; + iter->data[1] = iter->data[2]; + iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; return; } @@ -1567,9 +1576,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, struct bkey uk; unsigned j, inorder; - if (out->pos != out->end) - *out->pos = '\0'; - if (!bset_has_ro_aux_tree(t)) return; @@ -1584,12 +1590,12 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, switch (bkey_float(b, t, j)->exponent) { case BFLOAT_FAILED: uk = bkey_unpack_key(b, k); - pr_buf(out, + prt_printf(out, " failed unpacked at depth %u\n" "\t", ilog2(j)); bch2_bpos_to_text(out, uk.p); - pr_buf(out, "\n"); + prt_printf(out, "\n"); break; } } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 0d46534..72e6376 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -205,100 +205,6 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) return btree_aux_data_bytes(b) / sizeof(u64); } -typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); - -static inline void -__bkey_unpack_key_format_checked(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - { - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(dst, src); - - if (bch2_expensive_debug_checks) { - struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); - } - } -#else - *dst = __bch2_bkey_unpack_key(&b->format, src); -#endif -} - -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ - struct bkey dst; - - 
__bkey_unpack_key_format_checked(b, &dst, src); - return dst; -} - -static inline void __bkey_unpack_key(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (likely(bkey_packed(src))) - __bkey_unpack_key_format_checked(b, dst, src); - else - *dst = *packed_to_bkey_c(src); -} - -/** - * bkey_unpack_key -- unpack just the key, not the value - */ -static inline struct bkey bkey_unpack_key(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? bkey_unpack_key_format_checked(b, src) - : *packed_to_bkey_c(src); -} - -static inline struct bpos -bkey_unpack_pos_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - return bkey_unpack_key_format_checked(b, src).p; -#else - return __bkey_unpack_pos(&b->format, src); -#endif -} - -static inline struct bpos bkey_unpack_pos(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? 
bkey_unpack_pos_format_checked(b, src) - : packed_to_bkey_c(src)->p; -} - -/* Disassembled bkeys */ - -static inline struct bkey_s_c bkey_disassemble(struct btree *b, - const struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -} - -/* non const version: */ -static inline struct bkey_s __bkey_disassemble(struct btree *b, - struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -} - #define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 986d08d..8dd2db4 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -7,13 +7,25 @@ #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "errcode.h" #include "error.h" #include #include #include -struct lock_class_key bch2_btree_node_lock_key; +#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +do { \ + if (shrinker_counter) \ + bc->not_freed_##counter++; \ +} while (0) + +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() +#undef x + NULL +}; void bch2_recalc_btree_reserve(struct bch_fs *c) { @@ -35,6 +47,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) return max_t(int, 0, bc->used - bc->reserve); } +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) +{ + if (b->c.lock.readers) + list_move(&b->list, &bc->freed_pcpu); + else + list_move(&b->list, &bc->freed_nonpcpu); +} + static void btree_node_data_free(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; @@ -51,7 +71,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) b->aux_data = NULL; bc->used--; - list_move(&b->list, &bc->freed); + + btree_node_to_freedlist(bc, b); } static int bch2_btree_cache_cmp_fn(struct 
rhashtable_compare_arg *arg, @@ -95,14 +116,17 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) return 0; } -static struct btree *__btree_node_mem_alloc(struct bch_fs *c) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) { - struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); + struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) return NULL; bkey_btree_ptr_init(&b->key); __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_set_no_check_recursion(&b->c.lock.dep_map); +#endif INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); b->byte_order = ilog2(btree_bytes(c)); @@ -112,7 +136,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b = __btree_node_mem_alloc(c); + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -135,8 +159,6 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; - - six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -156,15 +178,10 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, b->c.level = level; b->c.btree_id = id; - if (level) - six_lock_pcpu_alloc(&b->c.lock); - else - six_lock_pcpu_free_rcu(&b->c.lock); - mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); if (!ret) - list_add(&b->list, &bc->live); + list_add_tail(&b->list, &bc->live); mutex_unlock(&bc->lock); return ret; @@ -183,7 +200,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +static 
int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -193,40 +210,64 @@ wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_dirty(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + else if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); return -ENOMEM; + } /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) + if (!six_trylock_intent(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); return -ENOMEM; + } - if (!six_trylock_write(&b->c.lock)) + if (!six_trylock_write(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); goto out_unlock_intent; + } /* recheck under lock */ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); goto out_unlock; + } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b)) + if (btree_node_noevict(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(noevict); goto out_unlock; - - if (!btree_node_may_write(b)) + } + if (btree_node_write_blocked(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); + goto out_unlock; + } + if (btree_node_will_make_reachable(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); goto out_unlock; + } if (btree_node_dirty(b)) { - if (!flush || - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + if (!flush) { + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); goto 
out_unlock; + } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -234,9 +275,9 @@ wait_on_io: * the post write cleanup: */ if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent); + bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); else - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, 0); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -244,7 +285,7 @@ wait_on_io: } out: if (b->hash_val && !ret) - trace_btree_node_reap(c, b); + trace_and_count(c, btree_cache_reap, c, b); return ret; out_unlock: six_unlock_write(&b->c.lock); @@ -254,14 +295,14 @@ out_unlock_intent: goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) { - return __btree_node_reclaim(c, b, false); + return __btree_node_reclaim(c, b, false, shrinker_counter); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true); + return __btree_node_reclaim(c, b, true, false); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -272,21 +313,18 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct btree_cache *bc = &c->btree_cache; struct btree *b, *t; unsigned long nr = sc->nr_to_scan; - unsigned long can_free; - unsigned long touched = 0; + unsigned long can_free = 0; unsigned long freed = 0; + unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; + bool trigger_writes = atomic_read(&bc->dirty) + nr >= + bc->used * 3 / 4; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - goto out_norestore; - + mutex_lock(&bc->lock); flags = memalloc_nofs_save(); /* @@ -296,7 +334,6 @@ static 
unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ - nr /= btree_pages(c); can_free = btree_cache_can_free(bc); nr = min_t(unsigned long, nr, can_free); @@ -312,61 +349,61 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, touched++; if (touched >= nr) - break; + goto out; - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, true)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; + bc->freed++; } } restart: list_for_each_entry_safe(b, t, &bc->live, list) { touched++; - if (touched >= nr) { - /* Save position */ - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); - break; - } - - if (!btree_node_accessed(b) && - !btree_node_reclaim(c, b)) { - /* can't call bch2_btree_node_hash_remove under lock */ + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); + bc->not_freed_access_bit++; + } else if (!btree_node_reclaim(c, b, true)) { freed++; - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); - btree_node_data_free(c, b); - mutex_unlock(&bc->lock); + bc->freed++; bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - if (freed >= nr) - goto out; - - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - goto out; + if (freed == nr) + goto out_rotate; + } else if (trigger_writes && + btree_node_dirty(b) && + !btree_node_will_make_reachable(b) && + !btree_node_write_blocked(b) && + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); + __bch2_btree_node_write(c, b, 0); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; + mutex_lock(&bc->lock); goto restart; - } else - clear_btree_node_accessed(b); - } + } - mutex_unlock(&bc->lock); + if (touched >= nr) + break; + } +out_rotate: + if 
(&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); out: - ret = (unsigned long) freed * btree_pages(c); + mutex_unlock(&bc->lock); +out_nounlock: + ret = freed; memalloc_nofs_restore(flags); -out_norestore: - trace_btree_cache_scan(sc->nr_to_scan, - sc->nr_to_scan / btree_pages(c), - btree_cache_can_free(bc), - ret); + trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); return ret; } @@ -380,7 +417,15 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, if (bch2_btree_shrinker_disabled) return 0; - return btree_cache_can_free(bc) * btree_pages(c); + return btree_cache_can_free(bc); +} + +static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + + bch2_btree_cache_to_text(out, &c->btree_cache); } void bch2_fs_btree_cache_exit(struct bch_fs *c) @@ -415,15 +460,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(c, b); + clear_btree_node_dirty_acct(c, b); btree_node_data_free(c, b); } BUG_ON(atomic_read(&c->btree_cache.dirty)); - while (!list_empty(&bc->freed)) { - b = list_first_entry(&bc->freed, struct btree, list); + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + while (!list_empty(&bc->freed_nonpcpu)) { + b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); six_lock_pcpu_free(&b->c.lock); kfree(b); @@ -464,9 +511,9 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.count_objects = bch2_btree_cache_count; bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; bc->shrink.seeks = 4; - bc->shrink.batch = btree_pages(c) * 2; - ret = register_shrinker(&bc->shrink); + ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); out: pr_verbose_init(c->opts, "ret %i", ret); return ret; @@ -477,7 
+524,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) mutex_init(&bc->lock); INIT_LIST_HEAD(&bc->live); INIT_LIST_HEAD(&bc->freeable); - INIT_LIST_HEAD(&bc->freed); + INIT_LIST_HEAD(&bc->freed_pcpu); + INIT_LIST_HEAD(&bc->freed_nonpcpu); } /* @@ -491,7 +539,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_btree_node_cannibalize_unlock(c); + trace_and_count(c, btree_cache_cannibalize_unlock, c); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } @@ -507,7 +555,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_btree_node_cannibalize_lock_fail(c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); return -ENOMEM; } @@ -521,11 +569,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_btree_node_cannibalize_lock_fail(c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); return -EAGAIN; success: - trace_btree_node_cannibalize_lock(c); + trace_and_count(c, btree_cache_cannibalize_lock, c); return 0; } @@ -535,7 +583,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b)) + if (!btree_node_reclaim(c, b, false)) return b; while (1) { @@ -552,55 +600,68 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) } } -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) { struct btree_cache *bc = &c->btree_cache; - struct btree *b; + struct list_head *freed = pcpu_read_locks + ? 
&bc->freed_pcpu + : &bc->freed_nonpcpu; + struct btree *b, *b2; u64 start_time = local_clock(); unsigned flags; flags = memalloc_nofs_save(); mutex_lock(&bc->lock); - /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. Check if there's any freed nodes there: - */ - list_for_each_entry(b, &bc->freeable, list) - if (!btree_node_reclaim(c, b)) - goto got_node; - /* * We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ - list_for_each_entry(b, &bc->freed, list) - if (!btree_node_reclaim(c, b)) + list_for_each_entry(b, freed, list) + if (!btree_node_reclaim(c, b, false)) { + list_del_init(&b->list); goto got_node; + } - b = NULL; -got_node: - if (b) - list_del_init(&b->list); - mutex_unlock(&bc->lock); - + b = __btree_node_mem_alloc(c, __GFP_NOWARN); if (!b) { - b = __btree_node_mem_alloc(c); + mutex_unlock(&bc->lock); + b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; - - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); + mutex_lock(&bc->lock); } - if (!b->data) { - if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) - goto err; + if (pcpu_read_locks) + six_lock_pcpu_alloc(&b->c.lock); - mutex_lock(&bc->lock); - bc->used++; - mutex_unlock(&bc->lock); - } + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); +got_node: + + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. 
Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) + if (!btree_node_reclaim(c, b2, false)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + goto got_mem; + } + + mutex_unlock(&bc->lock); + + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; + + mutex_lock(&bc->lock); + bc->used++; +got_mem: + mutex_unlock(&bc->lock); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); @@ -623,21 +684,25 @@ out: err: mutex_lock(&bc->lock); - if (b) { - list_add(&b->list, &bc->freed); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - } - /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { - b = btree_node_cannibalize(c); - list_del_init(&b->list); - mutex_unlock(&bc->lock); + b2 = btree_node_cannibalize(c); + bch2_btree_node_hash_remove(bc, b2); + + if (b) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + } else { + b = b2; + list_del_init(&b->list); + } - bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); - trace_btree_node_cannibalize(c); + trace_and_count(c, btree_cache_cannibalize, c); goto out; } @@ -666,13 +731,18 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_trans_restart_relock_parent_for_fill(trans->fn, - _THIS_IP_, btree_id, &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); + } + + b = bch2_btree_node_mem_alloc(c, level != 0); + + if (trans && b == ERR_PTR(-ENOMEM)) { + trans->memory_allocation_failure = true; + 
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); } - b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) return b; @@ -707,52 +777,49 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (!sync) return NULL; - if (trans && - (!bch2_trans_relock(trans) || - !bch2_btree_path_relock_intent(trans, path))) { - BUG_ON(!trans->restarted); - return ERR_PTR(-EINTR); + if (trans) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, - btree_id, &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + if (trans) + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } return b; } -static int lock_node_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - const struct bkey_i *k = p; - - return b->hash_val == btree_ptr_hash_val(k) ? 
0 : -1; -} - static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { - char buf1[200], buf2[100], buf3[100]; + struct printbuf buf = PRINTBUF; if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); - bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); - bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); - - bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: %s\n" - "header: btree %s level %llu\n" - "min %s max %s\n", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - bch2_btree_ids[BTREE_NODE_ID(b->data)], - BTREE_NODE_LEVEL(b->data), - buf2, buf3); + prt_printf(&buf, + "btree node header doesn't match ptr\n" + "btree %s level %u\n" + "ptr: ", + bch2_btree_ids[b->c.btree_id], b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_printf(&buf, "\nheader: btree %s level %llu\n" + "min ", + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data)); + bch2_bpos_to_text(&buf, b->data->min_key); + + prt_printf(&buf, "\nmax "); + bch2_bpos_to_text(&buf, b->data->max_key); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) @@ -784,6 +851,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; + int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -797,7 +865,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (likely(c->opts.btree_node_mem_ptr_optimization && b && b->hash_val == btree_ptr_hash_val(k))) - goto lock_node; + goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -846,14 +914,13 @@ lock_node: * was removed - and we'll bail out: */ if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(path, level + 1); + btree_node_unlock(trans, path, 
level + 1); - if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, - lock_node_check_fn, (void *) k, trace_ip)) { - if (!trans->restarted) - goto retry; - return ERR_PTR(-EINTR); - } + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.level != level || @@ -862,12 +929,8 @@ lock_node: if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans->fn, - trace_ip, - path->btree_id, - &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } } @@ -883,11 +946,13 @@ lock_node: * should_be_locked is not set on this path yet, so we need to * relock it specifically: */ - if (trans && - (!bch2_trans_relock(trans) || - !bch2_btree_path_relock_intent(trans, path))) { - BUG_ON(!trans->restarted); - return ERR_PTR(-EINTR); + if (trans) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } } if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -920,12 +985,13 @@ lock_node: return b; } -struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, const struct bkey_i *k, enum btree_id btree_id, unsigned level, bool nofill) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; @@ -959,9 +1025,11 @@ retry: goto out; } else { lock_node: - ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); - if (ret) - goto retry; + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + 
return ERR_PTR(ret); + + BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || @@ -1023,8 +1091,9 @@ int bch2_btree_node_prefetch(struct bch_fs *c, return PTR_ERR_OR_ZERO(b); } -void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -1036,15 +1105,15 @@ wait_on_io: /* XXX we're called from btree_gc which will be holding other btree * nodes locked - * */ + */ __bch2_btree_node_wait_on_read(b); __bch2_btree_node_wait_on_write(b); - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, 0); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; @@ -1071,15 +1140,15 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, bch2_btree_keys_stats(b, &stats); - pr_buf(out, "l %u ", b->c.level); + prt_printf(out, "l %u ", b->c.level); bch2_bpos_to_text(out, b->data->min_key); - pr_buf(out, " - "); + prt_printf(out, " - "); bch2_bpos_to_text(out, b->data->max_key); - pr_buf(out, ":\n" + prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - pr_buf(out, "\n" + prt_printf(out, "\n" " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" @@ -1107,9 +1176,21 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc) { - pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - pr_buf(out, "nr dirty:\t\t%u\n", 
atomic_read(&c->btree_cache.dirty)); - pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + prt_printf(out, "nr nodes:\t\t%u\n", bc->used); + prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + + prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); + prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); + prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); + prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); + prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); + prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); + prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); + prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); + prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); + prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); + } diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index f7e1098..b623c70 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -4,8 +4,9 @@ #include "bcachefs.h" #include "btree_types.h" +#include "bkey_methods.h" -extern struct lock_class_key bch2_btree_node_lock_key; +extern const char * const bch2_btree_node_flags[]; struct btree_iter; @@ -20,19 +21,19 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, enum six_lock_type, unsigned long); -struct btree *bch2_btree_node_get_noiter(struct 
bch_fs *, const struct bkey_i *, +struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, enum btree_id, unsigned, bool); int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, const struct bkey_i *, enum btree_id, unsigned); -void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); +void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); @@ -100,6 +101,6 @@ static inline unsigned btree_blocks(struct bch_fs *c) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); +void bch2_btree_cache_to_text(struct printbuf *, struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 648779c..801a09f 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -70,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c, struct bpos expected_start = bkey_deleted(&prev->k->k) ? 
node_start : bpos_successor(prev->k->k.p); - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (bkey_deleted(&prev->k->k)) { - struct printbuf out = PBUF(buf1); - pr_buf(&out, "start of node: "); - bch2_bpos_to_text(&out, node_start); - } else { - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); - } - if (bpos_cmp(expected_start, bp->v.min_key)) { bch2_topology_error(c); + if (bkey_deleted(&prev->k->k)) { + prt_printf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, node_start); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); + } + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); + if (__fsck_err(c, FSCK_CAN_FIX| FSCK_CAN_IGNORE| @@ -95,11 +95,11 @@ static int bch2_gc_check_topology(struct bch_fs *c, " prev %s\n" " cur %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && + buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - return FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = -BCH_ERR_need_topology_repair; + goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); } @@ -109,6 +109,12 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (is_last && bpos_cmp(cur.k->k.p, node_end)) { bch2_topology_error(c); + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); + bch2_bpos_to_text(&buf2, node_end); + if (__fsck_err(c, FSCK_CAN_FIX| FSCK_CAN_IGNORE| @@ -117,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c, " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) && + buf1.buf, buf2.buf) && 
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - return FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = -BCH_ERR_need_topology_repair; + goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); } } bch2_bkey_buf_copy(prev, c, cur.k); +err: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -156,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } -static void bch2_btree_node_update_key_early(struct bch_fs *c, +static void bch2_btree_node_update_key_early(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_i *new) { + struct bch_fs *c = trans->c; struct btree *b; struct bkey_buf tmp; int ret; @@ -167,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c, bch2_bkey_buf_init(&tmp); bch2_bkey_buf_reassemble(&tmp, c, old); - b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); @@ -205,7 +215,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) } bch2_btree_node_drop_keys_outside_node(b); - + bkey_copy(&b->key, &new->k_i); return 0; } @@ -251,18 +261,17 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, struct bpos expected_start = !prev ? 
b->data->min_key : bpos_successor(prev->key.k.p); - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; if (!prev) { - struct printbuf out = PBUF(buf1); - pr_buf(&out, "start of node: "); - bch2_bpos_to_text(&out, b->data->min_key); + prt_printf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, b->data->min_key); } else { - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); if (prev && bpos_cmp(expected_start, cur->data->min_key) > 0 && @@ -275,8 +284,10 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s\n" " next %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - return DROP_PREV_NODE; + buf1.buf, buf2.buf)) { + ret = DROP_PREV_NODE; + goto out; + } if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, @@ -284,7 +295,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s\n" " next %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) + buf1.buf, buf2.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); } else { @@ -296,50 +307,61 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " prev %s\n" " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - return DROP_THIS_NODE; + buf1.buf, buf2.buf)) { + ret = DROP_THIS_NODE; + goto out; + } if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - ret = set_node_min(c, cur, expected_start); + buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); } +out: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); 
return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, struct btree *child) { - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); + bch2_bpos_to_text(&buf2, b->key.k.p); + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { + buf1.buf, buf2.buf)) { ret = set_node_max(c, child, b->key.k.p); if (ret) - return ret; + goto err; } +err: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } -static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, dropped_children = false; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) @@ -358,28 +380,32 @@ again: bch2_btree_and_journal_iter_advance(&iter); bch2_bkey_buf_reassemble(&cur_k, c, k); - cur = bch2_btree_node_get_noiter(c, cur_k.k, + cur = bch2_btree_node_get_noiter(trans, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + if (mustfix_fsck_err_on(ret == -EIO, c, - "Unreadable btree node at btree %s level %u:\n" + "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { - bch2_btree_node_evict(c, cur_k.k); + buf.buf)) { + bch2_btree_node_evict(trans, 
cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); + cur = NULL; if (ret) break; continue; } if (ret) { - bch_err(c, "%s: error %i getting btree node", - __func__, ret); + bch_err(c, "%s: error getting btree node: %s", + __func__, bch2_err_str(ret)); break; } @@ -387,9 +413,10 @@ again: if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); + cur = NULL; if (ret) break; continue; @@ -400,7 +427,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { - bch2_btree_node_evict(c, prev_k.k); + bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); if (ret) @@ -440,23 +467,23 @@ again: bch2_bkey_buf_reassemble(&cur_k, c, k); bch2_btree_and_journal_iter_advance(&iter); - cur = bch2_btree_node_get_noiter(c, cur_k.k, + cur = bch2_btree_node_get_noiter(trans, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); if (ret) { - bch_err(c, "%s: error %i getting btree node", - __func__, ret); + bch_err(c, "%s: error getting btree node: %s", + __func__, bch2_err_str(ret)); goto err; } - ret = bch2_btree_repair_topology_recurse(c, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur); six_unlock_read(&cur->c.lock); cur = NULL; if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); dropped_children = true; @@ -468,12 +495,14 @@ again: have_child = true; } + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + if (mustfix_fsck_err_on(!have_child, c, "empty interior btree node at btree %s level %u\n" " %s", bch2_btree_ids[b->c.btree_id], - b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) + b->c.level, buf.buf)) ret = 
DROP_THIS_NODE; err: fsck_err: @@ -489,42 +518,49 @@ fsck_err: if (!ret && dropped_children) goto again; + printbuf_exit(&buf); return ret; } static int bch2_repair_topology(struct bch_fs *c) { + struct btree_trans trans; struct btree *b; unsigned i; int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < BTREE_ID_NR && !ret; i++) { b = c->btree_roots[i].b; if (btree_node_fake(b)) continue; - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_repair_topology_recurse(c, b); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(&trans, b); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { bch_err(c, "empty btree root - repair unimplemented"); - ret = FSCK_ERR_EXIT; + ret = -BCH_ERR_fsck_repair_unimplemented; } } + bch2_trans_exit(&trans); + return ret; } -static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, +static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, unsigned level, bool is_root, struct bkey_s_c *k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; bool do_update = false; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; /* @@ -536,72 +572,78 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (fsck_err_on(!g->gen_valid, c, + if (c->opts.reconstruct_alloc || + fsck_err_on(!g->gen_valid, c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; g->gen_valid = true; + g->gen 
= p.ptr.gen; } else { do_update = true; } } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; g->gen_valid = true; - g->_mark.data_type = 0; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; } } - if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + gen_cmp(p.ptr.gen, g->gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) continue; - if (fsck_err_on(g->mark.data_type && - g->mark.data_type 
!= data_type, c, + if (fsck_err_on(g->data_type && + g->data_type != data_type, c, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->mark.data_type], + bch2_data_types[g->data_type], bch2_data_types[data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { - g->_mark.data_type = data_type; + g->data_type = data_type; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; @@ -615,14 +657,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; } } @@ -635,13 +679,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, if (is_root) { bch_err(c, "cannot update btree roots yet"); - return -EINVAL; + ret = -EINVAL; + goto err; } new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; + ret = -ENOMEM; + goto err; } bkey_reassemble(new, *k); @@ -657,7 +703,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); - ptr->gen = g->mark.gen; + ptr->gen = g->gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ @@ -666,12 +712,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, enum bch_data_type 
data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0) || - gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->mark.data_type && - g->mark.data_type != data_type); + gen_cmp(ptr->gen, g->gen) < 0) || + gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type); })); again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); @@ -705,19 +751,27 @@ found: ret = bch2_journal_key_insert_take(c, btree_id, level, new); if (ret) { kfree(new); - return ret; + goto err; } if (level) - bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); + + if (c->opts.verbose) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, *k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } - bch2_bkey_val_to_text(&PBUF(buf), c, *k); - bch_info(c, "updated %s", buf); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf); *k = bkey_i_to_s_c(new); } +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -740,9 +794,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > journal_cur_seq(&c->journal)); + k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); if (ret) goto err; @@ -753,11 +807,12 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } - ret = bch2_mark_key(trans, old, *k, flags); + ret = commit_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, old, 
*k, flags)); fsck_err: err: if (ret) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); return ret; } @@ -807,10 +862,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 - : 0; + unsigned depth = metadata_only ? 1 : 0; int ret = 0; gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -851,7 +903,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); @@ -866,7 +918,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, true); if (ret) { - bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); + bch_err(c, "%s: error from bch2_gc_mark_key: %s", + __func__, bch2_err_str(ret)); goto fsck_err; } @@ -896,7 +949,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b bch2_bkey_buf_reassemble(&cur, c, k); bch2_btree_and_journal_iter_advance(&iter); - child = bch2_btree_node_get_noiter(c, cur.k, + child = bch2_btree_node_get_noiter(trans, cur.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(child); @@ -912,9 +965,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { - ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = 
-BCH_ERR_need_topology_repair; bch_info(c, "Halting mark and sweep to start topology repair pass"); goto fsck_err; } else { @@ -925,8 +979,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b continue; } } else if (ret) { - bch_err(c, "%s: error %i getting btree node", - __func__, ret); + bch_err(c, "%s: error getting btree node: %s", + __func__, bch2_err_str(ret)); break; } @@ -942,6 +996,7 @@ fsck_err: bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); + printbuf_exit(&buf); return ret; } @@ -951,11 +1006,8 @@ static int bch2_gc_btree_init(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree *b; - unsigned target_depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 - : 0; - char buf[100]; + unsigned target_depth = metadata_only ? 1 : 0; + struct printbuf buf = PRINTBUF; int ret = 0; b = c->btree_roots[btree_id].b; @@ -964,19 +1016,21 @@ static int bch2_gc_btree_init(struct btree_trans *trans, return 0; six_lock_read(&b->c.lock, NULL, NULL); + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, - "btree root with incorrect min_key: %s", - (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { + "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); - ret = FSCK_ERR_EXIT; + ret = -BCH_ERR_fsck_repair_unimplemented; goto fsck_err; } + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, - "btree root with incorrect max_key: %s", - (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { + "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); - ret = FSCK_ERR_EXIT; + ret = -BCH_ERR_fsck_repair_unimplemented; goto fsck_err; } @@ -993,7 +1047,8 @@ fsck_err: six_unlock_read(&b->c.lock); if 
(ret < 0) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + printbuf_exit(&buf); return ret; } @@ -1012,6 +1067,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) bch2_trans_init(&trans, c, 0, 0); + if (initial) + trans.is_initial_gc = true; + for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); @@ -1022,7 +1080,7 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) : bch2_gc_btree(&trans, ids[i], initial, metadata_only); if (ret < 0) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -1113,10 +1171,10 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(ca, c, i) { - kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - ca->buckets[1] = NULL; + ca->buckets_gc = NULL; free_percpu(ca->usage_gc); ca->usage_gc = NULL; @@ -1130,29 +1188,29 @@ static int bch2_gc_done(struct bch_fs *c, bool initial, bool metadata_only) { struct bch_dev *ca = NULL; - bool verify = !metadata_only && (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + struct printbuf buf = PRINTBUF; + bool verify = !metadata_only && + !c->opts.reconstruct_alloc && + (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; percpu_down_write(&c->mark_lock); #define copy_field(_f, _msg, ...) 
\ - if (dst->_f != src->_f) { \ - if (verify) \ - fsck_err(c, _msg ": got %llu, should be %llu" \ - , ##__VA_ARGS__, dst->_f, src->_f); \ - dst->_f = src->_f; \ - } + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f))) \ + dst->_f = src->_f #define copy_stripe_field(_f, _msg, ...) \ - if (dst->_f != src->_f) { \ - if (verify) \ - fsck_err(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u", \ - iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f); \ - dst->_f = src->_f; \ - } + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f))) \ + dst->_f = src->_f #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ @@ -1168,7 +1226,6 @@ static int bch2_gc_done(struct bch_fs *c, dev_usage_u64s()); copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); @@ -1200,16 +1257,16 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - char buf[80]; if (metadata_only && (e->data_type == BCH_DATA_user || e->data_type == BCH_DATA_cached)) continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, e); - copy_fs_field(replicas[i], "%s", buf); + copy_fs_field(replicas[i], "%s", buf.buf); } } @@ -1221,9 +1278,10 @@ fsck_err: if (ca) percpu_ref_put(&ca->ref); if (ret) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); return ret; } @@ -1243,7 +1301,7 @@ static int bch2_gc_start(struct bch_fs *c, } 
for_each_member_device(ca, c, i) { - BUG_ON(ca->buckets[1]); + BUG_ON(ca->buckets_gc); BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1252,89 +1310,123 @@ static int bch2_gc_start(struct bch_fs *c, percpu_ref_put(&ca->ref); return -ENOMEM; } + + this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, + ca->mi.nbuckets - ca->mi.first_bucket); } return 0; } +/* returns true if not equal */ +static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, + struct bch_alloc_v4 r) +{ + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type || + l.dirty_sectors != r.dirty_sectors || + l.cached_sectors != r.cached_sectors || + l.stripe_redundancy != r.stripe_redundancy || + l.stripe != r.stripe; +} + static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, - bool initial, bool metadata_only) + struct bkey_s_c k, + bool metadata_only) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket *g; - struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u, gc_u; - struct bkey_alloc_buf *a; + struct bucket gc, *b; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old, new; + enum bch_data_type type; int ret; - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; + if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + return 1; - old_u = new_u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &old); + new = old; percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, iter->pos.offset); - gc_u = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; + b = gc_bucket(ca, iter->pos.offset); + + /* 
+ * b->data_type doesn't yet include need_discard & need_gc_gen states - + * fix that here: + */ + type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + old, + b->data_type); + if (b->data_type != type) { + struct bch_dev_usage *u; + + preempt_disable(); + u = this_cpu_ptr(ca->usage_gc); + u->d[b->data_type].buckets--; + b->data_type = type; + u->d[b->data_type].buckets++; + preempt_enable(); + } + + gc = *b; percpu_up_read(&c->mark_lock); if (metadata_only && - gc_u.data_type != BCH_DATA_sb && - gc_u.data_type != BCH_DATA_journal && - gc_u.data_type != BCH_DATA_btree) + gc.data_type != BCH_DATA_sb && + gc.data_type != BCH_DATA_journal && + gc.data_type != BCH_DATA_btree) return 0; - if (gen_after(old_u.gen, gc_u.gen)) + if (gen_after(old.gen, gc.gen)) return 0; #define copy_bucket_field(_f) \ - if (fsck_err_on(new_u._f != gc_u._f, c, \ + if (c->opts.reconstruct_alloc || \ + fsck_err_on(new._f != gc._f, c, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ - new_u.gen, \ - bch2_data_types[new_u.data_type], \ - new_u._f, gc_u._f)) \ - new_u._f = gc_u._f; \ + gc.gen, \ + bch2_data_types[gc.data_type], \ + new._f, gc._f)) \ + new._f = gc._f; \ copy_bucket_field(gen); copy_bucket_field(data_type); - copy_bucket_field(stripe); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); copy_bucket_field(stripe_redundancy); copy_bucket_field(stripe); #undef copy_bucket_field - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + if (!bch2_alloc_v4_cmp(old, new)) return 0; - a = bch2_alloc_pack(trans, new_u); - if (IS_ERR(a)) - return PTR_ERR(a); + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + a->v = new; - ret = initial - ? 
bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) - : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); + /* + * The trigger normally makes sure this is set, but we're not running + * triggers: + */ + if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) + a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); fsck_err: return ret; } -static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; @@ -1346,37 +1438,33 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - for_each_btree_key(&trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS| - BTREE_ITER_PREFETCH, k, ret) { - if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) - break; - - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(&trans, &iter, - initial, metadata_only)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) { - bch_err(c, "error writing alloc info: %i", ret); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + + if (ret < 0) { + bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); percpu_ref_put(&ca->ref); break; } } bch2_trans_exit(&trans); - return ret; + return ret < 0 ? 
ret : 0; } -static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; + struct bch_alloc_v4 a; unsigned i; + int ret; for_each_member_device(ca, c, i) { struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + @@ -1384,119 +1472,147 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_onl GFP_KERNEL|__GFP_ZERO); if (!buckets) { percpu_ref_put(&ca->ref); - percpu_up_write(&c->mark_lock); bch_err(c, "error allocating ca->buckets[gc]"); return -ENOMEM; } buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; - rcu_assign_pointer(ca->buckets[1], buckets); + rcu_assign_pointer(ca->buckets_gc, buckets); }; - return bch2_alloc_read(c, true, metadata_only); + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); + + bch2_alloc_to_v4(k, &a); + + g->gen_valid = 1; + g->gen = a.gen; + + if (metadata_only && + (a.data_type == BCH_DATA_user || + a.data_type == BCH_DATA_cached || + a.data_type == BCH_DATA_parity)) { + g->data_type = a.data_type; + g->dirty_sectors = a.dirty_sectors; + g->cached_sectors = a.cached_sectors; + g->stripe = a.stripe; + g->stripe_redundancy = a.stripe_redundancy; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); + + return ret; } -static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) +static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; unsigned i; for_each_member_device(ca, c, i) { - struct bucket_array *buckets = __bucket_array(ca, true); + 
struct bucket_array *buckets = gc_bucket_array(ca); struct bucket *g; for_each_bucket(g, buckets) { if (metadata_only && - (g->mark.data_type == BCH_DATA_user || - g->mark.data_type == BCH_DATA_cached || - g->mark.data_type == BCH_DATA_parity)) + (g->data_type == BCH_DATA_user || + g->data_type == BCH_DATA_cached || + g->data_type == BCH_DATA_parity)) continue; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; } }; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, - bool metadata_only) +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; struct reflink_gc *r; - size_t idx = 0; - char buf[200]; int ret = 0; - if (metadata_only) + if (!refcount) return 0; - bch2_trans_init(&trans, c, 0, 0); + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } - r = genradix_ptr(&c->reflink_gc_table, idx++); - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - ret = -EINVAL; - break; - } + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new; - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, 
- "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - r->refcount)) { - struct bkey_i *new; - - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - break; - } + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; - bkey_reassemble(new, k); - - if (!r->refcount) { - new->k.type = KEY_TYPE_deleted; - /* - * XXX ugly: bch2_journal_key_insert() queues up - * the key for the journal replay code, which - * doesn't run the extent overwrite pass - */ - if (initial) - new->k.size = 0; - } else { - *bkey_refcount(new) = cpu_to_le64(r->refcount); - } + bkey_reassemble(new, k); - ret = initial - ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) - : __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); - kfree(new); + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(new) = cpu_to_le64(r->refcount); - if (ret) - break; - } + ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: - bch2_trans_iter_exit(&trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + size_t idx = 0; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); + c->reflink_gc_nr = 0; bch2_trans_exit(&trans); return ret; } -static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, +static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; @@ -1535,8 +1651,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, return ret; } -static void bch2_gc_reflink_reset(struct 
bch_fs *c, bool initial, - bool metadata_only) +static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) { struct genradix_iter iter; struct reflink_gc *r; @@ -1545,71 +1660,77 @@ static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, r->refcount = 0; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, - bool metadata_only) +static int bch2_gc_write_stripes_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct gc_stripe *m; + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; const struct bch_stripe *s; - char buf[200]; + struct gc_stripe *m; unsigned i; int ret = 0; - if (metadata_only) + if (k.k->type != KEY_TYPE_stripe) return 0; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_stripe) - continue; + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - s = bkey_s_c_to_stripe(k).v; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) - goto inconsistent; - continue; + for (i = 0; i < s->nr_blocks; i++) + if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + return 0; inconsistent: - if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" - " %s\n" - " should be %u", i, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - m ? m->block_sectors[i] : 0)) { - struct bkey_i_stripe *new; - - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - break; - } + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf), + m ? 
m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; - bkey_reassemble(&new->k_i, k); + bkey_reassemble(&new->k_i, k); - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - ret = initial - ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) - : __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); - kfree(new); - } + ret = bch2_trans_update(trans, iter, &new->k_i, 0); } fsck_err: - bch2_trans_iter_exit(&trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_stripes_key(&trans, &iter, k)); bch2_trans_exit(&trans); return ret; } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, - bool metadata_only) +static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) { genradix_free(&c->gc_stripes); } @@ -1634,23 +1755,18 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, */ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { - struct bch_dev *ca; - u64 start_time = local_clock(); - unsigned i, iter = 0; + unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); - trace_gc_start(c); down_write(&c->gc_lock); - /* flush interior btree updates: */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_btree_interior_updates_flush(c); ret = 
bch2_gc_start(c, metadata_only) ?: - bch2_gc_alloc_start(c, initial, metadata_only) ?: - bch2_gc_reflink_start(c, initial, metadata_only); + bch2_gc_alloc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, metadata_only); if (ret) goto out; again: @@ -1661,26 +1777,27 @@ again: if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && c->opts.fix_errors != FSCK_OPT_NO) { - bch_info(c, "starting topology repair pass"); + bch_info(c, "Starting topology repair pass"); ret = bch2_repair_topology(c); if (ret) goto out; - bch_info(c, "topology repair pass done"); + bch_info(c, "Topology repair pass done"); set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); } ret = bch2_gc_btrees(c, initial, metadata_only); - if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && + if (ret == -BCH_ERR_need_topology_repair && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); ret = 0; } - if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR) - ret = FSCK_ERR_EXIT; + if (ret == -BCH_ERR_need_topology_repair) + ret = -BCH_ERR_fsck_errors_not_fixed; if (ret) goto out; @@ -1705,9 +1822,9 @@ again: clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - bch2_gc_stripes_reset(c, initial, metadata_only); - bch2_gc_alloc_reset(c, initial, metadata_only); - bch2_gc_reflink_reset(c, initial, metadata_only); + bch2_gc_stripes_reset(c, metadata_only); + bch2_gc_alloc_reset(c, metadata_only); + bch2_gc_reflink_reset(c, metadata_only); /* flush fsck errors, reset counters */ bch2_flush_fsck_errs(c); @@ -1717,9 +1834,9 @@ out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: - bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_alloc_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, metadata_only) ?: + 
bch2_gc_reflink_done(c, metadata_only) ?: + bch2_gc_alloc_done(c, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); @@ -1734,16 +1851,6 @@ out: up_write(&c->gc_lock); - trace_gc_end(c); - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - - /* - * Wake up allocator in case it was waiting for buckets - * because of not being able to inc gens - */ - for_each_member_device(ca, c, i) - bch2_wake_allocator(ca); - /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: @@ -1752,10 +1859,15 @@ out: return ret; } -static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) +static int gc_btree_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; + struct bkey_i *u; + int ret; percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { @@ -1763,7 +1875,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) if (ptr_stale(ca, ptr) > 16) { percpu_up_read(&c->mark_lock); - return true; + goto update; } } @@ -1775,84 +1887,41 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) *gen = ptr->gen; } percpu_up_read(&c->mark_lock); + return 0; +update: + u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - return false; -} - -/* - * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree - * node pointers currently never have cached pointers that can become stale: - */ -static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_buf sk; - int ret = 0, commit_err = 0; - - bch2_bkey_buf_init(&sk); - - bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, - BTREE_ITER_PREFETCH| - 
BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((bch2_trans_begin(trans), - k = bch2_btree_iter_peek(&iter)).k) { - ret = bkey_err(k); - - if (ret == -EINTR) - continue; - if (ret) - break; - - c->gc_gens_pos = iter.pos; - - if (gc_btree_gens_key(c, k) && !commit_err) { - bch2_bkey_buf_reassemble(&sk, c, k); - bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - - commit_err = - bch2_trans_update(trans, &iter, sk.k, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_NOFAIL); - if (commit_err == -EINTR) { - commit_err = 0; - continue; - } - } - - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); - - bch2_bkey_buf_exit(&sk, c); + bkey_reassemble(u, k); - return ret; + bch2_extent_normalize(c, bkey_i_to_s(u)); + return bch2_trans_update(trans, iter, u, 0); } -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); - struct bkey_s_c k; - struct bkey_alloc_unpacked u; + struct bch_alloc_v4 a; + struct bkey_i_alloc_v4 *a_mut; int ret; - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &a); - if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) + if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) return 0; - u.oldest_gen = ca->oldest_gen[iter->pos.offset]; + a_mut = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + return ret; + + a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; + a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); - return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); + return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } int bch2_gc_gens(struct bch_fs *c) @@ -1873,6 +1942,7 @@ int bch2_gc_gens(struct bch_fs *c) if 
(!mutex_trylock(&c->gc_gens_lock)) return 0; + trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); @@ -1896,27 +1966,36 @@ int bch2_gc_gens(struct bch_fs *c) } for (i = 0; i < BTREE_ID_NR; i++) - if ((1 << i) & BTREE_ID_HAS_PTRS) { + if (btree_type_has_ptrs(i)) { + struct btree_iter iter; + struct bkey_s_c k; + c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = bch2_gc_btree_gens(&trans, i); - if (ret) { - bch_err(c, "error recalculating oldest_gen: %i", ret); + ret = for_each_btree_key_commit(&trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + gc_btree_gens_key(&trans, &iter, k)); + if (ret && ret != -EROFS) + bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + if (ret) goto err; - } } - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(&trans, &iter)); - if (ret) { - bch_err(c, "error writing oldest_gen: %i", ret); - break; - } - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter, k)); + if (ret && ret != -EROFS) + bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + if (ret) + goto err; c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; @@ -1924,6 +2003,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + trace_and_count(c, gc_gens_end, c); err: for_each_member_device(ca, c, i) { kvfree(ca->oldest_gen); @@ -1985,7 +2065,7 @@ static int bch2_gc_thread(void *arg) ret = bch2_gc_gens(c); #endif if (ret < 0) - bch_err(c, "btree gc failed: %i", ret); + bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); debug_check_no_locks_held(); } @@ -2015,7 
+2095,7 @@ int bch2_gc_thread_start(struct bch_fs *c) p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) { - bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); + bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); return PTR_ERR(p); } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 0665f59..95d803b 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -102,4 +102,11 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } +static inline void bch2_do_gc_gens(struct bch_fs *c) +{ + atomic_inc(&c->kick_gc); + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index a365132..dd6b536 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -477,7 +477,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) }; if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write); + bch2_btree_node_write(c, b, SIX_LOCK_write, 0); reinit_iter = true; } } @@ -501,7 +501,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, struct btree *b) { - pr_buf(out, "%s level %u/%u\n ", + prt_printf(out, "%s level %u/%u\n ", bch2_btree_ids[b->c.btree_id], b->c.level, c->btree_roots[b->c.btree_id].level); @@ -513,17 +513,20 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node "); - if (write) - pr_buf(out, "before write "); + prt_printf(out, bch2_log_msg(c, "")); + if (!write) + prt_str(out, "error validating btree node "); + else + prt_str(out, "corrupt btree node before write "); if (ca) - pr_buf(out, "on %s ", ca->name); - pr_buf(out, "at btree "); + prt_printf(out, "on %s ", ca->name); + prt_printf(out, "at 
btree "); btree_pos_to_text(out, c, b); - pr_buf(out, "\n node offset %u", b->written); + prt_printf(out, "\n node offset %u", b->written); if (i) - pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_str(out, ": "); } enum btree_err_type { @@ -540,32 +543,25 @@ enum btree_validate_ret { #define btree_err(type, c, ca, b, i, msg, ...) \ ({ \ __label__ out; \ - char _buf[300]; \ - char *_buf2 = _buf; \ - struct printbuf out = PBUF(_buf); \ - \ - _buf2 = kmalloc(4096, GFP_ATOMIC); \ - if (_buf2) \ - out = _PBUF(_buf2, 4986); \ + struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ - pr_buf(&out, ": " msg, ##__VA_ARGS__); \ + prt_printf(&out, msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", _buf2); \ + mustfix_fsck_err(c, "%s", out.buf); \ goto out; \ } \ \ + bch2_print_string_as_lines(KERN_ERR, out.buf); \ + \ switch (write) { \ case READ: \ - if (_buf2) \ - bch_err(c, "%s", _buf2); \ - \ switch (type) { \ case BTREE_ERR_FIXABLE: \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ case BTREE_ERR_WANT_RETRY: \ if (have_retry) { \ @@ -577,22 +573,19 @@ enum btree_validate_ret { ret = BTREE_RETRY_READ; \ goto fsck_err; \ case BTREE_ERR_FATAL: \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", _buf2); \ - \ if (bch2_fs_inconsistent(c)) { \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ } \ out: \ - if (_buf2 != _buf) \ - kfree(_buf2); \ + printbuf_exit(&out); \ true; \ }) @@ -624,7 +617,6 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) (u64 *) vstruct_end(i) - (u64 *) k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - 
shift); set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); } for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) @@ -634,10 +626,14 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) if (k != vstruct_last(i)) { i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); } } + /* + * Always rebuild search trees: eytzinger search tree nodes directly + * depend on the values of min/max key: + */ + bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { @@ -653,8 +649,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, { unsigned version = le16_to_cpu(i->version); const char *err; - char buf1[100]; - char buf2[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret = 0; btree_err_on((version != BCH_BSET_VERSION_OLD && @@ -691,7 +687,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; - return 0; + ret = 0; + goto out; } btree_err_on(offset && !i->u64s, @@ -742,14 +739,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", - (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), - (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), + (printbuf_reset(&buf2), + bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); } btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", - (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, @@ -764,16 +764,29 @@ static int 
validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } +out: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } +static int bset_key_invalid(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw, + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: + (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: + (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned *whiteout_u64s, - int write, bool have_retry) + struct bset *i, int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; + struct printbuf buf = PRINTBUF; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); int ret = 0; @@ -782,7 +795,6 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k != vstruct_last(i);) { struct bkey_s u; struct bkey tmp; - const char *invalid; if (btree_err_on(bkey_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, NULL, b, i, @@ -808,15 +820,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: - (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: - (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); - if (invalid) { - char buf[160]; + printbuf_reset(&buf); + if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "invalid bkey: "); + bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); - bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey: %s\n%s", invalid, buf); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -830,18 +842,17 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, &b->format, k); if (prev && bkey_iter_cmp(b, prev, k) > 0) { - char buf1[80]; - char buf2[80]; struct bkey up = bkey_unpack_key(b, prev); - bch2_bkey_to_text(&PBUF(buf1), &up); - bch2_bkey_to_text(&PBUF(buf2), u.k); + printbuf_reset(&buf); + prt_printf(&buf, "keys out of order: "); + bch2_bkey_to_text(&buf, &up); + prt_printf(&buf, " > "); + bch2_bkey_to_text(&buf, u.k); bch2_dump_bset(c, b, i, 0); - if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "keys out of order: %s > %s", - buf1, buf2)) { + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -853,6 +864,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k = bkey_next(k); } fsck_err: + printbuf_exit(&buf); return ret; } @@ -871,9 +883,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned u64s; unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); + struct printbuf buf = PRINTBUF; int ret, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; + /* We might get called multiple times on read retry: */ + b->written = 0; iter = 
mempool_alloc(&c->fill_iter, GFP_NOIO); sort_iter_init(iter, b); @@ -885,11 +900,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad magic"); + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); btree_err_on(!b->data->keys.seq, BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad btree header"); + "bad btree header: seq 0"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *bp = @@ -902,7 +918,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, } while (b->written < (ptr_written ?: btree_sectors(c))) { - unsigned sectors, whiteout_u64s = 0; + unsigned sectors; struct nonce nonce; struct bch_csum csum; bool first = !b->written; @@ -922,9 +938,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i", ret)) + goto fsck_err; - btree_err_on(btree_node_is_extents(b) && + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), BTREE_ERR_FATAL, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -949,7 +968,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i\n", ret)) + goto fsck_err; sectors = vstruct_sectors(bne, c->block_bits); } @@ -965,8 +987,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, &whiteout_u64s, - READ, have_retry); + ret = 
validate_bset_keys(c, b, i, READ, have_retry); if (ret) goto fsck_err; @@ -992,11 +1013,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (blacklisted && !first) continue; - sort_iter_add(iter, i->start, - vstruct_idx(i, whiteout_u64s)); - sort_iter_add(iter, - vstruct_idx(i, whiteout_u64s), + vstruct_idx(i, 0), vstruct_last(i)); nonblacklisted_written = b->written; @@ -1056,16 +1074,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - const char *invalid = bch2_bkey_val_invalid(c, u.s_c); - if (invalid || + printbuf_reset(&buf); + + if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - char buf[160]; + printbuf_reset(&buf); + + prt_printf(&buf, "invalid bkey: "); + bch2_bkey_val_invalid(c, u.s_c, READ, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); - bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey %s: %s", buf, invalid); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1102,6 +1124,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_node_need_rewrite(b); out: mempool_free(iter, &c->fill_iter); + printbuf_exit(&buf); return retry_read; fsck_err: if (ret == BTREE_RETRY_READ) { @@ -1122,18 +1145,18 @@ static void btree_node_read_work(struct work_struct *work) struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; - char buf[200]; - struct printbuf out; + struct printbuf buf = PRINTBUF; bool saw_error = false; + bool retry = false; bool can_retry; goto start; while (1) { + retry = true; bch_info(c, "retrying read"); ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); rb->have_ioref = 
bch2_dev_get_ioref(ca, READ); - bio_reset(bio); - bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; + bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); @@ -1144,10 +1167,10 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - out = PBUF(buf); - btree_pos_to_text(&out, c, b); + printbuf_reset(&buf); + btree_pos_to_text(&buf, c, b); bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf); + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1159,8 +1182,11 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry)) + !bch2_btree_node_read_done(c, ca, b, can_retry)) { + if (retry) + bch_info(c, "retry success"); break; + } saw_error = true; @@ -1173,6 +1199,7 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); + printbuf_exit(&buf); if (saw_error && !btree_node_read_error(b)) bch2_btree_node_rewrite_async(c, b); @@ -1253,6 +1280,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) container_of(cl, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; + struct printbuf buf = PRINTBUF; bool dump_bset_maps = false; bool have_retry = false; int ret = 0, best = -1, write = READ; @@ -1296,8 +1324,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl) fsck_err: if (dump_bset_maps) { for (i = 0; i < ra->nr; i++) { - char buf[200]; - struct printbuf out = PBUF(buf); struct btree_node *bn = ra->buf[i]; struct btree_node_entry *bne = NULL; unsigned offset = 0, sectors; @@ -1306,6 +1332,8 @@ fsck_err: if (ra->err[i]) continue; + printbuf_reset(&buf); + while (offset < btree_sectors(c)) { if (!offset) { sectors = vstruct_sectors(bn, c->block_bits); 
@@ -1316,10 +1344,10 @@ fsck_err: sectors = vstruct_sectors(bne, c->block_bits); } - pr_buf(&out, " %u-%u", offset, offset + sectors); + prt_printf(&buf, " %u-%u", offset, offset + sectors); if (bne && bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&out, "*"); + prt_printf(&buf, "*"); offset += sectors; } @@ -1327,19 +1355,19 @@ fsck_err: bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) - pr_buf(&out, " GAP"); + prt_printf(&buf, " GAP"); gap = true; sectors = vstruct_sectors(bne, c->block_bits); - pr_buf(&out, " %u-%u", offset, offset + sectors); + prt_printf(&buf, " %u-%u", offset, offset + sectors); if (bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&out, "*"); + prt_printf(&buf, "*"); } offset++; } - bch_err(c, "replica %u:%s", i, buf); + bch_err(c, "replica %u:%s", i, buf.buf); } } @@ -1360,6 +1388,7 @@ fsck_err: closure_debug_destroy(&ra->cl); kfree(ra); + printbuf_exit(&buf); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1405,8 +1434,10 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool for (i = 0; i < ra->nr; i++) { ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i], - btree_bytes(c)), + ra->bio[i] = bio_alloc_bioset(NULL, + buf_pages(ra->buf[i], btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOFS, &c->btree_bio); } @@ -1422,7 +1453,6 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->idx = i; rb->pick = pick; - rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; rb->bio.bi_iter.bi_sector = pick.ptr.offset; rb->bio.bi_end_io = btree_node_read_all_replicas_endio; bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); @@ -1459,11 +1489,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct 
btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; - char buf[200]; int ret; - btree_pos_to_text(&PBUF(buf), c, b); - trace_btree_read(c, b); + trace_and_count(c, btree_node_read, c, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1471,17 +1499,30 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); - if (bch2_fs_fatal_err_on(ret <= 0, c, - "btree node read error: no device to read from\n" - " at %s", buf)) { + + if (ret <= 0) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "btree node read error: no device to read from\n at "); + btree_pos_to_text(&buf, c, b); + bch_err(c, "%s", buf.buf); + + if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + bch2_fatal_error(c); + set_btree_node_read_error(b); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); + printbuf_exit(&buf); return; } ca = bch_dev_bkey_exists(c, pick.ptr.dev); - bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, - btree_bytes(c)), + bio = bio_alloc_bioset(NULL, + buf_pages(b->data, btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOIO, &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; @@ -1491,7 +1532,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); - bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; bch2_bio_map(bio, b->data, btree_bytes(c)); @@ -1532,7 +1572,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, level != 0); bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); @@ -1582,29 +1622,13 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, 
bch2_journal_pin_drop(&c->journal, &w->journal); } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); unsigned long old, new, v; bch2_btree_complete_write(c, b, w); - v = READ_ONCE(b->flags); - do { - old = new = v; - - if (old & (1U << BTREE_NODE_need_write)) - goto do_write; - - new &= ~(1U << BTREE_NODE_write_in_flight); - new &= ~(1U << BTREE_NODE_write_in_flight_inner); - } while ((v = cmpxchg(&b->flags, old, new)) != old); - - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - return; - -do_write: - six_lock_read(&b->c.lock, NULL, NULL); v = READ_ONCE(b->flags); do { old = new = v; @@ -1612,7 +1636,8 @@ do_write: if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && !(old & (1U << BTREE_NODE_never_write)) && - btree_node_may_write(b)) { + !(old & (1U << BTREE_NODE_write_blocked)) && + !(old & (1U << BTREE_NODE_will_make_reachable))) { new &= ~(1U << BTREE_NODE_dirty); new &= ~(1U << BTREE_NODE_need_write); new |= (1U << BTREE_NODE_write_in_flight); @@ -1626,9 +1651,22 @@ do_write: } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, true); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_trans trans; + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); } static void btree_node_write_work(struct work_struct *work) @@ -1712,13 +1750,19 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - unsigned whiteout_u64s = 
0; + struct printbuf buf = PRINTBUF; int ret; - if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) - return -1; + ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE, &buf); + + if (ret) + bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); + printbuf_exit(&buf); + if (ret) + return ret; - ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + ret = validate_bset_keys(c, b, i, WRITE, false) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); if (ret) { bch2_inconsistent_error(c); @@ -1742,7 +1786,7 @@ static void btree_write_submit(struct work_struct *work) bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; struct bset_tree *t; @@ -1757,13 +1801,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta unsigned long old, new; bool validate_before_checksum = false; void *data; + int ret; - if (already_started) + if (flags & BTREE_WRITE_ALREADY_STARTED) goto do_write; - if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - return; - /* * We may only have a read lock on the btree node - the dirty bit is our * "lock" against racing with other threads that may be trying to start @@ -1777,13 +1819,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta if (!(old & (1 << BTREE_NODE_dirty))) return; - if (!btree_node_may_write(b)) + if ((flags & BTREE_WRITE_ONLY_IF_NEED) && + !(old & (1 << BTREE_NODE_need_write))) + return; + + if (old & + ((1 << BTREE_NODE_never_write)| + (1 << BTREE_NODE_write_blocked))) return; - if (old & (1 << BTREE_NODE_never_write)) + if (b->written && + (old & (1 << BTREE_NODE_will_make_reachable))) return; - BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); + if (old & (1 
<< BTREE_NODE_write_in_flight)) + return; new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); @@ -1863,6 +1913,8 @@ do_write: u64s = bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); + set_needs_whiteout(i, false); /* do we have data to write? */ @@ -1872,6 +1924,10 @@ do_write: bytes_to_write = vstruct_end(i) - data; sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + if (!b->written && + b->key.k.type == KEY_TYPE_btree_ptr_v2) + BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); @@ -1879,7 +1935,7 @@ do_write: BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = c->sb.version < bcachefs_metadata_version_new_versioning + i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ? cpu_to_le16(BCH_BSET_VERSION_OLD) : cpu_to_le16(c->sb.version); SET_BSET_OFFSET(i, b->written); @@ -1897,7 +1953,10 @@ do_write: validate_bset_for_write(c, b, i, sectors_to_write)) goto err; - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error encrypting btree node: %i\n", ret)) + goto err; nonce = btree_nonce(i, b->written << 9); @@ -1933,10 +1992,12 @@ do_write: c->opts.nochanges) goto err; - trace_btree_write(b, bytes_to_write, sectors_to_write); + trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); - wbio = container_of(bio_alloc_bioset(GFP_NOIO, + wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), + REQ_OP_WRITE|REQ_META, + GFP_NOIO, &c->btree_bio), struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); @@ -1946,7 +2007,6 @@ do_write: wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.first_btree_write = !b->written; - wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; 
wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; @@ -1956,11 +2016,6 @@ do_write: b->written += sectors_to_write; - if (wbio->wbio.first_btree_write && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = - cpu_to_le16(b->written); - if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = cpu_to_le16(b->written); @@ -1973,14 +2028,10 @@ do_write: return; err: set_btree_node_noevict(b); - if (!b->written && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = - cpu_to_le16(sectors_to_write); b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - btree_node_write_done(c, b); + __btree_node_write_done(c, b); } /* @@ -2043,12 +2094,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * Use this one if the node is intent locked: */ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held) + enum six_lock_type lock_type_held, + unsigned flags) { if (lock_type_held == SIX_LOCK_intent || (lock_type_held == SIX_LOCK_read && six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, flags); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && @@ -2060,64 +2112,40 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); } else { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, flags); if (lock_type_held == SIX_LOCK_write && btree_node_just_written(b)) bch2_btree_post_write_cleanup(c, b); } } -static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; unsigned i; + bool ret = false; restart: 
rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) if (test_bit(flag, &b->flags)) { rcu_read_unlock(); wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + ret = true; goto restart; - } rcu_read_unlock(); -} -void bch2_btree_flush_all_reads(struct bch_fs *c) -{ - __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); + return ret; } -void bch2_btree_flush_all_writes(struct bch_fs *c) +bool bch2_btree_flush_all_reads(struct bch_fs *c) { - __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); + return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); } -void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) +bool bch2_btree_flush_all_writes(struct bch_fs *c) { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - if (!(flags & (1 << BTREE_NODE_dirty))) - continue; - - pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", - b, - (flags & (1 << BTREE_NODE_dirty)) != 0, - (flags & (1 << BTREE_NODE_need_write)) != 0, - b->c.level, - b->written, - !list_empty_careful(&b->write_blocked), - b->will_make_reachable != 0, - b->will_make_reachable & 1); - } - rcu_read_unlock(); + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 0f20224..8af8536 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -15,18 +15,13 @@ struct btree; struct btree_iter; struct btree_node_read_all; -static inline bool btree_node_dirty(struct btree *b) -{ - return test_bit(BTREE_NODE_dirty, &b->flags); -} - -static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) +static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) atomic_inc(&c->btree_cache.dirty); } -static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) 
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) atomic_dec(&c->btree_cache.dirty); @@ -67,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *); void bch2_btree_node_wait_on_read(struct btree *); void bch2_btree_node_wait_on_write(struct btree *); -static inline bool btree_node_may_write(struct btree *b) -{ - return list_empty_careful(&b->write_blocked) && - (!b->written || !b->will_make_reachable); -} - enum compact_mode { COMPACT_LAZY, COMPACT_ALL, @@ -111,22 +100,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset) }}; } -static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) { struct nonce nonce = btree_nonce(i, offset); + int ret; if (!offset) { struct btree_node *bn = container_of(i, struct btree_node, keys); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, - bytes); + ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, bytes); + if (ret) + return ret; nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); + return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); } void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); @@ -145,41 +137,23 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); -void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); +#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) +#define BTREE_WRITE_ALREADY_STARTED (1U << 1) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void 
bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type); + enum six_lock_type, unsigned); static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, enum six_lock_type lock_held) { - if (b->written && - btree_node_need_write(b) && - btree_node_may_write(b) && - !btree_node_write_in_flight(b)) - bch2_btree_node_write(c, b, lock_held); + bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } -#define bch2_btree_node_write_cond(_c, _b, cond) \ -do { \ - unsigned long old, new, v = READ_ONCE((_b)->flags); \ - \ - do { \ - old = new = v; \ - \ - if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ - break; \ - \ - new |= (1 << BTREE_NODE_need_write); \ - } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ - \ - btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -} while (0) - -void bch2_btree_flush_all_reads(struct bch_fs *); -void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); +bool bch2_btree_flush_all_reads(struct bch_fs *); +bool bch2_btree_flush_all_writes(struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index ae63ecb..d18346a 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -16,11 +16,14 @@ #include "replicas.h" #include "subvolume.h" +#include <linux/prandom.h> #include <linux/prefetch.h> #include <trace/events/bcachefs.h> static void btree_trans_verify_sorted(struct btree_trans *); -static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); +inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); +static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *, + struct btree_path *, int); static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, @@ 
-46,7 +49,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) if (need_resched() || race_fault()) { bch2_trans_unlock(trans); schedule(); - return bch2_trans_relock(trans) ? 0 : -EINTR; + return bch2_trans_relock(trans); } else { return 0; } @@ -99,12 +102,6 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos return p; } -static inline bool is_btree_node(struct btree_path *path, unsigned l) -{ - return l < BTREE_MAX_DEPTH && - (unsigned long) path->l[l].b >= 128; -} - static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; @@ -135,432 +132,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree node locking: */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) -{ - struct btree_path *linked; - unsigned readers = 0; - - trans_for_each_path(trans, linked) - if (linked->l[b->c.level].b == b && - btree_node_read_locked(linked, b->c.level)) - readers++; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - if (!b->c.lock.readers) - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - else - this_cpu_sub(*b->c.lock.readers, readers); - - six_lock_write(&b->c.lock, NULL, NULL); - - if (!b->c.lock.readers) - atomic64_add(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - else - this_cpu_add(*b->c.lock.readers, readers); -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if 
(!is_btree_node(path, level)) - goto fail; - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(path, level, want); - return true; - } -fail: - trace_btree_node_relock_fail(trans->fn, _RET_IP_, - path->btree_id, - &path->pos, - (unsigned long) b, - path->l[level].lock_seq, - is_btree_node(path, level) ? b->c.lock.state.seq : 0); - return false; -} - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(path, level); - goto success; - } - - return false; -success: - mark_btree_node_intent_locked(path, level); - return true; -} - -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade) -{ - unsigned l = path->level; - int fail_idx = -1; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? 
bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; - - l++; - } while (l < path->locks_want); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - bch2_trans_verify_locks(trans); - - return path->uptodate < BTREE_ITER_NEED_RELOCK; -} - -static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - bool cached) -{ - return !cached - ? container_of(_b, struct btree, c)->key.k.p - : container_of(_b, struct bkey_cached, c)->key.pos; -} - -/* Slowpath: */ -bool __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct btree_path *linked; - unsigned reason; - - /* Check if it's safe to block: */ - trans_for_each_path(trans, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - linked->nodes_locked != linked->nodes_intent_locked) { - reason = 1; - goto deadlock; - } - - if (linked->btree_id != path->btree_id) { - if (linked->btree_id < 
path->btree_id) - continue; - - reason = 3; - goto deadlock; - } - - /* - * Within the same btree, non-cached paths come before cached - * paths: - */ - if (linked->cached != path->cached) { - if (!linked->cached) - continue; - - reason = 4; - goto deadlock; - } - - /* - * Interior nodes must be locked before their descendants: if - * another path has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (level > __fls(linked->nodes_locked)) { - reason = 5; - goto deadlock; - } - - /* Must lock btree nodes in key order: */ - if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - linked->cached)) <= 0) { - BUG_ON(trans->in_traverse_all); - reason = 7; - goto deadlock; - } - } - - return btree_node_lock_type(trans, path, b, pos, level, - type, should_sleep_fn, p); -deadlock: - trace_trans_restart_would_deadlock(trans->fn, ip, - trans->in_traverse_all, reason, - linked->btree_id, - linked->cached, - &linked->pos, - path->btree_id, - path->cached, - &pos); - btree_trans_restart(trans); - return false; -} - -/* Btree iterator locking: */ - -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_locks(struct btree_path *path) -{ - unsigned l; - - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); - return; - } - - for (l = 0; btree_path_node(path, l); l++) - BUG_ON(btree_lock_want(path, l) != - btree_node_locked_type(path, l)); -} - -void bch2_trans_verify_locks(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_verify_locks(path); -} -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -#endif - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -bool bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for 
(l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, - path->btree_id, &path->pos); - btree_trans_restart(trans); - return false; - } - } - - return true; -} - -__flatten -static bool bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - bool ret = btree_path_get_locks(trans, path, false); - - if (!ret) { - trace_trans_restart_relock_path(trans->fn, trace_ip, - path->btree_id, &path->pos); - btree_trans_restart(trans); - } - return ret; -} - -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - struct btree_path *linked; - - EBUG_ON(path->locks_want >= new_locks_want); - - path->locks_want = new_locks_want; - - if (btree_path_get_locks(trans, path, true)) - return true; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). 
- */ - trans_for_each_path(trans, linked) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); - } - - return false; -} - -void __bch2_btree_path_downgrade(struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = __fls(path->nodes_locked)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - path->nodes_intent_locked ^= 1 << l; - } - break; - } - } - - bch2_btree_path_verify_locks(path); -} - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_downgrade(path); -} - -/* Btree transaction locking: */ - -bool bch2_trans_relock(struct btree_trans *trans) -{ - struct btree_path *path; - - if (unlikely(trans->restarted)) - return false; - - trans_for_each_path(trans, path) - if (path->should_be_locked && - !bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans->fn, _RET_IP_, - path->btree_id, &path->pos); - BUG_ON(!trans->restarted); - return false; - } - return true; -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - __bch2_btree_path_unlock(path); - - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); -} - /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -579,7 +150,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, bkey_cmp(ck->key.pos, path->pos)); if (!locked) - btree_node_unlock(path, 0); + btree_node_unlock(trans, path, 0); } static void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -589,7 +160,9 @@ 
static void bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_node_iter tmp; bool locked; struct bkey_packed *p, *k; - char buf1[100], buf2[100], buf3[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; const char *msg; if (!bch2_debug_check_iterators) @@ -608,7 +181,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(trans, path, level)) + if (!bch2_btree_node_relock_notrace(trans, path, level)) return; BUG_ON(!btree_path_pos_in_node(path, l->b)); @@ -634,29 +207,30 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, } if (!locked) - btree_node_unlock(path, level); + btree_node_unlock(trans, path, level); return; err: - strcpy(buf2, "(none)"); - strcpy(buf3, "(none)"); - - bch2_bpos_to_text(&PBUF(buf1), path->pos); + bch2_bpos_to_text(&buf1, path->pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); - bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bkey_to_text(&buf2, &uk); + } else { + prt_printf(&buf2, "(none)"); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); - bch2_bkey_to_text(&PBUF(buf3), &uk); + bch2_bkey_to_text(&buf3, &uk); + } else { + prt_printf(&buf3, "(none)"); } panic("path should be %s key at level %u:\n" "path pos %s\n" "prev key %s\n" "cur key %s\n", - msg, level, buf1, buf2, buf3); + msg, level, buf1.buf, buf2.buf, buf3.buf); } static void bch2_btree_path_verify(struct btree_trans *trans, @@ -754,16 +328,16 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bkey_cmp(prev.k->p, k.k->p) && bch2_snapshot_is_ancestor(trans->c, iter->snapshot, prev.k->p.snapshot) > 0) { - char buf1[100], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch2_bkey_to_text(&PBUF(buf1), k.k); - bch2_bkey_to_text(&PBUF(buf2), prev.k); + bch2_bkey_to_text(&buf1, k.k); + bch2_bkey_to_text(&buf2, prev.k); panic("iter snap %u\n" 
"k %s\n" "prev %s\n", iter->snapshot, - buf1, buf2); + buf1.buf, buf2.buf); } out: bch2_trans_iter_exit(trans, ©); @@ -775,7 +349,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, { struct btree_path *path; unsigned idx; - char buf[100]; + struct printbuf buf = PRINTBUF; trans_for_each_path_inorder(trans, path, idx) { int cmp = cmp_int(path->btree_id, id) ?: @@ -786,7 +360,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, if (cmp < 0) continue; - if (!(path->nodes_locked & 1) || + if (!btree_node_locked(path, 0) || !path->should_be_locked) continue; @@ -801,9 +375,10 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, } bch2_dump_trans_paths_updates(trans); + bch2_bpos_to_text(&buf, pos); + panic("not locked: %s %s%s\n", - bch2_btree_ids[id], - (bch2_bpos_to_text(&PBUF(buf), pos), buf), + bch2_btree_ids[id], buf.buf, key_cache ? " cached" : ""); } @@ -1009,27 +584,29 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, +static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(c, l, u, + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, bch2_btree_node_iter_peek(&l->iter, l->b)); path->pos = k.k ? k.k->p : l->b->key.k.p; + bch2_btree_path_verify_level(trans, path, l - path->l); return k; } -static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, +static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(c, l, u, + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, bch2_btree_node_iter_prev(&l->iter, l->b)); path->pos = k.k ? 
k.k->p : l->b->data->min_key; + bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -1052,61 +629,6 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, return true; } -/* - * Verify that iterator for parent node points to child node: - */ -static void btree_path_verify_new_node(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l; - unsigned plevel; - bool parent_locked; - struct bkey_packed *k; - - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - return; - - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - plevel = b->c.level + 1; - if (!btree_path_node(path, plevel)) - return; - - parent_locked = btree_node_locked(path, plevel); - - if (!bch2_btree_node_relock(trans, path, plevel)) - return; - - l = &path->l[plevel]; - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (!k || - bkey_deleted(k) || - bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - char buf1[100]; - char buf2[100]; - char buf3[100]; - char buf4[100]; - struct bkey uk = bkey_unpack_key(b, k); - - bch2_dump_btree_node(c, l->b); - bch2_bpos_to_text(&PBUF(buf1), path->pos); - bch2_bkey_to_text(&PBUF(buf2), &uk); - bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); - bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); - panic("parent iter doesn't point to new node:\n" - "iter pos %s %s\n" - "iter key %s\n" - "new node %s-%s\n", - bch2_btree_ids[path->btree_id], buf1, - buf2, buf3, buf4); - } - - if (!parent_locked) - btree_node_unlock(path, plevel); -} - static inline void __btree_path_level_init(struct btree_path *path, unsigned level) { @@ -1122,14 +644,12 @@ static inline void __btree_path_level_init(struct btree_path *path, bch2_btree_node_iter_peek(&l->iter, l->b); } -static inline void btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +inline void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path 
*path, + struct btree *b) { BUG_ON(path->cached); - btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); @@ -1149,19 +669,19 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) struct btree_path *path; trans_for_each_path(trans, path) - if (!path->cached && + if (path->uptodate == BTREE_ITER_UPTODATE && + !path->cached && btree_path_pos_in_node(path, b)) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); - if (path->nodes_locked && - t != BTREE_NODE_UNLOCKED) { - btree_node_unlock(path, b->c.level); + if (t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, t); - mark_btree_node_locked(path, b->c.level, t); + mark_btree_node_locked(trans, path, b->c.level, t); } - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); } } @@ -1179,14 +699,6 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) /* Btree path: traverse, set_pos: */ -static int lock_root_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - struct btree **rootp = p; - - return b == *rootp ? 
0 : -1; -} - static inline int btree_path_lock_root(struct btree_trans *trans, struct btree_path *path, unsigned depth_want, @@ -1196,6 +708,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; enum six_lock_type lock_type; unsigned i; + int ret; EBUG_ON(path->nodes_locked); @@ -1217,26 +730,27 @@ static inline int btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, - path->level, lock_type, - lock_root_check_fn, rootp, - trace_ip))) { - if (trans->restarted) - return -EINTR; - continue; + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + BUG(); } if (likely(b == READ_ONCE(*rootp) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) - path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); path->l[path->level].b = b; for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(path, path->level, lock_type); - btree_path_level_init(trans, path, b); + mark_btree_node_locked(trans, path, path->level, lock_type); + bch2_btree_path_level_init(trans, path, b); return 0; } @@ -1260,7 +774,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat bch2_bkey_buf_init(&tmp); - while (nr && !ret) { + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -1275,7 +789,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat } if (!was_locked) - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1295,7 +809,7 @@ static 
int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); - while (nr && !ret) { + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -1310,7 +824,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p } if (!was_locked) - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1335,7 +849,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, bp->mem_ptr = (unsigned long)b; if (!locked) - btree_node_unlock(path, plevel); + btree_node_unlock(trans, path, plevel); } static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, @@ -1400,16 +914,16 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(path, level, lock_type); - btree_path_level_init(trans, path, b); - if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(path, level + 1); + btree_node_unlock(trans, path, level + 1); + + mark_btree_node_locked(trans, path, level, lock_type); path->level = level; + bch2_btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(path); err: @@ -1420,40 +934,30 @@ err: static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, unsigned, unsigned long); -static int __btree_path_traverse_all(struct btree_trans *trans, int ret, - unsigned long trace_ip) +static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_path *path; - int i; + unsigned long trace_ip = _RET_IP_; + int ret = 0; if (trans->in_traverse_all) - return -EINTR; + return -BCH_ERR_transaction_restart_in_traverse_all; trans->in_traverse_all = true; retry_all: - 
trans->restarted = false; + trans->restarted = 0; + trans->traverse_all_idx = U8_MAX; trans_for_each_path(trans, path) path->should_be_locked = false; btree_trans_verify_sorted(trans); - for (i = trans->nr_sorted - 2; i >= 0; --i) { - struct btree_path *path1 = trans->paths + trans->sorted[i]; - struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; - - if (path1->btree_id == path2->btree_id && - path1->locks_want < path2->locks_want) - __bch2_btree_path_upgrade(trans, path1, path2->locks_want); - else if (!path1->locks_want && path2->locks_want) - __bch2_btree_path_upgrade(trans, path1, 1); - } - bch2_trans_unlock(trans); cond_resched(); - if (unlikely(ret == -ENOMEM)) { + if (unlikely(trans->memory_allocation_failure)) { struct closure cl; closure_init_stack(&cl); @@ -1464,15 +968,10 @@ retry_all: } while (ret); } - if (unlikely(ret == -EIO)) - goto out; - - BUG_ON(ret && ret != -EINTR); - /* Now, redo traversals in correct order: */ - i = 0; - while (i < trans->nr_sorted) { - path = trans->paths + trans->sorted[i]; + trans->traverse_all_idx = 0; + while (trans->traverse_all_idx < trans->nr_sorted) { + path = trans->paths + trans->sorted[trans->traverse_all_idx]; /* * Traversing a path can cause another path to be added at about @@ -1480,10 +979,14 @@ retry_all: */ if (path->uptodate) { ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == -ENOMEM) goto retry_all; + if (ret) + goto err; + BUG_ON(path->uptodate); } else { - i++; + trans->traverse_all_idx++; } } @@ -1494,62 +997,83 @@ retry_all: */ trans_for_each_path(trans, path) BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); -out: +err: bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; - trace_trans_traverse_all(trans->fn, trace_ip); + trace_and_count(c, trans_traverse_all, trans, trace_ip); return ret; } -static int bch2_btree_path_traverse_all(struct btree_trans *trans) +static inline bool 
btree_path_check_pos_in_node(struct btree_path *path, + unsigned l, int check_pos) { - return __btree_path_traverse_all(trans, 0, _RET_IP_); + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) + return false; + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) + return false; + return true; } static inline bool btree_path_good_node(struct btree_trans *trans, struct btree_path *path, unsigned l, int check_pos) { - if (!is_btree_node(path, l) || - !bch2_btree_node_relock(trans, path, l)) - return false; + return is_btree_node(path, l) && + bch2_btree_node_relock(trans, path, l) && + btree_path_check_pos_in_node(path, l, check_pos); +} - if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) - return false; - if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) - return false; - return true; +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) +{ + unsigned l; + + path->level = new_level; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); } -static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, - struct btree_path *path, - int check_pos) +static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) { unsigned i, l = path->level; - +again: while (btree_path_node(path, l) && - !btree_path_good_node(trans, path, l, check_pos)) { - btree_node_unlock(path, l); - path->l[l].b = BTREE_ITER_NO_NODE_UP; - l++; - } + !btree_path_good_node(trans, path, l, check_pos)) + __btree_path_set_level_up(trans, path, l++); /* If we need intent locks, take them too: */ for (i = l + 1; i < path->locks_want && btree_path_node(path, i); i++) - if (!bch2_btree_node_relock(trans, path, 
i)) - while (l <= i) { - btree_node_unlock(path, l); - path->l[l].b = BTREE_ITER_NO_NODE_UP; - l++; - } + if (!bch2_btree_node_relock(trans, path, i)) { + while (l <= i) + __btree_path_set_level_up(trans, path, l++); + goto again; + } return l; } +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) +{ + return likely(btree_node_locked(path, path->level) && + btree_path_check_pos_in_node(path, path->level, check_pos)) + ? path->level + : __btree_path_up_until_good_node(trans, path, check_pos); +} + /* * This is the main state machine for walking down the btree - walks down to a * specified depth @@ -1565,19 +1089,17 @@ static int btree_path_traverse_one(struct btree_trans *trans, unsigned long trace_ip) { unsigned depth_want = path->level; - int ret = 0; + int ret = trans->restarted; - if (unlikely(trans->restarted)) { - ret = -EINTR; + if (unlikely(ret)) goto out; - } /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: */ if (path->should_be_locked) { - ret = bch2_btree_path_relock(trans, path, trace_ip) ? 
0 : -EINTR; + ret = bch2_btree_path_relock(trans, path, trace_ip); goto out; } @@ -1591,6 +1113,9 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->level = btree_path_up_until_good_node(trans, path, 0); + EBUG_ON(btree_path_node(path, path->level) && + !btree_node_locked(path, path->level)); + /* * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, @@ -1611,31 +1136,33 @@ static int btree_path_traverse_one(struct btree_trans *trans, goto out; } - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); path->level = depth_want; - - if (ret == -EIO) - path->l[path->level].b = - BTREE_ITER_NO_NODE_ERROR; - else - path->l[path->level].b = - BTREE_ITER_NO_NODE_DOWN; + path->l[path->level].b = ERR_PTR(ret); goto out; } } path->uptodate = BTREE_ITER_UPTODATE; out: - BUG_ON((ret == -EINTR) != !!trans->restarted); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_btree_path_verify(trans, path); return ret; } -static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); - int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { + if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); + u64 mask = ~(~0ULL << restart_probability_bits); + + if ((prandom_u32() & mask) == mask) { + trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); + } + } + if (path->uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -1646,17 +1173,22 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { - unsigned i; + unsigned i, offset = offsetof(struct btree_path, pos); + int cmp = btree_path_cmp(dst, 
src); - memcpy(&dst->pos, &src->pos, - sizeof(struct btree_path) - offsetof(struct btree_path, pos)); + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(struct btree_path) - offset); - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(dst, i)) - six_lock_increment(&dst->l[i].b->c.lock, - __btree_lock_want(dst, i)); + for (i = 0; i < BTREE_MAX_DEPTH; i++) { + unsigned t = btree_node_locked_type(dst, i); - btree_path_check_sort(trans, dst, 0); + if (t != BTREE_NODE_UNLOCKED) + six_lock_increment(&dst->l[i].b->c.lock, t); + } + + if (cmp) + bch2_btree_path_check_sort_fast(trans, dst, cmp); } static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -1669,8 +1201,7 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr return new; } -inline struct btree_path * __must_check -bch2_btree_path_make_mut(struct btree_trans *trans, +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, struct btree_path *path, bool intent, unsigned long ip) { @@ -1684,6 +1215,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans, btree_trans_verify_sorted(trans); } + path->should_be_locked = false; return path; } @@ -1703,14 +1235,13 @@ bch2_btree_path_set_pos(struct btree_trans *trans, path = bch2_btree_path_make_mut(trans, path, intent, ip); - path->pos = new_pos; - path->should_be_locked = false; + path->pos = new_pos; - btree_path_check_sort(trans, path, cmp); + bch2_btree_path_check_sort_fast(trans, path, cmp); if (unlikely(path->cached)) { - btree_node_unlock(path, 0); - path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -1718,6 +1249,7 @@ bch2_btree_path_set_pos(struct btree_trans *trans, l = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, l)) { + BUG_ON(!btree_node_locked(path, l)); /* * We might 
have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1729,9 +1261,9 @@ bch2_btree_path_set_pos(struct btree_trans *trans, __btree_path_level_init(path, l); } - if (l != path->level) { + if (unlikely(l != path->level)) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); } out: bch2_btree_path_verify(trans, path); @@ -1742,37 +1274,37 @@ out: static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) { - struct btree_path *next; + struct btree_path *sib; - next = prev_btree_path(trans, path); - if (next && !btree_path_cmp(next, path)) - return next; + sib = prev_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; - next = next_btree_path(trans, path); - if (next && !btree_path_cmp(next, path)) - return next; + sib = next_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; return NULL; } static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) { - struct btree_path *next; + struct btree_path *sib; - next = prev_btree_path(trans, path); - if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) - return next; + sib = prev_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; - next = next_btree_path(trans, path); - if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) - return next; + sib = next_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; return NULL; } static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) { - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); btree_path_list_remove(trans, path); trans->paths_allocated &= ~(1ULL << path->idx); } @@ -1787,88 +1319,165 @@ 
void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte if (!__btree_path_put(path, intent)) return; - /* - * Perhaps instead we should check for duplicate paths in traverse_all: - */ - if (path->preserve && - (dup = have_path_at_pos(trans, path))) { - dup->preserve = true; - path->preserve = false; - goto free; - } + dup = path->preserve + ? have_path_at_pos(trans, path) + : have_node_at_pos(trans, path); + + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + return; - if (!path->preserve && - (dup = have_node_at_pos(trans, path))) - goto free; - return; -free: if (path->should_be_locked && - !btree_node_locked(dup, path->level)) + !trans->restarted && + (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) return; - dup->should_be_locked |= path->should_be_locked; - __bch2_path_free(trans, path); + if (dup) { + dup->preserve |= path->preserve; + dup->should_be_locked |= path->should_be_locked; + } + + __bch2_path_free(trans, path); +} + +static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, + bool intent) +{ + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + __bch2_path_free(trans, path); +} + +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + prt_printf(buf, "transaction updates for %s journal seq %llu", + trans->fn, trans->journal_res.seq); + prt_newline(buf); + printbuf_indent_add(buf, 2); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + + prt_printf(buf, "update: btree=%s cached=%u %pS", + bch2_btree_ids[i->btree_id], + i->cached, + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " old "); + bch2_bkey_val_to_text(buf, trans->c, old); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); + prt_newline(buf); + } + + 
printbuf_indent_sub(buf, 2); +} + +noinline __cold +void bch2_dump_trans_updates(struct btree_trans *trans) +{ + struct printbuf buf = PRINTBUF; + + bch2_trans_updates_to_text(&buf, trans); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +{ + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + path->idx, path->ref, path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ? 'S' : ' ', + bch2_btree_ids[path->btree_id], + path->level); + bch2_bpos_to_text(out, path->pos); + + prt_printf(out, " locks %u", path->nodes_locked); +#ifdef CONFIG_BCACHEFS_DEBUG + prt_printf(out, " %pS", (void *) path->ip_allocated); +#endif + prt_newline(out); +} + +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + struct btree_path *path; + unsigned idx; + + trans_for_each_path_inorder(trans, path, idx) + bch2_btree_path_to_text(out, path); } noinline __cold void bch2_dump_trans_paths_updates(struct btree_trans *trans) { - struct btree_path *path; - struct btree_insert_entry *i; - unsigned idx; - char buf1[300], buf2[300]; + struct printbuf buf = PRINTBUF; - btree_trans_verify_sorted(trans); + bch2_trans_paths_to_text(&buf, trans); + bch2_trans_updates_to_text(&buf, trans); - trans_for_each_path_inorder(trans, path, idx) - printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", - path->idx, path->ref, path->intent_ref, - path->should_be_locked ? " S" : "", - path->preserve ? 
" P" : "", - bch2_btree_ids[path->btree_id], - (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), - path->nodes_locked, -#ifdef CONFIG_BCACHEFS_DEBUG - (void *) path->ip_allocated -#else - NULL -#endif - ); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} - trans_for_each_update(trans, i) { - struct bkey u; - struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); +noinline +static void bch2_trans_update_max_paths(struct btree_trans *trans) +{ + struct btree_transaction_stats *s = btree_trans_stats(trans); + struct printbuf buf = PRINTBUF; - printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", - bch2_btree_ids[i->btree_id], - (void *) i->ip_allocated, - (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), - (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + bch2_trans_paths_to_text(&buf, trans); + + if (!buf.allocation_failure) { + mutex_lock(&s->lock); + if (s->nr_max_paths < hweight64(trans->paths_allocated)) { + s->nr_max_paths = trans->nr_max_paths = + hweight64(trans->paths_allocated); + swap(s->max_paths_text, buf.buf); + } + mutex_unlock(&s->lock); } + + printbuf_exit(&buf); +} + +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); } -static struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) { struct btree_path *path; unsigned idx; if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { - bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); - } + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); trans->paths_allocated |= 1ULL << idx; + if (unlikely(idx > trans->nr_max_paths)) + bch2_trans_update_max_paths(trans); + path = &trans->paths[idx]; path->idx = 
idx; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->nodes_intent_locked = 0; btree_path_list_add(trans, pos, path); return path; @@ -1885,6 +1494,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, int i; BUG_ON(trans->restarted); + btree_trans_verify_sorted(trans); + bch2_trans_verify_locks(trans); trans_for_each_path_inorder(trans, path, i) { if (__btree_path_cmp(path, @@ -1916,9 +1527,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - path->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) - path->l[i].b = BTREE_ITER_NO_NODE_INIT; + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef CONFIG_BCACHEFS_DEBUG path->ip_allocated = ip; #endif @@ -1940,10 +1550,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, */ locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) { - path->locks_want = locks_want; - btree_path_get_locks(trans, path, true); - } + if (locks_want > path->locks_want) + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); return path; } @@ -1951,14 +1559,17 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k; struct bkey_s_c k; - if (!path->cached) { - struct btree_path_level *l = path_l(path); - struct bkey_packed *_k; + if (unlikely(!l->b)) + return bkey_s_c_null; - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + if (!path->cached) { _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; @@ -1972,13 +1583,9 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct EBUG_ON(ck && (path->btree_id != ck->key.btree_id || bkey_cmp(path->pos, ck->key.pos))); + EBUG_ON(!ck || !ck->valid); - /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ - if (unlikely(!ck || !ck->valid)) - return bkey_s_c_null; - - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - + *u = ck->k->k; k = bkey_i_to_s_c(ck->k); } @@ -2011,7 +1618,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); return 0; } @@ -2042,8 +1649,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - iter->path->should_be_locked = true; - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(iter->path); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2059,7 +1665,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_path *path = iter->path; struct btree *b = NULL; - unsigned l; int ret; BUG_ON(trans->restarted); @@ -2072,29 +1677,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* got to end? 
*/ if (!btree_path_node(path, path->level + 1)) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; + btree_path_set_level_up(trans, path); return NULL; } if (!bch2_btree_node_relock(trans, path, path->level + 1)) { - __bch2_btree_path_unlock(path); - path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; - path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; - trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, - path->btree_id, &path->pos); - btree_trans_restart(trans); - ret = -EINTR; + __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } b = btree_path_node(path, path->level + 1); if (!bpos_cmp(iter->pos, b->key.k.p)) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; + __btree_path_set_level_up(trans, path, path->level++); } else { /* * Haven't gotten to the end of the parent node: go back down to @@ -2105,14 +1705,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - path->level = iter->min_depth; - - for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(path, l); - - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - bch2_btree_iter_verify(iter); + btree_path_set_level_down(trans, path, iter->min_depth); ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) @@ -2127,7 +1720,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, 
iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); BUG_ON(iter->path->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); @@ -2143,15 +1736,23 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - struct bpos pos = iter->k.p; - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, SPOS_MAX) - : bkey_cmp(pos, SPOS_MAX)) != 0; + if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { + struct bpos pos = iter->k.p; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } else { + if (!btree_path_node(iter->path, iter->path->level)) + return true; + + iter->advanced = true; + return false; + } } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -2172,34 +1773,47 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, struct bpos pos) { struct btree_insert_entry *i; + struct bkey_i *ret = NULL; - trans_for_each_update(trans, i) - if ((cmp_int(btree_id, i->btree_id) ?: - bpos_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->btree_id) - return i->k; + trans_for_each_update(trans, i) { + if (i->btree_id < btree_id) + continue; + if (i->btree_id > btree_id) break; - } + if (bpos_cmp(i->k->k.p, pos) < 0) + continue; + if (i->key_cache_already_flushed) + continue; + if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) + ret = i->k; + } - return NULL; + return ret; } -static noinline -struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, - struct btree_path *path) +struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, 
+ struct bpos start_pos, + struct bpos end_pos) { - struct journal_keys *keys = &trans->c->journal_keys; - size_t idx = bch2_journal_key_search(keys, path->btree_id, - path->level, path->pos); + struct bkey_i *k; + + if (bpos_cmp(start_pos, iter->journal_pos) < 0) + iter->journal_idx = 0; + + k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, + start_pos, end_pos, + &iter->journal_idx); - while (idx < keys->nr && keys->d[idx].overwritten) - idx++; + iter->journal_pos = k ? k->k.p : end_pos; + return k; +} - return (idx < keys->nr && - keys->d[idx].btree_id == path->btree_id && - keys->d[idx].level == path->level) - ? keys->d[idx].k - : NULL; +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_btree_journal_peek(trans, iter, pos, pos); } static noinline @@ -2208,11 +1822,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - __btree_trans_peek_journal(trans, iter->path); + bch2_btree_journal_peek(trans, iter, iter->path->pos, + k.k ? k.k->p : iter->path->l[0].b->key.k.p); - if (next_journal && - bpos_cmp(next_journal->k.p, - k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + if (next_journal) { iter->k = next_journal->k; k = bkey_i_to_s_c(next_journal); } @@ -2225,7 +1838,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; @@ -2249,11 +1862,20 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - iter->key_cache_path->should_be_locked = true; + btree_path_set_should_be_locked(iter->key_cache_path); return bch2_btree_path_peek_slot(iter->key_cache_path, &u); } +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); + int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); + + return err ? 
bkey_s_c_err(err) : ret; +} + static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; @@ -2261,10 +1883,12 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); while (1) { + struct btree_path_level *l; + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2277,22 +1901,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - iter->path->should_be_locked = true; + l = path_l(iter->path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + + btree_path_set_should_be_locked(iter->path); - k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + k = btree_path_level_peek_all(trans->c, l, &iter->k); if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { - ret = bkey_err(k2); + k = k2; + ret = bkey_err(k); if (ret) { - k = k2; bch2_btree_iter_set_pos(iter, iter->pos); goto out; } - - k = k2; - iter->k = *k.k; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) @@ -2303,7 +1933,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp : NULL; if (next_update && bpos_cmp(next_update->k.p, - k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + k.k ? 
k.k->p : l->b->key.k.p) <= 0) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } @@ -2324,9 +1954,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (likely(k.k)) { break; - } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ - search_key = bpos_successor(iter->path->l[0].b->key.k.p); + search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); @@ -2344,16 +1974,19 @@ out: * bch2_btree_iter_peek: returns first key greater than or equal to iterator's * current position */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; + struct bpos iter_pos; int ret; + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + if (iter->update_path) { - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; } @@ -2362,12 +1995,30 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (!k.k || bkey_err(k)) - goto out; + goto out_no_locked; + + /* + * iter->pos should be mononotically increasing, and always be + * equal to the key we just returned - except extents can + * straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter_pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter_pos = bkey_start_pos(k.k); + else + iter_pos = iter->pos; + + if (bkey_cmp(iter_pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; + } if (iter->update_path && bkey_cmp(iter->update_path->pos, k.k->p)) { - 
bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; } @@ -2394,10 +2045,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - - BUG_ON(!(iter->update_path->nodes_locked & 1)); - iter->update_path->should_be_locked = true; + _THIS_IP_); } /* @@ -2421,25 +2069,21 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) break; } - /* - * iter->pos should be mononotically increasing, and always be equal to - * the key we just returned - except extents can straddle iter->pos: - */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter->pos = k.k->p; - else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter->pos = bkey_start_pos(k.k); + iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - BUG_ON(!iter->path->nodes_locked); -out: + + btree_path_set_should_be_locked(iter->path); +out_no_locked: if (iter->update_path) { - BUG_ON(!(iter->update_path->nodes_locked & 1)); - iter->update_path->should_be_locked = true; + if (iter->update_path->uptodate && + (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) + k = bkey_s_c_err(ret); + else + btree_path_set_should_be_locked(iter->update_path); } - iter->path->should_be_locked = true; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) iter->pos.snapshot = iter->snapshot; @@ -2455,6 +2099,100 @@ out: return k; } +/** + * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal + * to iterator's current position, returning keys from every level of the btree. + * For keys at different levels of the btree that compare equal, the key from + * the lower level (leaf) is returned first. 
+ */ +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + BUG_ON(iter->path->level < iter->min_depth); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + /* Already at end? */ + if (!btree_path_node(iter->path, iter->path->level)) { + k = bkey_s_c_null; + goto out_no_locked; + } + + k = btree_path_level_peek_all(trans->c, + &iter->path->l[iter->path->level], &iter->k); + + /* Check if we should go up to the parent node: */ + if (!k.k || + (iter->advanced && + !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; + btree_path_set_level_up(trans, iter->path); + iter->advanced = false; + continue; + } + + /* + * Check if we should go back down to a leaf: + * If we're not in a leaf node, we only return the current key + * if it exactly matches iter->pos - otherwise we first have to + * go back to the leaf: + */ + if (iter->path->level != iter->min_depth && + (iter->advanced || + !k.k || + bpos_cmp(iter->pos, k.k->p))) { + btree_path_set_level_down(trans, iter->path, iter->min_depth); + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + /* Check if we should go to the next key: */ + if (iter->path->level == iter->min_depth && + iter->advanced && + k.k && + !bpos_cmp(iter->pos, k.k->p)) { + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + if 
(iter->advanced && + iter->path->level == iter->min_depth && + bpos_cmp(k.k->p, iter->pos)) + iter->advanced = false; + + BUG_ON(iter->advanced); + BUG_ON(!k.k); + break; + } + + iter->pos = k.k->p; + btree_path_set_should_be_locked(iter->path); +out_no_locked: + bch2_btree_iter_verify(iter); + + return k; +} + /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -2503,19 +2241,19 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + goto out_no_locked; } - k = btree_path_level_peek(trans->c, iter->path, + k = btree_path_level_peek(trans, iter->path, &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 : bpos_cmp(k.k->p, search_key) > 0)) - k = btree_path_level_prev(trans->c, iter->path, + k = btree_path_level_prev(trans, iter->path, &iter->path->l[0], &iter->k); - btree_path_check_sort(trans, iter->path, 0); + bch2_btree_path_check_sort(trans, iter->path, 0); if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -2528,7 +2266,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) * that candidate */ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { - bch2_path_put(trans, iter->path, + bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; saved_path = NULL; @@ -2541,7 +2279,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->snapshot, k.k->p.snapshot)) { if (saved_path) - bch2_path_put(trans, saved_path, + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); @@ -2569,7 +2307,7 @@ got_key: /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto out; + goto 
out_no_locked; } } @@ -2581,10 +2319,11 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; -out: + + btree_path_set_should_be_locked(iter->path); +out_no_locked: if (saved_path) - bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); - iter->path->should_be_locked = true; + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2611,9 +2350,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2630,8 +2370,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } if ((iter->flags & BTREE_ITER_CACHED) || !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { @@ -2647,29 +2389,38 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && - (next_update = __btree_trans_peek_journal(trans, iter->path)) && - !bpos_cmp(next_update->k.p, iter->pos)) { + (next_update = bch2_btree_journal_peek_slot(trans, + iter, iter->pos))) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); goto out; } if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && - (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; - goto out; + /* We're not returning a key from iter->path: 
*/ + goto out_no_locked; } k = bch2_btree_path_peek_slot(iter->path, &iter->k); + if (unlikely(!k.k)) + goto out_no_locked; } else { struct bpos next; + EBUG_ON(iter->path->level); + if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek(&iter2); + k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { iter->k = iter2.k; @@ -2680,11 +2431,14 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos pos = iter->pos; k = bch2_btree_iter_peek(iter); - iter->pos = pos; + if (unlikely(bkey_err(k))) + bch2_btree_iter_set_pos(iter, pos); + else + iter->pos = pos; } if (unlikely(bkey_err(k))) - return k; + goto out_no_locked; next = k.k ? bkey_start_pos(k.k) : POS_MAX; @@ -2706,8 +2460,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - iter->path->should_be_locked = true; - + btree_path_set_should_be_locked(iter->path); +out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); ret = bch2_btree_iter_verify_ret(iter, k); @@ -2759,8 +2513,14 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; unsigned i; + if (!bch2_debug_check_iterators) + return; + trans_for_each_path_inorder(trans, path, i) { - BUG_ON(prev && btree_path_cmp(prev, path) > 0); + if (prev && btree_path_cmp(prev, path) > 0) { + bch2_dump_trans_paths_updates(trans); + panic("trans paths out of order!\n"); + } prev = path; } #endif @@ -2777,8 +2537,27 @@ static inline void btree_path_swap(struct btree_trans *trans, btree_path_verify_sorted_ref(trans, r); } -static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, - int cmp) +static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans, + struct btree_path *path, + int cmp) +{ + struct 
btree_path *n; + int cmp2; + + EBUG_ON(!cmp); + + while ((n = cmp < 0 + ? prev_btree_path(trans, path) + : next_btree_path(trans, path)) && + (cmp2 = btree_path_cmp(n, path)) && + cmp2 != cmp) + btree_path_swap(trans, n, path); + + btree_trans_verify_sorted(trans); +} + +inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, + int cmp) { struct btree_path *n; @@ -2834,6 +2613,11 @@ static inline void btree_path_list_add(struct btree_trans *trans, path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; + if (trans->in_traverse_all && + trans->traverse_all_idx != U8_MAX && + trans->traverse_all_idx >= path->sorted_idx) + trans->traverse_all_idx++; + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); for (i = path->sorted_idx; i < trans->nr_sorted; i++) @@ -2848,7 +2632,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_INTENT); if (iter->update_path) - bch2_path_put(trans, iter->update_path, + bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, @@ -2858,15 +2642,21 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static void __bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags, - unsigned long ip) +static inline void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) { - EBUG_ON(trans->restarted); + if (unlikely(trans->restarted)) + panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); + + if (flags & 
BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_node_type_is_extents(btree_id)) @@ -2880,15 +2670,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, btree_type_has_snapshots(btree_id)) flags |= BTREE_ITER_FILTER_SNAPSHOTS; - if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) + if (trans->journal_replay_not_finished) flags |= BTREE_ITER_WITH_JOURNAL; - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; - iter->trans = trans; iter->path = NULL; iter->update_path = NULL; @@ -2901,6 +2685,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->k.type = KEY_TYPE_deleted; iter->k.p = pos; iter->k.size = 0; + iter->journal_idx = 0; + iter->journal_pos = POS_MIN; #ifdef CONFIG_BCACHEFS_DEBUG iter->ip_allocated = ip; #endif @@ -2914,6 +2700,12 @@ void bch2_trans_iter_init(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags) { + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + __bch2_trans_iter_init(trans, iter, btree_id, pos, 0, 0, flags, _RET_IP_); } @@ -2946,36 +2738,34 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = NULL; } -void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - size_t new_top = trans->mem_top + size; + unsigned new_top = trans->mem_top + size; + size_t old_bytes = trans->mem_bytes; + size_t new_bytes = roundup_pow_of_two(new_top); + void *new_mem; void *p; - if (new_top > trans->mem_bytes) { - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = 
roundup_pow_of_two(new_top); - void *new_mem; + trans->mem_max = max(trans->mem_max, new_top); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - kfree(trans->mem); - } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } - if (!new_mem) - return ERR_PTR(-ENOMEM); + if (!new_mem) + return ERR_PTR(-ENOMEM); - trans->mem = new_mem; - trans->mem_bytes = new_bytes; + trans->mem = new_mem; + trans->mem_bytes = new_bytes; - if (old_bytes) { - trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); - } + if (old_bytes) { + trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } p = trans->mem + trans->mem_top; @@ -2988,30 +2778,23 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset * - * While iterating over nodes or updating nodes a attempt to lock a btree - * node may return EINTR when the trylock fails. When this occurs - * bch2_trans_begin() should be called and the transaction retried. + * While iterating over nodes or updating nodes a attempt to lock a btree node + * may return BCH_ERR_transaction_restart when the trylock fails. When this + * occurs bch2_trans_begin() should be called and the transaction retried. 
*/ -void bch2_trans_begin(struct btree_trans *trans) +u32 bch2_trans_begin(struct btree_trans *trans) { - struct btree_insert_entry *i; struct btree_path *path; - trans_for_each_update(trans, i) - __btree_path_put(i->path, true); + bch2_trans_reset_updates(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - trans->extra_journal_res = 0; - trans->nr_updates = 0; + trans->restart_count++; trans->mem_top = 0; - trans->hooks = NULL; - trans->extra_journal_entries = NULL; - trans->extra_journal_entry_u64s = 0; - if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; - memset(&trans->fs_usage_deltas->memset_start, 0, + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, (void *) &trans->fs_usage_deltas->memset_end - (void *) &trans->fs_usage_deltas->memset_start); } @@ -3019,6 +2802,14 @@ void bch2_trans_begin(struct btree_trans *trans) trans_for_each_path(trans, path) { path->should_be_locked = false; + /* + * If the transaction wasn't restarted, we're presuming to be + * doing something new: dont keep iterators excpt the ones that + * are in use - except for the subvolumes btree: + */ + if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) + path->preserve = false; + /* * XXX: we probably shouldn't be doing this if the transaction * was restarted, but currently we still overflow transaction @@ -3026,16 +2817,32 @@ void bch2_trans_begin(struct btree_trans *trans) */ if (!path->ref && !path->preserve) __bch2_path_free(trans, path); - else if (!path->ref) + else path->preserve = false; } - bch2_trans_cond_resched(trans); + if (!trans->restarted && + (need_resched() || + local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + bch2_trans_unlock(trans); + cond_resched(); + bch2_trans_relock(trans); + } + trans->last_restarted_ip = _RET_IP_; if (trans->restarted) bch2_btree_path_traverse_all(trans); - trans->restarted = false; + trans->last_begin_time = local_clock(); 
+ return trans->restart_count; +} + +void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) +{ + if (trans_was_restarted(trans, restart_count)) + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_restarted_ip); } static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) @@ -3047,7 +2854,7 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) BUG_ON(trans->used_mempool); #ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); + p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); #endif if (!p) p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); @@ -3056,35 +2863,71 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) trans->updates = p; p += updates_bytes; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, - unsigned expected_nr_iters, - size_t expected_mem_bytes, - const char *fn) +const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; + +unsigned bch2_trans_get_fn_idx(const char *fn) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) + if (!bch2_btree_transaction_fns[i] || + bch2_btree_transaction_fns[i] == fn) { + bch2_btree_transaction_fns[i] = fn; + return i; + } + + pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); + return i; +} + +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { + struct btree_transaction_stats *s; + struct btree_trans *pos; + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); memset(trans, 0, sizeof(*trans)); trans->c = c; - trans->fn = fn; + trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) + ? 
bch2_btree_transaction_fns[fn_idx] : NULL; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + closure_init_stack(&trans->ref); bch2_trans_alloc_paths(trans, c); - if (expected_mem_bytes) { - trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); - trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); + s = btree_trans_stats(trans); + if (s) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); if (!unlikely(trans->mem)) { trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); trans->mem_bytes = BTREE_TRANS_MEM_MAX; + } else { + trans->mem_bytes = expected_mem_bytes; } + + trans->nr_max_paths = s->nr_max_paths; } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->pid = current->pid; mutex_lock(&c->btree_trans_lock); - list_add(&trans->list, &c->btree_trans_list); + list_for_each_entry(pos, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } + } + list_add_tail(&trans->list, &c->btree_trans_list); +list_add_done: mutex_unlock(&c->btree_trans_lock); } @@ -3115,9 +2958,15 @@ void bch2_trans_exit(struct btree_trans *trans) { struct btree_insert_entry *i; struct bch_fs *c = trans->c; + struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); + closure_sync(&trans->ref); + + if (s) + s->max_mem = max(s->max_mem, trans->mem_max); + trans_for_each_update(trans, i) __btree_path_put(i->path, true); trans->nr_updates = 0; @@ -3132,6 +2981,8 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_journal_preres_put(&c->journal, &trans->journal_preres); + kfree(trans->extra_journal_entries.data); + if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + 
sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) @@ -3161,86 +3012,84 @@ void bch2_trans_exit(struct btree_trans *trans) } static void __maybe_unused -bch2_btree_path_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *_b, - bool cached) +bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + struct btree_bkey_cached_common *b) { - pr_buf(out, " l=%u %s:", - _b->level, bch2_btree_ids[_b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(_b, cached)); -} + struct six_lock_count c = six_lock_counts(&b->lock); + struct task_struct *owner; + pid_t pid; -static bool trans_has_locks(struct btree_trans *trans) -{ - struct btree_path *path; + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + rcu_read_unlock(); - trans_for_each_path(trans, path) - if (path->nodes_locked) - return true; - return false; + prt_tab(out); + prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', + b->level, bch2_btree_ids[b->btree_id]); + bch2_bpos_to_text(out, btree_node_pos(b)); + + prt_tab(out); + prt_printf(out, " locks %u:%u:%u held by pid %u", + c.n[0], c.n[1], c.n[2], pid); } -void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { - struct btree_trans *trans; struct btree_path *path; - struct btree *b; + struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; - mutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - if (!trans_has_locks(trans)) - continue; + if (!out->nr_tabstops) { + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 32); + } - pr_buf(out, "%i %s\n", trans->pid, trans->fn); + prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); - trans_for_each_path(trans, path) { - if (!path->nodes_locked) - continue; + trans_for_each_path(trans, path) { + if (!path->nodes_locked) + continue; - pr_buf(out, " 
path %u %c l=%u %s:", - path->idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_ids[path->btree_id]); - bch2_bpos_to_text(out, path->pos); - pr_buf(out, "\n"); - - for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(path, l)) { - pr_buf(out, " %s l=%u ", - btree_node_intent_locked(path, l) ? "i" : "r", l); - bch2_btree_path_node_to_text(out, - (void *) path->l[l].b, - path->cached); - pr_buf(out, "\n"); - } + prt_printf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 'c' : 'b', + path->level, + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); + prt_newline(out); + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + if (btree_node_locked(path, l) && + !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { + prt_printf(out, " %c l=%u ", + lock_types[btree_node_locked_type(path, l)], l); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); } } + } - b = READ_ONCE(trans->locking); - if (b) { - path = &trans->paths[trans->locking_path_idx]; - pr_buf(out, " locking path %u %c l=%u %c %s:", - trans->locking_path_idx, - path->cached ? 
'c' : 'b', - trans->locking_level, - lock_types[trans->locking_lock_type], - bch2_btree_ids[trans->locking_btree_id]); - bch2_bpos_to_text(out, trans->locking_pos); - - pr_buf(out, " node "); - bch2_btree_path_node_to_text(out, - (void *) b, path->cached); - pr_buf(out, "\n"); - } + b = READ_ONCE(trans->locking); + if (b) { + prt_str(out, " want"); + prt_newline(out); + prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); } - mutex_unlock(&c->btree_trans_lock); } void bch2_fs_btree_iter_exit(struct bch_fs *c) { + struct btree_transaction_stats *s; + + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) + kfree(s->max_paths_text); + if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); @@ -3249,9 +3098,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) int bch2_fs_btree_iter_init(struct bch_fs *c) { - unsigned nr = BTREE_ITER_MAX; + unsigned i, nr = BTREE_ITER_MAX; int ret; + for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++) + mutex_init(&c->btree_transaction_stats[i].lock); + INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 759c7b5..0775cfa 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -5,6 +5,8 @@ #include "bset.h" #include "btree_types.h" +#include + static inline void __btree_path_get(struct btree_path *path, bool intent) { path->ref++; @@ -70,11 +72,16 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) return &trans->paths[idx]; } -#define trans_for_each_path(_trans, _path) \ - for (_path = __trans_next_path((_trans), 0); \ +void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); + +#define trans_for_each_path_from(_trans, _path, _start) \ + for (_path = 
__trans_next_path((_trans), _start); \ (_path); \ _path = __trans_next_path((_trans), (_path)->idx + 1)) +#define trans_for_each_path(_trans, _path) \ + trans_for_each_path_from(_trans, _path, 0) + static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { unsigned idx = path ? path->sorted_idx + 1 : 0; @@ -124,9 +131,20 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, _path = __trans_next_path_with_node((_trans), (_b), \ (_path)->idx + 1)) -struct btree_path * __must_check -bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool, unsigned long); + +static inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + if (path->ref > 1 || path->preserve) + path = __bch2_btree_path_make_mut(trans, path, intent, ip); + path->should_be_locked = false; + return path; +} + struct btree_path * __must_check bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, struct bpos, bool, unsigned long); @@ -136,14 +154,18 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo unsigned, unsigned, unsigned, unsigned long); inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, + struct btree_iter *, struct bpos); + +inline void bch2_btree_path_level_init(struct btree_trans *, + struct btree_path *, struct btree *); + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_trans_verify_locks(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos, bool); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) 
{} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) {} #endif @@ -154,46 +176,50 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); +int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); void bch2_path_put(struct btree_trans *, struct btree_path *, bool); -bool bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +bool bch2_trans_locked(struct btree_trans *); -__always_inline -static inline int btree_trans_restart(struct btree_trans *trans) +static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) { - trans->restarted = true; - bch2_trans_unlock(trans); - return -EINTR; + return restart_count != trans->restart_count; } -bool bch2_btree_node_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); +void bch2_trans_verify_not_restarted(struct btree_trans *, u32); -static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) +__always_inline +static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + BUG_ON(err <= 0); + BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); - return path->locks_want < new_locks_want - ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) - : path->uptodate == BTREE_ITER_UPTODATE; + trans->restarted = err; + return -err; } -void __bch2_btree_path_downgrade(struct btree_path *, unsigned); +__always_inline +static inline int btree_trans_restart(struct btree_trans *trans, int err) +{ + btree_trans_restart_nounlock(trans, err); + return -err; +} + +bool bch2_btree_node_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); -static inline void bch2_btree_path_downgrade(struct btree_path *path) +static inline void bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path) { unsigned new_locks_want = path->level + !!path->intent_ref; if (path->locks_want > new_locks_want) - __bch2_btree_path_downgrade(path, new_locks_want); + __bch2_btree_path_downgrade(trans, path, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); @@ -207,9 +233,16 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); + +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_upto(iter, SPOS_MAX); +} + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -267,11 +300,28 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - iter->path->preserve = false; + if (!iter->trans->restarted) + iter->path->preserve = false; } -void 
*bch2_trans_kmalloc(struct btree_trans *, size_t); -void bch2_trans_begin(struct btree_trans *); +void *__bch2_trans_kmalloc(struct btree_trans *, size_t); + +static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + unsigned new_top = trans->mem_top + size; + void *p = trans->mem + trans->mem_top; + + if (likely(new_top <= trans->mem_bytes)) { + trans->mem_top += size; + memset(p, 0, size); + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + + } +} + +u32 bch2_trans_begin(struct btree_trans *); static inline struct btree * __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) @@ -279,7 +329,7 @@ __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter struct btree *b; while (b = bch2_btree_iter_peek_node(iter), - PTR_ERR_OR_ZERO(b) == -EINTR) + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) bch2_trans_begin(trans); return b; @@ -303,18 +353,44 @@ static inline int bkey_err(struct bkey_s_c k) return PTR_ERR_OR_ZERO(k.k); } +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, + unsigned flags) +{ + BUG_ON(flags & BTREE_ITER_ALL_LEVELS); + + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek_prev(iter); +} + static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, - unsigned flags) + unsigned flags) { - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_peek_slot(iter) - : bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : + flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek(iter); +} + +static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + if (!(flags & BTREE_ITER_SLOTS)) + return bch2_btree_iter_peek_upto(iter, end); + + if (bkey_cmp(iter->pos, end) > 0) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); } static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 - ? -EINTR : 0; + if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { + trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); + } + + return 0; } static inline struct bkey_s_c @@ -325,12 +401,124 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, while (btree_trans_too_many_iters(trans) || (k = bch2_btree_iter_peek_type(iter, flags), - bkey_err(k) == -EINTR)) + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) bch2_trans_begin(trans); return k; } +#define lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count; \ + int _ret; \ + \ + do { \ + _restart_count = bch2_trans_begin(_trans); \ + _ret = (_do); \ + } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ + \ + if (!_ret) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + _ret; \ +}) + +/* + * nested_lockrestart_do(), nested_commit_do(): + * + * These are like lockrestart_do() and commit_do(), with two differences: + * + * - We don't call bch2_trans_begin() unless we had a transaction restart + * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a + * transaction restart + */ +#define nested_lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count, _orig_restart_count; \ + int _ret; \ + \ + _restart_count = _orig_restart_count = (_trans)->restart_count; \ + \ + while (bch2_err_matches(_ret = (_do), 
BCH_ERR_transaction_restart))\ + _restart_count = bch2_trans_begin(_trans); \ + \ + if (!_ret) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ + _ret = -BCH_ERR_transaction_restart_nested; \ + \ + _ret; \ +}) + +#define for_each_btree_key2(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ + if (!(_k).k) { \ + _ret = 0; \ + break; \ + } \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ + if (!(_k).k) { \ + _ret = 0; \ + break; \ + } \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_rewind(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), 
(_commit_flags))) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -347,6 +535,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ for (; \ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ @@ -361,14 +557,28 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, /* new multiple iterator interface: */ +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, - unsigned, size_t, const char *); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); void bch2_trans_exit(struct btree_trans *); -#define bch2_trans_init(...) 
__bch2_trans_init(__VA_ARGS__, __func__) +extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +unsigned bch2_trans_get_fn_idx(const char *); + +#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ +do { \ + static unsigned trans_fn_idx; \ + \ + if (unlikely(!trans_fn_idx)) \ + trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ + \ + __bch2_trans_init(_trans, _c, trans_fn_idx); \ +} while (0) -void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); +void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); int bch2_fs_btree_iter_init(struct bch_fs *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 928aab6..cd52dd5 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_cache.h" @@ -5,6 +6,7 @@ #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" +#include "errcode.h" #include "error.h" #include "journal.h" #include "journal_reclaim.h" @@ -12,6 +14,11 @@ #include #include +static inline bool btree_uses_pcpu_readers(enum btree_id id) +{ + return id == BTREE_ID_subvolumes; +} + static struct kmem_cache *bch2_key_cache; static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -83,26 +90,185 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - list_move_tail(&ck->list, &bc->freed); - bc->nr_freed++; + if (ck->c.lock.readers) + list_move_tail(&ck->list, &bc->freed_pcpu); + else + list_move_tail(&ck->list, &bc->freed_nonpcpu); + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct 
bkey_cached *pos; + + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { + if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, + pos->btree_trans_barrier_seq)) { + list_move(&ck->list, &pos->list); + return; + } + } + + list_move(&ck->list, &bc->freed_nonpcpu); +} + +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct btree_key_cache_freelist *f; + bool freed = false; + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + if (!ck->c.lock.readers) { +#ifdef __KERNEL__ + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; + } + preempt_enable(); + + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + __bkey_cached_move_to_freelist_ordered(bc, ck2); + } + preempt_enable(); + + __bkey_cached_move_to_freelist_ordered(bc, ck); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_nonpcpu); + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); + mutex_unlock(&bc->lock); + } +} + +static void bkey_cached_free_fast(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_del_init(&ck->list); + atomic_long_inc(&bc->nr_freed); kfree(ck->k); ck->k = NULL; ck->u64s = 0; + bkey_cached_move_to_freelist(bc, ck); + six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } static struct bkey_cached * -bkey_cached_alloc(struct btree_key_cache *c) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) { - struct bkey_cached *ck; + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = 
&c->btree_key_cache; + struct bkey_cached *ck = NULL; + struct btree_key_cache_freelist *f; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + + if (!pcpu_readers) { +#ifdef __KERNEL__ + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (!list_empty(&bc->freed_nonpcpu) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + f->objs[f->nr++] = ck; + } + ck = f->nr ? f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_nonpcpu)) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { + ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); + } + + if (ck) { + int ret; + + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent); + if (unlikely(ret)) { + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + path->l[0].b = (void *) ck; + path->l[0].lock_seq = ck->c.lock.state.seq; + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + + ret = bch2_btree_node_lock_write(trans, path, &ck->c); + if (unlikely(ret)) { + btree_node_unlock(trans, path, 0); + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + return ck; + } + + /* GFP_NOFS because we're holding btree locks: */ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); + __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); + if (pcpu_readers) + six_lock_pcpu_alloc(&ck->c.lock); + + ck->c.cached = true; 
BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); return ck; @@ -120,15 +286,6 @@ bkey_cached_reuse(struct btree_key_cache *c) unsigned i; mutex_lock(&c->lock); - list_for_each_entry_reverse(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) { - c->nr_freed--; - list_del(&ck->list); - mutex_unlock(&c->lock); - return ck; - } - mutex_unlock(&c->lock); - rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) @@ -136,46 +293,47 @@ bkey_cached_reuse(struct btree_key_cache *c) if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(c, ck); - rcu_read_unlock(); - return ck; + goto out; } } + ck = NULL; +out: rcu_read_unlock(); - - return NULL; + mutex_unlock(&c->lock); + return ck; } static struct bkey_cached * -btree_key_cache_create(struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) { + struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(bc); + ck = bkey_cached_alloc(trans, path); + if (IS_ERR(ck)) + return ck; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_ids[btree_id]); + bch2_btree_ids[path->btree_id]); return ERR_PTR(-ENOMEM); } + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); was_new = false; + } else { + if (path->btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); } - if (btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); - else - six_lock_pcpu_free(&ck->c.lock); - ck->c.level = 0; - ck->c.btree_id = btree_id; - ck->key.btree_id = btree_id; - ck->key.pos = pos; + ck->c.btree_id = path->btree_id; + ck->key.btree_id = path->btree_id; + ck->key.pos = path->pos; ck->valid = false; ck->flags = 1U << 
BKEY_CACHED_ACCESSED; @@ -187,11 +345,10 @@ btree_key_cache_create(struct bch_fs *c, if (likely(was_new)) { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); kfree(ck); } else { - mutex_lock(&bc->lock); - bkey_cached_free(bc, ck); - mutex_unlock(&bc->lock); + bkey_cached_free_fast(bc, ck); } return NULL; @@ -224,9 +381,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, k = bch2_btree_path_peek_slot(path, &u); if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_trans_restart_relock_key_cache_fill(trans->fn, - _THIS_IP_, ck_path->btree_id, &ck_path->pos); - ret = btree_trans_restart(trans); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); goto err; } @@ -236,6 +392,13 @@ static int btree_key_cache_fill(struct btree_trans *trans, */ new_u64s = k.k->u64s + 1; + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + new_u64s = min(256U, (new_u64s * 3) / 2); + if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); @@ -247,11 +410,12 @@ static int btree_key_cache_fill(struct btree_trans *trans, } } - /* - * XXX: not allowed to be holding read locks when we take a write lock, - * currently - */ - bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); + if (ret) { + kfree(new_k); + goto err; + } + if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -269,18 +433,9 @@ err: return ret; } -static int bkey_cached_check_fn(struct six_lock *lock, void *p) -{ - struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_path *path = p; - - return ck->key.btree_id == path->btree_id && - 
!bpos_cmp(ck->key.pos, path->pos) ? 0 : -1; -} - -__flatten -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static noinline int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_cached *ck; @@ -297,32 +452,24 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - if (flags & BTREE_ITER_CACHED_NOCREATE) { - path->l[0].b = NULL; - return 0; - } - - ck = btree_key_cache_create(c, path->btree_id, path->pos); + ck = btree_key_cache_create(trans, path); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; if (!ck) goto retry; - mark_btree_node_locked(path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, - lock_want, - bkey_cached_check_fn, path, _THIS_IP_)) { - if (!trans->restarted) - goto retry; - - ret = -EINTR; + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto err; - } + + BUG_ON(ret); if (ck->key.btree_id != path->btree_id || bpos_cmp(ck->key.pos, path->pos)) { @@ -330,17 +477,21 @@ retry: goto retry; } - mark_btree_node_locked(path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, lock_want); } path->l[0].lock_seq = ck->c.lock.state.seq; path->l[0].b = (void *) ck; fill: - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!ck->valid) { + /* + * Using the underscore version because we haven't set + * path->uptodate yet: + */ if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_ip(trans->fn, _THIS_IP_); - ret = btree_trans_restart(trans); + trace_and_count(trans->c, 
trans_restart_key_cache_upgrade, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } @@ -353,17 +504,72 @@ fill: set_bit(BKEY_CACHED_ACCESSED, &ck->flags); path->uptodate = BTREE_ITER_UPTODATE; + BUG_ON(!ck->valid); BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); return ret; err: - if (ret != -EINTR) { - btree_node_unlock(path, 0); - path->l[0].b = BTREE_ITER_NO_NODE_ERROR; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); } return ret; } +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + return ret; + + if (ck->key.btree_id != path->btree_id || + bpos_cmp(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].b = (void *) ck; +fill: + if (!ck->valid) + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + path->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(!ck->valid); + EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; +} + static int 
btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, @@ -382,8 +588,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_ALL_SNAPSHOTS); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; @@ -410,7 +614,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need * to be using alloc reserves: - * */ + */ ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_KEY_CACHE_RECLAIM| @@ -421,16 +625,17 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? BTREE_INSERT_JOURNAL_RESERVED + ? JOURNAL_WATERMARK_reserved : 0)| commit_flags); - if (ret) { - bch2_fs_fatal_err_on(ret != -EINTR && - ret != -EAGAIN && - !bch2_journal_error(j), c, - "error flushing key cache: %i", ret); + + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && + !bch2_journal_error(j), c, + "error flushing key cache: %s", bch2_err_str(ret)); + if (ret) goto out; - } bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); @@ -443,24 +648,22 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, atomic_long_dec(&c->btree_key_cache.nr_dirty); } } else { + struct btree_path *path2; evict: - BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); + trans_for_each_path(trans, path2) + if (path2 != c_iter.path) + __bch2_btree_path_unlock(trans, path2); - mark_btree_node_unlocked(c_iter.path, 0); - c_iter.path->l[0].b = NULL; - - six_lock_write(&ck->c.lock, NULL, NULL); + 
bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } + mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); - - mutex_lock(&c->btree_key_cache.lock); - bkey_cached_free(&c->btree_key_cache, ck); - mutex_unlock(&c->btree_key_cache.lock); + bkey_cached_free_fast(&c->btree_key_cache, ck); } out: bch2_trans_iter_exit(trans, &b_iter); @@ -475,11 +678,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; + struct btree_trans trans; + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + bch2_trans_init(&trans, c, 0, 0); - six_lock_read(&ck->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -489,12 +694,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, btree_key_cache_flush_pos(&trans, key, seq, BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + bch2_trans_exit(&trans); return ret; } @@ -555,13 +761,26 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, return true; } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, - enum btree_id id, struct bpos pos) +void bch2_btree_key_cache_drop(struct btree_trans *trans, + struct btree_path *path) { - BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + + BUG_ON(!ck->valid); + + /* + * We just did an update to the btree, bypassing the key cache: the 
key + * cache key is now stale and must be dropped, even if dirty: + */ + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + bch2_journal_pin_drop(&c->journal, &ck->journal); + } + + ck->valid = false; } -#endif static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct shrink_control *sc) @@ -575,12 +794,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, unsigned start, flags; int srcu_idx; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - return -1; - + mutex_lock(&bc->lock); srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); @@ -588,14 +802,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ - list_for_each_entry_safe(ck, t, &bc->freed, list) { + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) break; list_del(&ck->list); + six_lock_pcpu_free(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); - bc->nr_freed--; + atomic_long_dec(&bc->nr_freed); + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_pcpu_free(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); scanned++; freed++; } @@ -668,23 +899,45 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) struct bkey_cached *ck, *n; struct rhash_head *pos; unsigned i; +#ifdef __KERNEL__ + int cpu; +#endif if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); 
mutex_lock(&bc->lock); - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); - } - rcu_read_unlock(); + /* + * The loop is needed to guard against racing with rehash: + */ + while (atomic_long_read(&bc->nr_keys)) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed_nonpcpu); + } + rcu_read_unlock(); + } + +#ifdef __KERNEL__ + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); - list_for_each_entry_safe(ck, n, &bc->freed, list) { + for (i = 0; i < f->nr; i++) { + ck = f->objs[i]; + list_add(&ck->list, &bc->freed_nonpcpu); + } + } +#endif + + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); @@ -692,53 +945,80 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); + six_lock_pcpu_free(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } - BUG_ON(atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)); - BUG_ON(atomic_long_read(&bc->nr_keys)); + if (atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)) + panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", + atomic_long_read(&bc->nr_dirty)); + + if (atomic_long_read(&bc->nr_keys)) + panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", + atomic_long_read(&bc->nr_keys)); mutex_unlock(&bc->lock); if (bc->table_init_done) rhashtable_destroy(&bc->table); + + free_percpu(bc->pcpu_freed); } void 
bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed); + INIT_LIST_HEAD(&c->freed_pcpu); + INIT_LIST_HEAD(&c->freed_nonpcpu); } -int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) +static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) { + struct btree_key_cache *bc = + container_of(shrink, struct btree_key_cache, shrink); + + bch2_btree_key_cache_to_text(out, bc); +} + +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); int ret; - ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); +#ifdef __KERNEL__ + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return -ENOMEM; +#endif + + ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); if (ret) return ret; - c->table_init_done = true; + bc->table_init_done = true; - c->shrink.seeks = 1; - c->shrink.count_objects = bch2_btree_key_cache_count; - c->shrink.scan_objects = bch2_btree_key_cache_scan; - return register_shrinker(&c->shrink); + bc->shrink.seeks = 0; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; + return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name); } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); - pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); - pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); + prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); + prt_newline(out); + prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + prt_newline(out); + prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_newline(out); } void bch2_btree_key_cache_exit(void) { - if 
(bch2_key_cache) - kmem_cache_destroy(bch2_key_cache); + kmem_cache_destroy(bch2_key_cache); } int __init bch2_btree_key_cache_init(void) diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index b3d241b..670746e 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -32,14 +32,8 @@ bool bch2_btree_insert_key_cached(struct btree_trans *, struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_key_cache_verify_clean(struct btree_trans *, - enum btree_id, struct bpos); -#else -static inline void -bch2_btree_key_cache_verify_clean(struct btree_trans *trans, - enum btree_id id, struct bpos pos) {} -#endif +void bch2_btree_key_cache_drop(struct btree_trans *, + struct btree_path *); void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c new file mode 100644 index 0000000..9d09043 --- /dev/null +++ b/libbcachefs/btree_locking.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_types.h" + +struct lock_class_key bch2_btree_node_lock_key; + +/* Btree node locking: */ + +static inline void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (lock->readers) + this_cpu_add(*lock->readers, nr); + else if (nr > 0) + atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); + else + atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); +} + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, + struct btree_path *skip, + struct btree_bkey_cached_common *b, + unsigned level) +{ + struct btree_path *path; + struct six_lock_count ret; + + memset(&ret, 0, sizeof(ret)); + + if (IS_ERR_OR_NULL(b)) + return ret; + + trans_for_each_path(trans, path) + if (path != skip 
&& &path->l[level].b->c == b) { + int t = btree_node_locked_type(path, level); + + if (t != BTREE_NODE_UNLOCKED) + ret.n[t]++; + } + + return ret; +} + +/* unlock */ + +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ + bch2_btree_node_unlock_write_inlined(trans, path, b); +} + +/* lock */ + +/* + * @trans wants to lock @b with type @type + */ +struct trans_waiting_for_lock { + struct btree_trans *trans; + struct btree_bkey_cached_common *node_want; + enum six_lock_type lock_want; + + /* for iterating over held locks :*/ + u8 path_idx; + u8 level; + u64 lock_start_time; +}; + +struct lock_graph { + struct trans_waiting_for_lock g[8]; + unsigned nr; +}; + +static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + prt_printf(out, "Found lock cycle (%u entries):", g->nr); + prt_newline(out); + + for (i = g->g; i < g->g + g->nr; i++) + bch2_btree_trans_to_text(out, i->trans); +} + +static noinline void print_chain(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g; i != g->g + g->nr; i++) { + if (i != g->g) + prt_str(out, "<- "); + prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + } + prt_newline(out); +} + +static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; +} + +static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr > i) + lock_graph_up(g); + return true; + } + + 
return false; +} + +static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + if (i == g->g) { + trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + } else { + i->trans->lock_must_abort = true; + wake_up_process(i->trans->locking_wait.task); + return 0; + } +} + +static int btree_trans_abort_preference(struct btree_trans *trans) +{ + if (trans->lock_may_not_fail) + return 0; + if (trans->locking_wait.lock_want == SIX_LOCK_write) + return 1; + if (!trans->in_traverse_all) + return 2; + return 3; +} + +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) +{ + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best = 0, pref; + int ret; + + if (lock_graph_remove_non_waiters(g)) + return 0; + + /* Only checking, for debugfs: */ + if (cycle) { + print_cycle(cycle, g); + ret = -1; + goto out; + } + + for (i = g->g; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; + best = pref; + } + } + + if (unlikely(!best)) { + struct bch_fs *c = g->g->trans->c; + struct printbuf buf = PRINTBUF; + + bch_err(c, "cycle of nofail locks"); + + for (i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + bch2_prt_backtrace(&buf, trans->locking_wait.task); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); + } + + ret = abort_lock(g, abort); +out: + if (ret) + while (g->nr) + lock_graph_up(g); + return ret; +} + +static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + struct printbuf *cycle) +{ + struct btree_trans *orig_trans = g->g->trans; + struct trans_waiting_for_lock *i; + + for (i = g->g; i < 
g->g + g->nr; i++) + if (i->trans == trans) + return break_cycle(g, cycle); + + if (g->nr == ARRAY_SIZE(g->g)) { + if (orig_trans->lock_may_not_fail) + return 0; + + while (g->nr) + lock_graph_up(g); + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); + return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); + } + + lock_graph_down(g, trans); + return 0; +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) +{ + return t1 + t2 > 1; +} + +int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) +{ + struct lock_graph g; + struct trans_waiting_for_lock *top; + struct btree_bkey_cached_common *b; + struct btree_path *path; + int ret; + + if (trans->lock_must_abort) { + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + } + + g.nr = 0; + lock_graph_down(&g, trans); +next: + if (!g.nr) + return 0; + + top = &g.g[g.nr - 1]; + + trans_for_each_path_from(top->trans, path, top->path_idx) { + if (!path->nodes_locked) + continue; + + if (top->path_idx != path->idx) { + top->path_idx = path->idx; + top->level = 0; + top->lock_start_time = 0; + } + + for (; + top->level < BTREE_MAX_DEPTH; + top->level++, top->lock_start_time = 0) { + int lock_held = btree_node_locked_type(path, top->level); + + if (lock_held == BTREE_NODE_UNLOCKED) + continue; + + b = &READ_ONCE(path->l[top->level].b)->c; + + if (IS_ERR_OR_NULL(b)) { + BUG_ON(!lock_graph_remove_non_waiters(&g)); + goto next; + } + + if (list_empty_careful(&b->lock.wait_list)) + continue; + + raw_spin_lock(&b->lock.wait_lock); + list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { + BUG_ON(b != trans->locking); + + if (top->lock_start_time && + time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) + continue; + + top->lock_start_time = 
trans->locking_wait.start_time; + + /* Don't check for self deadlock: */ + if (trans == top->trans || + !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) + continue; + + ret = lock_graph_descend(&g, trans, cycle); + raw_spin_unlock(&b->lock.wait_lock); + + if (ret) + return ret; + goto next; + + } + raw_spin_unlock(&b->lock.wait_lock); + } + } + + if (g.nr > 1 && cycle) + print_chain(cycle, &g); + lock_graph_up(&g); + goto next; +} + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) +{ + struct btree_trans *trans = p; + + return bch2_check_for_deadlock(trans, NULL); +} + +int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) +{ + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; + int ret; + + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + six_lock_readers_add(&b->lock, -readers); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); + six_lock_readers_add(&b->lock, readers); + + if (ret) + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); + + return ret; +} + +/* relock */ + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade) +{ + unsigned l = path->level; + int fail_idx = -1; + + do { + if (!btree_path_node(path, l)) + break; + + if (!(upgrade + ? 
bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) + fail_idx = l; + + l++; + } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + if (fail_idx >= 0) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = upgrade + ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + --fail_idx; + } while (fail_idx >= 0); + } + + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + + return path->uptodate < BTREE_ITER_NEED_RELOCK; +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level, + bool trace) +{ + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); + + if (race_fault()) + goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, want))) { + mark_btree_node_locked(trans, path, level, want); + return true; + } +fail: + if (trace) + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); + return false; +} + +/* upgrade */ + +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = path->l[level].b; + struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); + + if (!is_btree_node(path, level)) + return false; + + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); 
+ case BTREE_NODE_INTENT_LOCKED: + break; + case BTREE_NODE_WRITE_LOCKED: + BUG(); + } + + if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(path, level)) { + bool ret; + + six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); + ret = six_lock_tryupgrade(&b->c.lock); + six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); + + if (ret) + goto success; + } else { + if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + } + + /* + * Do we already have an intent lock via another path? If so, just bump + * lock count: + */ + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); + goto success; + } + + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); + return false; +success: + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + return true; +} + +/* Btree path locking: */ + +/* + * Only for btree_cache.c - only relocks intent locks + */ +int bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned l; + + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + + return 0; +} + +__flatten +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_path_get_locks(trans, path, false); +} + +__flatten +bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return 
btree_path_get_locks(trans, path, true); +} + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + EBUG_ON(path->locks_want >= new_locks_want); + + path->locks_want = new_locks_want; + + return btree_path_get_locks(trans, path, true); +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + struct btree_path *linked; + + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) + return true; + + /* + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_path_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). 
+ */ + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true); + } + + return false; +} + +void __bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; + + EBUG_ON(path->locks_want < new_locks_want); + + path->locks_want = new_locks_want; + + while (path->nodes_locked && + (l = btree_path_highest_level_locked(path)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); + } + break; + } + } + + bch2_btree_path_verify_locks(path); +} + +/* Btree transaction locking: */ + +void bch2_trans_downgrade(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(trans, path); +} + +int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +void bch2_trans_unlock(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); + + /* + * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking + * btree nodes, it implements its own walking: + */ + EBUG_ON(!trans->is_initial_gc && + lock_class_is_held(&bch2_btree_node_lock_key)); +} + +bool 
bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->nodes_locked) + return true; + return false; +} + +/* Debug */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +void bch2_btree_path_verify_locks(struct btree_path *path) +{ + unsigned l; + + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); + return; + } + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + int want = btree_lock_want(path, l); + int have = btree_node_locked_type(path, l); + + BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); + + BUG_ON(is_btree_node(path, l) && + (want == BTREE_NODE_UNLOCKED || + have != BTREE_NODE_WRITE_LOCKED) && + want != have); + } +} + +void bch2_trans_verify_locks(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); +} + +#endif diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index b4434ec..bf8d188 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -14,66 +14,82 @@ #include "btree_iter.h" +extern struct lock_class_key bch2_btree_node_lock_key; + +static inline bool is_btree_node(struct btree_path *path, unsigned l) +{ + return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); +} + +static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) +{ + return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) + ? 
&trans->c->btree_transaction_stats[trans->fn_idx] + : NULL; +} + /* matches six lock types */ enum btree_node_locked_type { BTREE_NODE_UNLOCKED = -1, BTREE_NODE_READ_LOCKED = SIX_LOCK_read, BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, + BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, }; static inline int btree_node_locked_type(struct btree_path *path, unsigned level) { - /* - * We're relying on the fact that if nodes_intent_locked is set - * nodes_locked must be set as well, so that we can compute without - * branches: - */ - return BTREE_NODE_UNLOCKED + - ((path->nodes_locked >> level) & 1) + - ((path->nodes_intent_locked >> level) & 1); + return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } -static inline bool btree_node_intent_locked(struct btree_path *path, - unsigned level) +static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) { - return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; + return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; } -static inline bool btree_node_read_locked(struct btree_path *path, - unsigned level) +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) { - return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; + return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; } -static inline bool btree_node_locked(struct btree_path *path, unsigned level) +static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) { - return path->nodes_locked & (1 << level); + return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; } -static inline void mark_btree_node_unlocked(struct btree_path *path, - unsigned level) +static inline bool btree_node_locked(struct btree_path *path, unsigned level) { - path->nodes_locked &= ~(1 << level); - path->nodes_intent_locked &= ~(1 << level); + return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; } -static inline void mark_btree_node_locked(struct 
btree_path *path, - unsigned level, - enum six_lock_type type) +static inline void mark_btree_node_locked_noreset(struct btree_path *path, + unsigned level, + enum btree_node_locked_type type) { /* relying on this to avoid a branch */ BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - path->nodes_locked |= 1 << level; - path->nodes_intent_locked |= type << level; + path->nodes_locked &= ~(3U << (level << 1)); + path->nodes_locked |= (type + 1) << (level << 1); } -static inline void mark_btree_node_intent_locked(struct btree_path *path, - unsigned level) +static inline void mark_btree_node_unlocked(struct btree_path *path, + unsigned level) +{ + EBUG_ON(btree_node_write_locked(path, level)); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); +} + +static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + enum six_lock_type type) { - mark_btree_node_locked(path, level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, level, type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[level].lock_taken_time = local_clock(); +#endif } static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -95,161 +111,308 @@ btree_lock_want(struct btree_path *path, int level) return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_path *path, unsigned level) +static void btree_trans_lock_hold_time_update(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + struct btree_transaction_stats *s = btree_trans_stats(trans); + + if (s) + __bch2_time_stats_update(&s->lock_hold_times, + path->l[level].lock_taken_time, + local_clock()); +#endif +} + +/* unlock: */ + +static inline void btree_node_unlock(struct btree_trans *trans, + struct btree_path *path, unsigned level) { int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); - if (lock_type != 
BTREE_NODE_UNLOCKED) + if (lock_type != BTREE_NODE_UNLOCKED) { six_unlock_type(&path->l[level].b->c.lock, lock_type); + btree_trans_lock_hold_time_update(trans, path, level); + } mark_btree_node_unlocked(path, level); } -static inline void __bch2_btree_path_unlock(struct btree_path *path) +static inline int btree_path_lowest_level_locked(struct btree_path *path) +{ + return __ffs(path->nodes_locked) >> 1; +} + +static inline int btree_path_highest_level_locked(struct btree_path *path) +{ + return __fls(path->nodes_locked) >> 1; +} + +static inline void __bch2_btree_path_unlock(struct btree_trans *trans, + struct btree_path *path) { btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) - btree_node_unlock(path, __ffs(path->nodes_locked)); + btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); } -static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) { - switch (type) { - case SIX_LOCK_read: - return BCH_TIME_btree_lock_contended_read; - case SIX_LOCK_intent: - return BCH_TIME_btree_lock_contended_intent; - case SIX_LOCK_write: - return BCH_TIME_btree_lock_contended_write; - default: - BUG(); - } + struct btree_path *linked; + + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq += 2; + + six_unlock_write(&b->c.lock); } -static inline bool btree_node_lock_type(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - 
six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - struct bch_fs *c = trans->c; - u64 start_time; - bool ret; +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); - if (six_trylock_type(&b->c.lock, type)) - return true; +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); - start_time = local_clock(); +/* lock: */ - trans->locking_path_idx = path->idx; - trans->locking_pos = pos; - trans->locking_btree_id = path->btree_id; - trans->locking_level = level; - trans->locking_lock_type = type; - trans->locking = b; - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; - trans->locking = NULL; +static inline int __btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + bool lock_may_not_fail) +{ + int ret; - if (ret) - bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + trans->lock_may_not_fail = lock_may_not_fail; + trans->lock_must_abort = false; + trans->locking = b; + ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans); + WRITE_ONCE(trans->locking, NULL); + WRITE_ONCE(trans->locking_wait.start_time, 0); return ret; } +static inline int __must_check +btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + return __btree_node_lock_nopath(trans, b, type, false); +} + +static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + int ret = __btree_node_lock_nopath(trans, b, type, true); + + BUG_ON(ret); +} + /* * Lock a btree node if we already have it locked on one of our linked * iterators: */ static inline bool btree_node_lock_increment(struct btree_trans *trans, - struct btree *b, unsigned level, + struct btree_bkey_cached_common *b, + unsigned level, enum btree_node_locked_type want) { struct btree_path *path; 
trans_for_each_path(trans, path) - if (path->l[level].b == b && + if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->c.lock, want); + six_lock_increment(&b->lock, want); return true; } return false; } -bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, - struct btree *, struct bpos, unsigned, - enum six_lock_type, - six_lock_should_sleep_fn, void *, - unsigned long); - -static inline bool btree_node_lock(struct btree_trans *trans, +static inline int btree_node_lock(struct btree_trans *trans, struct btree_path *path, - struct btree *b, struct bpos pos, unsigned level, + struct btree_bkey_cached_common *b, + unsigned level, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { + int ret = 0; + EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - return likely(six_trylock_type(&b->c.lock, type)) || - btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(trans, path, b, pos, level, type, - should_sleep_fn, p, ip); + if (likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(trans, b, level, type) || + !(ret = btree_node_lock_nopath(trans, b, type))) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[b->level].lock_taken_time = local_clock(); +#endif + } + + return ret; } -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); +int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, + struct btree_bkey_cached_common *b, bool); + +static inline int __btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) +{ + EBUG_ON(&path->l[b->level].b->c != b); + EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(!btree_node_intent_locked(path, b->level)); + + /* + * six locks are unfair, and read locks block while a thread wants a + * 
write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); + + return likely(six_trylock_write(&b->lock)) + ? 0 + : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); +} + +static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + int ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); +} + +static inline int __must_check +bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + return __btree_node_lock_write(trans, path, b, false); +} + +/* relock: */ + +bool bch2_btree_path_relock_norestart(struct btree_trans *, + struct btree_path *, unsigned long); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); static inline bool bch2_btree_node_relock(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type(path, level) != - __btree_lock_want(path, level)); + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || - __bch2_btree_node_relock(trans, path, level); + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, true)); } -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) +static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree_path *linked; + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + 
btree_node_locked_type(path, level) != __btree_lock_want(path, level)); - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, false)); +} - trans_for_each_path_with_node(trans, b, linked) - linked->l[b->c.level].lock_seq += 2; +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } - six_unlock_write(&b->c.lock); + return 0; } -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); +/* upgrade */ -void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, + struct btree_path *, unsigned); +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); -static inline void bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static inline int bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); - EBUG_ON(!btree_node_intent_locked(path, b->c.level)); + unsigned old_locks_want = path->locks_want; + + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + if (path->locks_want < new_locks_want + ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE) + return 0; - if (unlikely(!six_trylock_write(&b->c.lock))) - __bch2_btree_node_lock_write(trans, b); + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, + old_locks_want, new_locks_want); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); } -#endif /* _BCACHEFS_BTREE_LOCKING_H */ +/* misc: */ + +static inline void btree_path_set_should_be_locked(struct btree_path *path) +{ + EBUG_ON(!btree_node_locked(path, path->level)); + EBUG_ON(path->uptodate); + + path->should_be_locked = true; +} +static inline void __btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path, + unsigned l) +{ + btree_node_unlock(trans, path, l); + path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); +} +static inline void btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path) +{ + __btree_path_set_level_up(trans, path, path->level++); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +} + +/* debug */ + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *b, + unsigned); + +int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_path_verify_locks(struct btree_path *); +void bch2_trans_verify_locks(struct btree_trans *); +#else +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +#endif + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 68272f2..892d123 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -6,8 +6,9 @@ #include #include -#include "bkey_methods.h" +//#include "bkey_methods.h" #include "buckets_types.h" +#include "darray.h" #include "journal_types.h" struct open_bucket; @@ 
-62,6 +63,7 @@ struct btree_bkey_cached_common { struct six_lock lock; u8 level; u8 btree_id; + bool cached; }; struct btree { @@ -152,11 +154,22 @@ struct btree_cache { struct mutex lock; struct list_head live; struct list_head freeable; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + unsigned freed; + unsigned not_freed_lock_intent; + unsigned not_freed_lock_write; + unsigned not_freed_dirty; + unsigned not_freed_read_in_flight; + unsigned not_freed_write_in_flight; + unsigned not_freed_noevict; + unsigned not_freed_write_blocked; + unsigned not_freed_will_make_reachable; + unsigned not_freed_access_bit; atomic_t dirty; struct shrinker shrink; @@ -180,22 +193,16 @@ struct btree_node_iter { * Iterate over all possible positions, synthesizing deleted keys for holes: */ #define BTREE_ITER_SLOTS (1 << 0) +#define BTREE_ITER_ALL_LEVELS (1 << 1) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 1) +#define BTREE_ITER_INTENT (1 << 2) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 2) -/* - * Indicates that this iterator should not be reused until transaction commit, - * either because a pending update references it or because the update depends - * on that particular key being locked (e.g. 
by the str_hash code, for hash - * table consistency) - */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos @@ -203,15 +210,13 @@ struct btree_node_iter { #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_NOT_EXTENTS (1 << 5) #define BTREE_ITER_CACHED (1 << 6) -#define BTREE_ITER_CACHED_NOFILL (1 << 7) -#define BTREE_ITER_CACHED_NOCREATE (1 << 8) -#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) -#define BTREE_ITER_WITH_UPDATES (1 << 10) -#define BTREE_ITER_WITH_JOURNAL (1 << 11) -#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) -#define BTREE_ITER_NOPRESERVE (1 << 15) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 7) +#define BTREE_ITER_WITH_UPDATES (1 << 8) +#define BTREE_ITER_WITH_JOURNAL (1 << 9) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 10) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 11) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) +#define BTREE_ITER_NOPRESERVE (1 << 13) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -219,15 +224,6 @@ enum btree_path_uptodate { BTREE_ITER_NEED_TRAVERSE = 2, }; -#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) -#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) - struct btree_path { u8 idx; u8 sorted_idx; @@ -247,14 +243,16 @@ struct btree_path { */ bool should_be_locked:1; unsigned level:3, - locks_want:4, - nodes_locked:4, - nodes_intent_locked:4; + locks_want:4; + u8 nodes_locked; struct btree_path_level { struct btree *b; struct 
btree_node_iter iter; u32 lock_seq; +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + u64 lock_taken_time; +#endif } l[BTREE_MAX_DEPTH]; #ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; @@ -280,7 +278,8 @@ struct btree_iter { struct btree_path *key_cache_path; enum btree_id btree_id:4; - unsigned min_depth:4; + unsigned min_depth:3; + unsigned advanced:1; /* btree_iter_copy starts here: */ u16 flags; @@ -295,20 +294,31 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; + + /* BTREE_ITER_WITH_JOURNAL: */ + size_t journal_idx; + struct bpos journal_pos; #ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; #endif }; +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + struct btree_key_cache { struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; struct shrinker shrink; unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; - size_t nr_freed; + atomic_long_t nr_freed; atomic_long_t nr_keys; atomic_long_t nr_dirty; }; @@ -325,7 +335,7 @@ struct bkey_cached { struct btree_bkey_cached_common c; unsigned long flags; - u8 u64s; + u16 u64s; bool valid; u32 btree_trans_barrier_seq; struct bkey_cached_key key; @@ -339,16 +349,32 @@ struct bkey_cached { struct bkey_i *k; }; +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) +{ + return !b->cached + ? 
container_of(b, struct btree, c)->key.k.p + : container_of(b, struct bkey_cached, c)->key.pos; +} + struct btree_insert_entry { unsigned flags; u8 bkey_type; enum btree_id btree_id:8; - u8 level; + u8 level:4; bool cached:1; bool insert_trigger_run:1; bool overwrite_trigger_run:1; + bool key_cache_already_flushed:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; struct bkey_i *k; struct btree_path *path; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; unsigned long ip_allocated; }; @@ -366,36 +392,48 @@ struct btree_trans_commit_hook { struct btree_trans_commit_hook *next; }; -#define BTREE_TRANS_MEM_MAX (1U << 14) +#define BTREE_TRANS_MEM_MAX (1U << 16) + +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 struct btree_trans { struct bch_fs *c; const char *fn; + struct closure ref; struct list_head list; - struct btree *locking; - unsigned locking_path_idx; - struct bpos locking_pos; - u8 locking_btree_id; - u8 locking_level; - u8 locking_lock_type; - pid_t pid; + u64 last_begin_time; + + u8 lock_may_not_fail; + u8 lock_must_abort; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + int srcu_idx; + u8 fn_idx; u8 nr_sorted; u8 nr_updates; + u8 traverse_all_idx; bool used_mempool:1; bool in_traverse_all:1; - bool restarted:1; - bool journal_transaction_names:1; + bool memory_allocation_failure:1; + bool is_initial_gc:1; + bool journal_replay_not_finished:1; + enum bch_errcode restarted:16; + u32 restart_count; + unsigned long last_restarted_ip; + /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: */ unsigned extra_journal_res; + unsigned nr_max_paths; u64 paths_allocated; unsigned mem_top; + unsigned mem_max; unsigned mem_bytes; void *mem; @@ -405,8 +443,7 @@ struct btree_trans { /* update path: */ struct btree_trans_commit_hook *hooks; - struct jset_entry 
*extra_journal_entries; - unsigned extra_journal_entry_u64s; + DARRAY(u64) extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; @@ -419,7 +456,31 @@ struct btree_trans { struct replicas_delta_list *fs_usage_deltas; }; -#define BTREE_FLAG(flag) \ +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x +}; + +#define x(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ @@ -429,36 +490,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } -enum btree_flags { - BTREE_NODE_read_in_flight, - BTREE_NODE_read_error, - BTREE_NODE_dirty, - BTREE_NODE_need_write, - BTREE_NODE_noevict, - BTREE_NODE_write_idx, - BTREE_NODE_accessed, - BTREE_NODE_write_in_flight, - BTREE_NODE_write_in_flight_inner, - BTREE_NODE_just_written, - BTREE_NODE_dying, - BTREE_NODE_fake, - BTREE_NODE_need_rewrite, - BTREE_NODE_never_write, -}; - -BTREE_FLAG(read_in_flight); -BTREE_FLAG(read_error); -BTREE_FLAG(need_write); -BTREE_FLAG(noevict); -BTREE_FLAG(write_idx); -BTREE_FLAG(accessed); -BTREE_FLAG(write_in_flight); -BTREE_FLAG(write_in_flight_inner); -BTREE_FLAG(just_written); -BTREE_FLAG(dying); -BTREE_FLAG(fake); -BTREE_FLAG(need_rewrite); -BTREE_FLAG(never_write); +BTREE_FLAGS() +#undef x static inline struct btree_write *btree_current_write(struct btree *b) { @@ -588,24 +621,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } -static inline 
bool btree_node_type_is_extents(enum btree_node_type type) -{ - switch (type) { - case BKEY_TYPE_extents: - case BKEY_TYPE_reflink: - return true; - default: - return false; - } -} - -static inline bool btree_node_is_extents(struct btree *b) -{ - return btree_node_type_is_extents(btree_node_type(b)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ (1U << BKEY_TYPE_inodes)| \ (1U << BKEY_TYPE_stripes)| \ (1U << BKEY_TYPE_reflink)| \ @@ -621,6 +639,16 @@ static inline bool btree_node_is_extents(struct btree *b) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return (1U << type) & BTREE_ID_IS_EXTENTS; +} + #define BTREE_ID_HAS_SNAPSHOTS \ ((1U << BTREE_ID_extents)| \ (1U << BTREE_ID_inodes)| \ @@ -636,40 +664,10 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) - -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) 
- -#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_alloc)| \ - (1U << KEY_TYPE_alloc_v2)| \ - (1U << KEY_TYPE_alloc_v3)| \ - (1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)| \ - (1U << KEY_TYPE_inode_v2)| \ - (1U << KEY_TYPE_snapshot)) +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_PTRS; +} static inline bool btree_node_type_needs_gc(enum btree_node_type type) { diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index d9a406a..1c2e7b2 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -8,20 +8,20 @@ struct bch_fs; struct btree; -void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, - struct btree *); +void bch2_btree_node_prep_for_write(struct btree_trans *, + struct btree_path *, struct btree *); bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_i *); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { - __BTREE_INSERT_NOFAIL, + /* First two bits for journal watermark: */ + __BTREE_INSERT_NOFAIL = 2, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -41,9 +41,6 @@ enum btree_insert_flags { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -/* Indicates that we have pre-reserved space in the journal: */ -#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) - /* Insert is being called from journal reclaim path: */ #define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) @@ -54,6 +51,8 @@ enum btree_insert_flags { #define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) #define 
BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) +int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, + unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); @@ -83,13 +82,14 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); +int bch2_trans_log_msg(struct btree_trans *, const char *); + /** * bch2_trans_commit - insert keys at given iterator positions * * This is main entry point for btree updates. * * Return values: - * -EINTR: locking changed, this function should be called again. * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ @@ -105,30 +105,33 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -#define lockrestart_do(_trans, _do) \ +#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + +#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ({ \ + struct btree_trans trans; \ int _ret; \ \ - do { \ - bch2_trans_begin(_trans); \ - _ret = (_do); \ - } while (_ret == -EINTR); \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ + bch2_trans_exit(&trans); \ \ _ret; \ }) -#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +#define bch2_trans_run(_c, _do) \ ({ \ struct btree_trans trans; \ int _ret; \ \ 
bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ - _do); \ + _ret = (_do); \ bch2_trans_exit(&trans); \ \ _ret; \ @@ -139,4 +142,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +static inline void bch2_trans_reset_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; + trans->extra_journal_entries.nr = 0; +} + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 088c320..40debf7 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -23,11 +23,27 @@ #include #include -static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_path *, struct btree *, - struct keylist *, unsigned); +static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_path *, struct btree *, + struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + struct btree_path *path; + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_INTENT, _RET_IP_); + path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; +} + /* Debug code: */ /* @@ -41,7 +57,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) struct bkey_s_c k; struct bkey_s_c_btree_ptr_v2 bp; struct bkey unpacked; - char buf1[100], buf2[100]; + struct printbuf buf1 = PRINTBUF, buf2 = 
PRINTBUF; BUG_ON(!b->c.level); @@ -58,9 +74,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bpos_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); - panic("expected next min_key %s got %s\n", - (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), - (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); + bch2_bpos_to_text(&buf1, next_node); + bch2_bpos_to_text(&buf2, bp.v->min_key); + panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); } bch2_btree_node_iter_advance(&iter, b); @@ -68,9 +84,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bch2_btree_node_iter_end(&iter)) { if (bpos_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); - panic("expected end %s got %s\n", - (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), - (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); + bch2_bpos_to_text(&buf1, b->key.k.p); + bch2_bpos_to_text(&buf2, k.k->p); + panic("expected end %s got %s\n", buf1.buf, buf2.buf); } break; } @@ -143,7 +159,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, static void __btree_node_free(struct bch_fs *c, struct btree *b) { - trace_btree_node_free(c, b); + trace_and_count(c, btree_node_free, c, b); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); @@ -160,29 +176,69 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) } static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_path *path; + unsigned level = b->c.level; + + bch2_btree_node_lock_write_nofail(trans, path, &b->c); + bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); trans_for_each_path(trans, path) - BUG_ON(path->l[b->c.level].b == b && - path->l[b->c.level].lock_seq == b->c.lock.state.seq); + if (path->l[level].b == b) { + 
btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} - six_lock_write(&b->c.lock, NULL, NULL); +static void bch2_btree_node_free_never_used(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; + struct btree_path *path; + unsigned level = b->c.level; + + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); + + b->will_make_reachable = 0; + closure_put(&as->cl); + clear_btree_node_will_make_reachable(b); + clear_btree_node_accessed(b); + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); bch2_btree_node_hash_remove(&c->btree_cache, b); - __btree_node_free(c, b); + mutex_unlock(&c->btree_cache.lock); + + BUG_ON(p->nr >= ARRAY_SIZE(p->b)); + p->b[p->nr++] = b; - six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } } -static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, +static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, + bool interior_node, unsigned flags) { + struct bch_fs *c = trans->c; struct write_point *wp; struct btree *b; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; @@ -193,10 +249,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_BTREE_MOVINGGC; + alloc_reserve = RESERVE_btree_movinggc; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_BTREE; + alloc_reserve = RESERVE_btree; } mutex_lock(&c->btree_reserve_cache_lock); @@ -212,7 +268,7 @@ static struct btree 
*__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, + wp = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -242,7 +298,7 @@ retry: bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, interior_node); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -256,22 +312,25 @@ mem_alloc: return b; } -static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) +static struct btree *bch2_btree_node_alloc(struct btree_update *as, + struct btree_trans *trans, + unsigned level) { struct bch_fs *c = as->c; struct btree *b; + struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; int ret; BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!as->nr_prealloc_nodes); + BUG_ON(!p->nr); - b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + b = p->b[--p->nr]; - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); set_btree_node_accessed(b); - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); @@ -301,7 +360,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_btree_node_alloc(c, b); + trace_and_count(c, btree_node_alloc, c, b); return b; } @@ -319,12 +378,13 @@ static void btree_set_max(struct btree *b, struct bpos pos) } struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, struct btree *b, struct bkey_format format) { struct btree *n; - n = bch2_btree_node_alloc(as, b->c.level); + n = bch2_btree_node_alloc(as, trans, 
b->c.level); SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); @@ -343,6 +403,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, } static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, struct btree *b) { struct bkey_format new_f = bch2_btree_calc_format(b); @@ -354,12 +415,13 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, if (!bch2_btree_node_format_fits(as->c, b, &new_f)) new_f = b->format; - return __bch2_btree_node_alloc_replacement(as, b, new_f); + return __bch2_btree_node_alloc_replacement(as, trans, b, new_f); } -static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) +static struct btree *__btree_root_alloc(struct btree_update *as, + struct btree_trans *trans, unsigned level) { - struct btree *b = bch2_btree_node_alloc(as, level); + struct btree *b = bch2_btree_node_alloc(as, trans, level); btree_set_min(b, POS_MIN); btree_set_max(b, SPOS_MAX); @@ -368,56 +430,57 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); - bch2_btree_update_add_new_node(as, b); - six_unlock_write(&b->c.lock); - return b; } -static void bch2_btree_reserve_put(struct btree_update *as) +static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; + struct prealloc_nodes *p; - mutex_lock(&c->btree_reserve_cache_lock); + for (p = as->prealloc_nodes; + p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); + p++) { + while (p->nr) { + struct btree *b = p->b[--p->nr]; - while (as->nr_prealloc_nodes) { - struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + mutex_lock(&c->btree_reserve_cache_lock); - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct 
btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + a->ob = b->ob; + b->ob.nr = 0; + bkey_copy(&a->k, &b->key); + } else { + bch2_open_buckets_put(c, &b->ob); + } - a->ob = b->ob; - b->ob.nr = 0; - bkey_copy(&a->k, &b->key); - } else { - bch2_open_buckets_put(c, &b->ob); - } + mutex_unlock(&c->btree_reserve_cache_lock); - __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } } - - mutex_unlock(&c->btree_reserve_cache_lock); } -static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, - unsigned flags) +static int bch2_btree_reserve_get(struct btree_trans *trans, + struct btree_update *as, + unsigned nr_nodes[2], + unsigned flags, + struct closure *cl) { struct bch_fs *c = as->c; - struct closure cl; struct btree *b; - int ret; - - closure_init_stack(&cl); -retry: + unsigned interior; + int ret = 0; - BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node @@ -426,39 +489,33 @@ retry: * BTREE_INSERT_NOWAIT only applies to btree node allocation, not * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(c, cl); if (ret) - goto err; + return ret; + + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { + b = __bch2_btree_node_alloc(trans, &as->disk_res, + flags & BTREE_INSERT_NOWAIT ? 
NULL : cl, + interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; + } - while (as->nr_prealloc_nodes < nr_nodes) { - b = __bch2_btree_node_alloc(c, &as->disk_res, - flags & BTREE_INSERT_NOWAIT - ? NULL : &cl, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err; + p->b[p->nr++] = b; } - - as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } - - bch2_btree_cache_cannibalize_unlock(c); - closure_sync(&cl); - return 0; err: bch2_btree_cache_cannibalize_unlock(c); - closure_sync(&cl); - - if (ret == -EAGAIN) - goto retry; - - trace_btree_reserve_get_fail(c, nr_nodes, &cl); return ret; } /* Asynchronous interior node update machinery */ -static void bch2_btree_update_free(struct btree_update *as) +static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; @@ -471,7 +528,7 @@ static void bch2_btree_update_free(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); - bch2_btree_reserve_put(as); + bch2_btree_reserve_put(as, trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); @@ -492,20 +549,18 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_unlock(&c->btree_interior_update_lock); } -static void btree_update_will_delete_key(struct btree_update *as, - struct bkey_i *k) +static void btree_update_add_key(struct btree_update *as, + struct keylist *keys, struct btree *b) { - BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > + struct bkey_i *k = &b->key; + + BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > ARRAY_SIZE(as->_old_keys)); - bch2_keylist_add(&as->old_keys, k); -} -static void btree_update_will_add_key(struct btree_update *as, - struct bkey_i *k) -{ - BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > - ARRAY_SIZE(as->_new_keys)); - bch2_keylist_add(&as->new_keys, k); + bkey_copy(keys->top, k); + 
bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; + + bch2_keylist_push(keys); } /* @@ -518,24 +573,29 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, struct bkey_i *k; int ret; - trans->extra_journal_entries = (void *) &as->journal_entries[0]; - trans->extra_journal_entry_u64s = as->journal_u64s; + ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + if (ret) + return ret; + + memcpy(&darray_top(trans->extra_journal_entries), + as->journal_entries, + as->journal_u64s * sizeof(u64)); + trans->extra_journal_entries.nr += as->journal_u64s; + trans->journal_pin = &as->journal; - for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_key(trans, - bkey_s_c_null, - bkey_i_to_s_c(k), - BTREE_TRIGGER_INSERT); + for_each_keylist_key(&as->old_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); if (ret) return ret; } - for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(k), - bkey_s_c_null, - BTREE_TRIGGER_OVERWRITE); + for_each_keylist_key(&as->new_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); if (ret) return ret; } @@ -546,12 +606,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; - struct btree *b = as->b; + struct btree *b; struct btree_trans trans; u64 journal_seq = 0; unsigned i; int ret; + bch2_trans_init(&trans, c, 0, 512); /* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree @@ -563,22 +624,21 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; - BUG_ON(!journal_pin_active(&as->journal)); - /* * Wait for any in flight writes to finish before 
we free the old nodes * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - struct btree *old = as->old_nodes[i]; __le64 seq; - six_lock_read(&old->c.lock, NULL, NULL); - seq = old->data ? old->data->keys.seq : 0; - six_unlock_read(&old->c.lock); + b = as->old_nodes[i]; + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + seq = b->data ? b->data->keys.seq : 0; + six_unlock_read(&b->c.lock); if (seq == as->old_nodes_seq[i]) - wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -595,19 +655,23 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. */ - bch2_trans_init(&trans, c, 0, 512); - ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, - btree_update_nodes_written_trans(&trans, as)); - bch2_trans_exit(&trans); + ret = commit_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RECLAIM| + JOURNAL_WATERMARK_reserved, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_unlock(&trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "error %i in btree_update_nodes_written()", ret); err: - if (b) { + if (as->b) { + struct btree_path *path; + + b = as->b; + path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); /* * @b is the node we did the final insert into: * @@ -620,11 +684,28 @@ err: * we're in journal error state: */ - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + /* + * Ensure transaction is unlocked before using + * btree_node_lock_nopath() (the use of which is always suspect, + * we need to work on removing this in the future) + * 
+ * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so + * we may rarely end up with a locked path besides the one we + * have here: + */ + bch2_trans_unlock(&trans); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); + mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(&trans, path, b); + + bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); + if (list_empty(&b->write_blocked)) + clear_btree_node_write_blocked(b); /* * Node might have been freed, recheck under @@ -638,8 +719,8 @@ err: if (!ret) { i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); + max(journal_seq, + le64_to_cpu(i->journal_seq))); bch2_btree_add_journal_pin(c, b, journal_seq); } else { @@ -653,10 +734,13 @@ err: } mutex_unlock(&c->btree_interior_update_lock); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); - six_unlock_intent(&b->c.lock); + btree_node_unlock(&trans, path, b->c.level); + bch2_path_put(&trans, path, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -669,13 +753,14 @@ err: BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(b); } mutex_unlock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - six_lock_read(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -683,7 +768,8 @@ err: for (i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - bch2_btree_update_free(as); + bch2_btree_update_free(as, &trans); + bch2_trans_exit(&trans); } static void 
btree_interior_update_work(struct work_struct *work) @@ -735,6 +821,8 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) as->mode = BTREE_INTERIOR_UPDATING_NODE; as->b = b; + + set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); mutex_unlock(&c->btree_interior_update_lock); @@ -800,10 +888,19 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree as->new_nodes[as->nr_new_nodes++] = b; b->will_make_reachable = 1UL|(unsigned long) as; + set_btree_node_will_make_reachable(b); mutex_unlock(&c->btree_interior_update_lock); - btree_update_will_add_key(as, &b->key); + btree_update_add_key(as, &as->new_keys, b); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; + unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; + + bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = + cpu_to_le16(sectors); + } } /* @@ -822,6 +919,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) * xchg() is for synchronization with bch2_btree_complete_write: */ v = xchg(&b->will_make_reachable, 0); + clear_btree_node_will_make_reachable(b); as = (struct btree_update *) (v & ~1UL); if (!as) { @@ -855,7 +953,7 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b * btree_updates to point to this btree_update: */ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, - struct btree *b) + struct btree *b) { struct bch_fs *c = as->c; struct btree_update *p, *n; @@ -887,7 +985,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, closure_wake_up(&c->btree_interior_update_wait); } - clear_btree_node_dirty(c, b); + clear_btree_node_dirty_acct(c, b); clear_btree_node_need_write(b); /* @@ -919,14 +1017,14 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, */ btree_update_drop_new_node(c, b); - 
btree_update_will_delete_key(as, &b->key); + btree_update_add_key(as, &as->old_keys, b); as->old_nodes[as->nr_old_nodes] = b; as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; as->nr_old_nodes++; } -static void bch2_btree_update_done(struct btree_update *as) +static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; u64 start_time = as->start_time; @@ -937,7 +1035,7 @@ static void bch2_btree_update_done(struct btree_update *as) up_read(&as->c->gc_lock); as->took_gc_lock = false; - bch2_btree_reserve_put(as); + bch2_btree_reserve_put(as, trans); continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); @@ -948,32 +1046,44 @@ static void bch2_btree_update_done(struct btree_update *as) static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, unsigned nr_nodes, unsigned flags) + unsigned level, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - int journal_flags = 0; + unsigned nr_nodes[2] = { 0, 0 }; + unsigned update_level = level; + int journal_flags = flags & JOURNAL_WATERMARK_MASK; int ret = 0; + u32 restart_count = trans->restart_count; BUG_ON(!path->should_be_locked); - if (flags & BTREE_INSERT_JOURNAL_RESERVED) - journal_flags |= JOURNAL_RES_GET_RESERVED; if (flags & BTREE_INSERT_JOURNAL_RECLAIM) journal_flags |= JOURNAL_RES_GET_NONBLOCK; - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ - if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, - path->btree_id, &path->pos); - ret = btree_trans_restart(trans); - return ERR_PTR(ret); + while (1) { + nr_nodes[!!update_level] += 1 + split; + update_level++; + + ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + if (ret) + return ERR_PTR(ret); + + if (!btree_path_node(path, update_level)) { + /* Allocating new root? 
*/ + nr_nodes[1] += split; + update_level = BTREE_MAX_DEPTH; + break; + } + + if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + break; + + split = true; } if (flags & BTREE_INSERT_GC_LOCK_HELD) @@ -981,9 +1091,10 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, else if (!down_read_trylock(&c->gc_lock)) { bch2_trans_unlock(trans); down_read(&c->gc_lock); - if (!bch2_trans_relock(trans)) { + ret = bch2_trans_relock(trans); + if (ret) { up_read(&c->gc_lock); - return ERR_PTR(-EINTR); + return ERR_PTR(ret); } } @@ -995,6 +1106,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); as->btree_id = path->btree_id; + as->update_level = update_level; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1018,41 +1130,66 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - bch2_trans_unlock(trans); - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, - journal_flags); + journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret) { - bch2_btree_update_free(as); - trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); - btree_trans_restart(trans); - return ERR_PTR(ret); + bch2_trans_unlock(trans); + + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + journal_flags); + if (ret) { + trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) + goto err; } ret = bch2_disk_reservation_get(c, &as->disk_res, - nr_nodes * 
btree_sectors(c), + (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, disk_res_flags); if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags); - if (ret) - goto err; + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + if (bch2_err_matches(ret, ENOSPC) || + bch2_err_matches(ret, ENOMEM)) { + struct closure cl; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); - if (!bch2_trans_relock(trans)) { - ret = -EINTR; + bch2_trans_unlock(trans); + closure_sync(&cl); + } while (ret == -EAGAIN); + } + + if (ret) { + trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); goto err; } - bch2_journal_pin_add(&c->journal, - atomic64_read(&c->journal.seq), - &as->journal, NULL); + ret = bch2_trans_relock(trans); + if (ret) + goto err; + bch2_trans_verify_not_restarted(trans, restart_count); return as; err: - bch2_btree_update_free(as); + bch2_btree_update_free(as, trans); return ERR_PTR(ret); } @@ -1065,11 +1202,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) list_del_init(&b->list); mutex_unlock(&c->btree_cache.lock); - if (b->c.level) - six_lock_pcpu_alloc(&b->c.lock); - else - six_lock_pcpu_free(&b->c.lock); - mutex_lock(&c->btree_root_lock); BUG_ON(btree_node_root(c, b) && (b->c.level < btree_node_root(c, b)->c.level || @@ -1101,9 +1233,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_btree_set_root(c, b); - BUG_ON(!b->written && - !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); + trace_and_count(c, btree_node_set_root, c, b); old = btree_node_root(c, b); @@ -1111,7 +1241,7 @@ static void bch2_btree_set_root(struct btree_update *as, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(trans, path, old); + bch2_btree_node_lock_write_nofail(trans, path, &old->c); 
bch2_btree_set_root_inmem(c, b); @@ -1138,7 +1268,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, { struct bch_fs *c = as->c; struct bkey_packed *k; - const char *invalid; + struct printbuf buf = PRINTBUF; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); @@ -1146,13 +1276,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); - if (invalid) { - char buf[160]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); - bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); + if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "inserting invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_printf(&buf, "\n "); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf); + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); + + bch2_fs_inconsistent(c, "%s", buf.buf); dump_stack(); } @@ -1170,8 +1305,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); + + printbuf_exit(&buf); } static void @@ -1203,6 +1340,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, * node) */ static struct btree *__btree_split_node(struct btree_update *as, + struct btree_trans *trans, struct btree *n1) { struct bkey_format_state s; @@ -1212,8 +1350,7 @@ static struct btree *__btree_split_node(struct btree_update 
*as, struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; struct bpos n1_pos; - n2 = bch2_btree_node_alloc(as, n1->c.level); - bch2_btree_update_add_new_node(as, n2); + n2 = bch2_btree_node_alloc(as, trans, n1->c.level); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1361,38 +1498,49 @@ static void btree_split_insert_keys(struct btree_update *as, btree_node_interior_verify(as->c, b); } -static void btree_split(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, - struct keylist *keys, unsigned flags) +static int btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; u64 start_time = local_clock(); + int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); + BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); bch2_btree_interior_update_will_free_node(as, b); - n1 = bch2_btree_node_alloc_replacement(as, b); - bch2_btree_update_add_new_node(as, n1); + n1 = bch2_btree_node_alloc_replacement(as, trans, b); if (keys) btree_split_insert_keys(as, trans, path, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { - trace_btree_split(c, b); + trace_and_count(c, btree_node_split, c, b); - n2 = __btree_split_node(as, n1); + n2 = __btree_split_node(as, trans, n1); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); + + bch2_btree_update_add_new_node(as, n1); + bch2_btree_update_add_new_node(as, n2); six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - bch2_btree_node_write(c, n1, SIX_LOCK_intent); - bch2_btree_node_write(c, n2, SIX_LOCK_intent); + path1 = get_unlocked_mut_path(trans, path->btree_id, 
n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + + path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + six_lock_increment(&n2->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n2); /* * Note that on recursive parent_keys == keys, so we @@ -1404,22 +1552,33 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (!parent) { /* Depth increases, make a new root */ - n3 = __btree_root_alloc(as, b->c.level + 1); + n3 = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n3); + six_unlock_write(&n3->c.lock); + + path2->locks_want++; + BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n3); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); - - bch2_btree_node_write(c, n3, SIX_LOCK_intent); } } else { - trace_btree_compact(c, b); + trace_and_count(c, btree_node_compact, c, b); bch2_btree_build_aux_trees(n1); + bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - bch2_btree_node_write(c, n1, SIX_LOCK_intent); + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1429,7 +1588,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, trans, path, parent, 
&as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err; } else if (n3) { bch2_btree_set_root(as, trans, path, n3); } else { @@ -1437,20 +1598,16 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_set_root(as, trans, path, n1); } - bch2_btree_update_get_open_buckets(as, n1); - if (n2) - bch2_btree_update_get_open_buckets(as, n2); - if (n3) + if (n3) { bch2_btree_update_get_open_buckets(as, n3); - - /* Successful split, update the path to point to the new nodes: */ - - six_lock_increment(&b->c.lock, SIX_LOCK_intent); - if (n3) - bch2_trans_node_add(trans, n3); - if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); + bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + if (n2) { + bch2_btree_update_get_open_buckets(as, n2); + bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + } + bch2_btree_update_get_open_buckets(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1458,13 +1615,28 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, path, b); + + if (n3) + bch2_trans_node_add(trans, n3); + if (n2) + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); if (n3) six_unlock_intent(&n3->c.lock); if (n2) six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); +out: + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); + } + if (path1) { + __bch2_btree_path_unlock(trans, path1); + bch2_path_put(trans, path1, true); + } bch2_trans_verify_locks(trans); @@ -1472,6 +1644,14 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, ? 
BCH_TIME_btree_node_split : BCH_TIME_btree_node_compact], start_time); + return ret; +err: + if (n3) + bch2_btree_node_free_never_used(as, trans, n3); + if (n2) + bch2_btree_node_free_never_used(as, trans, n2); + bch2_btree_node_free_never_used(as, trans, n1); + goto out; } static void @@ -1506,22 +1686,30 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. */ -static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, - struct keylist *keys, unsigned flags) +static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + int ret; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, b->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - bch2_btree_node_lock_for_insert(trans, path, b); + if (!(local_clock() & 63)) + return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + + ret = bch2_btree_node_lock_write(trans, path, &b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); @@ -1547,30 +1735,41 @@ static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans * bch2_btree_node_unlock_write(trans, path, b); btree_node_interior_verify(c, b); - return; + return 0; split: - btree_split(as, trans, path, b, keys, flags); + /* + * We could attempt to avoid the transaction restart, by calling 
+ * bch2_btree_path_upgrade() and allocating more nodes: + */ + if (b->c.level >= as->update_level) + return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + + return btree_split(as, trans, path, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - struct bch_fs *c = trans->c; struct btree *b = path_l(path)->b; struct btree_update *as; unsigned l; int ret = 0; as = bch2_btree_update_start(trans, path, path->level, - btree_update_reserve_required(c, b), flags); + true, flags); if (IS_ERR(as)) return PTR_ERR(as); - btree_split(as, trans, path, b, NULL, flags); - bch2_btree_update_done(as); + ret = btree_split(as, trans, path, b, NULL, flags); + if (ret) { + bch2_btree_update_free(as, trans); + return ret; + } + + bch2_btree_update_done(as, trans); - for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) + for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; @@ -1583,7 +1782,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL; + struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1615,7 +1814,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - sib_path->should_be_locked = true; + btree_path_set_should_be_locked(sib_path); m = sib_path->l[level].b; @@ -1634,15 +1833,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, } if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { - char buf1[100], buf2[100]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); - bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); + bch2_bpos_to_text(&buf1, prev->data->max_key); + 
bch2_bpos_to_text(&buf2, next->data->min_key); bch_err(c, "btree topology error in btree merge:\n" " prev ends at %s\n" " next starts at %s", - buf1, buf2); + buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); bch2_topology_error(c); ret = -EIO; goto err; @@ -1672,36 +1873,42 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, goto out; parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, - btree_update_reserve_required(c, parent) + 1, - flags| + as = bch2_btree_update_start(trans, path, level, false, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_USE_RESERVE| + flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; - trace_btree_merge(c, b); + trace_and_count(c, btree_node_merge, c, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, b->c.level); - bch2_btree_update_add_new_node(as, n); + n = bch2_btree_node_alloc(as, trans, b->c.level); + + SET_BTREE_NODE_SEQ(n->data, + max(BTREE_NODE_SEQ(b->data), + BTREE_NODE_SEQ(m->data)) + 1); btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); - n->data->format = new_f; + n->data->format = new_f; btree_node_set_format(n, new_f); bch2_btree_sort_into(c, n, prev); bch2_btree_sort_into(c, n, next); bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1710,32 +1917,38 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + 
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err_free_update; bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - six_lock_increment(&b->c.lock, SIX_LOCK_intent); - six_lock_increment(&m->c.lock, SIX_LOCK_intent); + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); bch2_trans_node_add(trans, n); bch2_trans_verify_paths(trans); - bch2_btree_node_free_inmem(trans, b); - bch2_btree_node_free_inmem(trans, m); - six_unlock_intent(&n->c.lock); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: + if (new_path) + bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); return ret; +err_free_update: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; } /** @@ -1747,6 +1960,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; + struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; int ret; @@ -1755,47 +1969,54 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, parent = btree_node_parent(iter->path, b); as = bch2_btree_update_start(trans, iter->path, b->c.level, - (parent - ? 
btree_update_reserve_required(c, parent) - : 0) + 1, - flags); + false, flags); ret = PTR_ERR_OR_ZERO(as); - if (ret) { - trace_btree_gc_rewrite_node_fail(c, b); + if (ret) goto out; - } bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, b); - bch2_btree_update_add_new_node(as, n); + n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - trace_btree_gc_rewrite_node(c, b); + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + trace_and_count(c, btree_node_rewrite, c, b); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, trans, iter->path, parent, - &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, + &as->parent_keys, flags); + if (ret) + goto err; } else { bch2_btree_set_root(as, trans, iter->path, n); } bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, iter->path, b); - six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_trans_node_add(trans, n); - bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); out: - bch2_btree_path_downgrade(iter->path); + if (new_path) + bch2_path_put(trans, new_path, true); + bch2_btree_path_downgrade(trans, iter->path); return ret; +err: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; } struct async_btree_rewrite { @@ -1825,7 +2046,7 @@ static int async_btree_node_rewrite_trans(struct btree_trans *trans, goto out; ret = bch2_btree_node_rewrite(trans, &iter, b, 
0); -out : +out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1847,7 +2068,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return; a = kmalloc(sizeof(*a), GFP_NOFS); @@ -1875,21 +2096,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter2 = { NULL }; struct btree *parent; - u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; if (!skip_triggers) { - ret = bch2_trans_mark_key(trans, - bkey_s_c_null, - bkey_i_to_s_c(new_key), - BTREE_TRIGGER_INSERT); + ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), 0); if (ret) return ret; - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(&b->key), - bkey_s_c_null, - BTREE_TRIGGER_OVERWRITE); + ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, + new_key, 0); if (ret) return ret; } @@ -1912,9 +2128,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(iter2.path->level != b->c.level); BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); - btree_node_unlock(iter2.path, iter2.path->level); - path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; - iter2.path->level++; + btree_path_set_level_up(trans, iter2.path); + + bch2_btree_path_check_sort(trans, iter2.path, 0); ret = bch2_btree_iter_traverse(&iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); @@ -1923,12 +2139,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - trans->extra_journal_entries = (void *) &journal_entries[0]; - trans->extra_journal_entry_u64s = - journal_entry_set((void *) &journal_entries[0], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - new_key, new_key->k.u64s); + ret = darray_make_room(&trans->extra_journal_entries, + jset_u64s(new_key->k.u64s)); + if (ret) + return ret; + + 
journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, @@ -1936,11 +2156,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED); + JOURNAL_WATERMARK_reserved); if (ret) goto err; - bch2_btree_node_lock_write(trans, iter->path, b); + bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1978,11 +2198,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite struct closure cl; int ret = 0; - if (!btree_node_intent_locked(path, b->c.level) && - !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { - btree_trans_restart(trans); - return -EINTR; - } + ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); + if (ret) + return ret; closure_init_stack(&cl); @@ -1995,11 +2213,12 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite if (ret) { bch2_trans_unlock(trans); closure_sync(&cl); - if (!bch2_trans_relock(trans)) - return -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + return ret; } - new_hash = bch2_btree_node_mem_alloc(c); + new_hash = bch2_btree_node_mem_alloc(c, false); } path->intent_ref++; @@ -2075,7 +2294,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, false); bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); @@ -2112,7 +2331,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - pr_buf(out, "%p m %u w %u r %u j %llu\n", + 
prt_printf(out, "%p m %u w %u r %u j %llu\n", as, as->mode, as->nodes_written, @@ -2121,19 +2340,27 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->btree_interior_update_lock); } -size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) +static bool bch2_btree_interior_updates_pending(struct bch_fs *c) { - size_t ret = 0; - struct list_head *i; + bool ret; mutex_lock(&c->btree_interior_update_lock); - list_for_each(i, &c->btree_interior_update_list) - ret++; + ret = !list_empty(&c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); return ret; } +bool bch2_btree_interior_updates_flush(struct bch_fs *c) +{ + bool ret = bch2_btree_interior_updates_pending(c); + + if (ret) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_pending(c)); + return ret; +} + void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) { struct btree_root *r; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 8dc86fa..dabe815 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -52,6 +52,7 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; + unsigned update_level; struct disk_reservation disk_res; struct journal_preres journal_preres; @@ -76,8 +77,10 @@ struct btree_update { struct journal_entry_pin journal; /* Preallocated nodes we reserve when we start the update: */ - struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; - unsigned nr_prealloc_nodes; + struct prealloc_nodes { + struct btree *b[BTREE_UPDATE_NODES_MAX]; + unsigned nr; + } prealloc_nodes[2]; /* Nodes being freed: */ struct keylist old_keys; @@ -115,6 +118,7 @@ struct btree_update { }; struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree_trans *, struct btree *, struct bkey_format); @@ -307,7 +311,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs 
*c, void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); -size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); +bool bch2_btree_interior_updates_flush(struct bch_fs *); void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 4b37a48..3a68382 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -10,6 +10,7 @@ #include "btree_locking.h" #include "buckets.h" #include "debug.h" +#include "errcode.h" #include "error.h" #include "extent_update.h" #include "journal.h" @@ -31,6 +32,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->k->k.p, r->k->k.p); } @@ -54,9 +56,9 @@ static inline bool same_leaf_as_next(struct btree_trans *trans, insert_l(&i[0])->b == insert_l(&i[1])->b; } -static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { struct bch_fs *c = trans->c; @@ -75,14 +77,6 @@ static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, bch2_btree_init_next(trans, b); } -void bch2_btree_node_lock_for_insert(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - bch2_btree_node_lock_write(trans, path, b); - bch2_btree_node_prep_for_write(trans, path, b); -} - /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -167,11 +161,30 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, 
journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans trans; + unsigned long old, new, v; + unsigned idx = w - b->writes; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + v = READ_ONCE(b->flags); + + do { + old = new = v; - six_lock_read(&b->c.lock, NULL, NULL); - bch2_btree_node_write_cond(c, b, - (btree_current_write(b) == w && w->journal.seq == seq)); + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); return 0; } @@ -199,7 +212,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, /** * btree_insert_key - insert a key one key into a leaf node */ -static bool btree_insert_key_leaf(struct btree_trans *trans, +static void btree_insert_key_leaf(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; @@ -212,7 +225,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, &insert_l(insert)->iter, insert->k))) - return false; + return; i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, le64_to_cpu(i->journal_seq))); @@ -220,7 +233,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -233,8 +246,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) 
bch2_trans_node_reinit_iter(trans, b); - - return true; } /* Cached btree updates: */ @@ -269,9 +280,10 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, if (ret) return ret; - if (!bch2_trans_relock(trans)) { - trace_trans_restart_journal_preres_get(trans->fn, trace_ip); - return -EINTR; + ret = bch2_trans_relock(trans); + if (ret) { + trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0); + return ret; } return 0; @@ -283,39 +295,28 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; - if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - flags |= JOURNAL_RES_GET_RESERVED; - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, + flags| + (trans->flags & JOURNAL_WATERMARK_MASK)); return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; } #define JSET_ENTRY_LOG_U64s 4 -static noinline void journal_transaction_name(struct btree_trans *trans) +static void journal_transaction_name(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); - struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - unsigned u64s = JSET_ENTRY_LOG_U64s - 1; - unsigned b, buflen = u64s * sizeof(u64); - - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 0; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - b = min_t(unsigned, strlen(trans->fn), buflen); - memcpy(l->d, trans->fn, b); - while (b < buflen) - l->d[b++] = '\0'; - - trans->journal_res.offset += JSET_ENTRY_LOG_U64s; - trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + 
container_of(entry, struct jset_entry_log, entry); + + strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); } static inline enum btree_insert_ret @@ -367,39 +368,162 @@ btree_key_can_insert_cached(struct btree_trans *trans, ck->u64s = new_u64s; ck->k = new_k; - return BTREE_INSERT_OK; + return 0; } -static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *i) +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) { - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - bool did_work; + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + int ret; - EBUG_ON(trans->journal_res.ref != - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; - i->k->k.needs_whiteout = false; + if (!btree_node_type_needs_gc(i->btree_id)) + return 0; - did_work = !i->cached - ? btree_insert_key_leaf(trans, i) - : bch2_btree_insert_key_cached(trans, i->path, i->k); - if (!did_work) - return; + if (bch2_bkey_ops[old.k->type].atomic_trigger == + bch2_bkey_ops[i->k->k.type].atomic_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - bch2_journal_add_keys(j, &trans->journal_res, - i->btree_id, - i->level, - i->k); + _deleted.p = i->path->pos; - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry 
*i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + bch2_bkey_ops[old.k->type].trans_trigger == + bch2_bkey_ops[i->k->k.type].trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct 
btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; } -static noinline int bch2_trans_mark_gc(struct btree_trans *trans) +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -413,8 +537,7 @@ static noinline int bch2_trans_mark_gc(struct btree_trans *trans) BUG_ON(i->cached || i->level); if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = bch2_mark_update(trans, i->path, i->k, - i->flags|BTREE_TRIGGER_GC); + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) break; } @@ -436,9 +559,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans->fn, trace_ip); - trans->restarted = true; - return -EINTR; + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); + return btree_trans_restart_nounlock(trans, 
BCH_ERR_transaction_restart_fault_inject); } /* @@ -473,6 +595,33 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (btree_node_type_needs_gc(i->bkey_type)) marking = true; + + /* + * Revalidate before calling mem triggers - XXX, ugly: + * + * - successful btree node splits don't cause transaction + * restarts and will have invalidated the pointer to the bkey + * value + * - btree_node_lock_for_insert() -> btree_node_prep_for_write() + * when it has to resort + * - btree_key_can_insert_cached() when it has to reallocate + * + * Ugly because we currently have no way to tell if the + * pointer's been invalidated, which means it's debatabale + * whether we should be stashing the old key at all. + */ + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, + i->k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } } /* @@ -485,19 +634,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (ret) return ret; - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } - if (unlikely(trans->extra_journal_entry_u64s)) { + if (unlikely(trans->extra_journal_entries.nr)) { memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries, - trans->extra_journal_entry_u64s); + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); - trans->journal_res.offset += trans->extra_journal_entry_u64s; - trans->journal_res.u64s -= trans->extra_journal_entry_u64s; + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; } /* @@ -520,110 +668,71 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if 
(BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = bch2_mark_update(trans, i->path, i->k, i->flags); + ret = run_one_mem_trigger(trans, i, i->flags); if (ret) return ret; } if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_mark_gc(trans); + ret = bch2_trans_commit_run_gc_triggers(trans); if (ret) return ret; } - trans_for_each_update(trans, i) - do_btree_insert_one(trans, i); - - return ret; -} - -static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) -{ - unsigned l; - - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); -} - -static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) -{ - struct btree *b = path_l(path)->b; - - do { - if (path->nodes_locked && - path->nodes_locked != path->nodes_intent_locked) - path_upgrade_readers(trans, path); - } while ((path = prev_btree_path(trans, path)) && - path_l(path)->b == b); -} - -/* - * Check for nodes that we have both read and intent locks on, and upgrade the - * readers to intent: - */ -static inline void normalize_read_intent_locks(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i, nr_read = 0, nr_intent = 0; - - trans_for_each_path_inorder(trans, path, i) { - struct btree_path *next = i + 1 < trans->nr_sorted - ? 
trans->paths + trans->sorted[i + 1] - : NULL; - - if (path->nodes_locked) { - if (path->nodes_intent_locked) - nr_intent++; - else - nr_read++; + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + trans_for_each_update(trans, i) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + if (i->key_cache_already_flushed) + continue; + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy(&entry->start[0], i->k); } - if (!next || path_l(path)->b != path_l(next)->b) { - if (nr_read && nr_intent) - upgrade_readers(trans, path); - - nr_read = nr_intent = 0; - } + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; } - bch2_trans_verify_locks(trans); -} - -static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_inorder(trans, path, i) { - //if (path == pos) - // break; - - if (path->nodes_locked != path->nodes_intent_locked && - !bch2_btree_path_upgrade(trans, path, path->level + 1)) - return true; + trans_for_each_update(trans, i) { + i->k->k.needs_whiteout = false; + + if (!i->cached) + btree_insert_key_leaf(trans, i); + else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, i->path, i->k); + else { + bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } } - return false; + return ret; } static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; + int ret; trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) continue; - if (!six_trylock_write(&insert_l(i)->b->c.lock)) { - if 
(have_conflicting_read_lock(trans, i->path)) - goto fail; - - btree_node_lock_type(trans, i->path, - insert_l(i)->b, - i->path->pos, i->level, - SIX_LOCK_write, NULL, NULL); - } + ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); + if (ret) + goto fail; bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } @@ -637,8 +746,8 @@ fail: bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); } - trace_trans_restart_would_deadlock_write(trans->fn); - return btree_trans_restart(trans); + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -658,40 +767,40 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct bkey_s_c old; + struct printbuf buf = PRINTBUF; int ret, u64s_delta = 0; + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; trans_for_each_update(trans, i) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - char buf[200]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", - buf, trans->fn, (void *) i->ip_allocated, invalid); + if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + prt_newline(&buf); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, &buf); + + bch2_trans_inconsistent(trans, "%s", buf.buf); + printbuf_exit(&buf); return -EINVAL; } btree_insert_entry_checks(trans, i); } - trans_for_each_update(trans, i) { - struct bkey u; + printbuf_exit(&buf); - /* - * peek_slot() doesn't yet work on iterators that point to - * interior nodes: - */ - if (i->cached || i->level) + trans_for_each_update(trans, i) { + if (i->cached) continue; - old = bch2_btree_path_peek_slot(i->path, &u); - ret = bkey_err(old); - if (unlikely(ret)) - return ret; - u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; - u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + u64s_delta -= i->old_btree_u64s; if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { @@ -708,16 +817,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - ? 
JOURNAL_RES_GET_RESERVED : 0)); + (trans->flags & JOURNAL_WATERMARK_MASK)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s, trace_ip); if (unlikely(ret)) return ret; - normalize_read_intent_locks(trans); - ret = trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -770,12 +876,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: ret = bch2_btree_split_leaf(trans, i->path, trans->flags); - if (!ret) - return 0; - - if (ret == -EINTR) - trace_trans_restart_btree_node_split(trans->fn, trace_ip, - i->btree_id, &i->path->pos); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -784,19 +886,16 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) break; - if (bch2_trans_relock(trans)) - return 0; - - trace_trans_restart_mark_replicas(trans->fn, trace_ip); - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { - trans->restarted = true; - ret = -EAGAIN; + !(trans->flags & JOURNAL_WATERMARK_reserved)) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } @@ -804,37 +903,35 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) break; - if (bch2_trans_relock(trans)) - return 0; - - trace_trans_restart_journal_res_get(trans->fn, trace_ip); - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); + trace_and_count(c, 
trans_blocked_journal_reclaim, trans, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); if (ret < 0) break; - if (bch2_trans_relock(trans)) - return 0; - - trace_trans_restart_journal_reclaim(trans->fn, trace_ip); - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip); break; default: BUG_ON(ret >= 0); break; } - BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); - BUG_ON(ret == -ENOSPC && - !(trans->flags & BTREE_INSERT_NOWAIT) && - (trans->flags & BTREE_INSERT_NOFAIL)); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; } @@ -851,126 +948,34 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) bch2_trans_unlock(trans); - ret = bch2_fs_read_write_early(c); + ret = bch2_fs_read_write_early(c) ?: + bch2_trans_relock(trans); if (ret) return ret; - if (!bch2_trans_relock(trans)) - return -EINTR; - percpu_ref_get(&c->writes); return 0; } -static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - int ret = 0; - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - - if (!overwrite) { - if (i->insert_trigger_run) - return 0; - - BUG_ON(i->overwrite_trigger_run); - i->insert_trigger_run = true; - } else { - if (i->overwrite_trigger_run) - return 0; - - BUG_ON(!i->insert_trigger_run); - i->overwrite_trigger_run = true; - } - - old = bch2_btree_path_peek_slot(i->path, &unpacked); - _deleted.p = i->path->pos; - - if (overwrite) 
{ - ret = bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|i->flags); - } else if (old.k->type == i->k->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); - } else { - ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|i->flags); - } - - if (ret == -EINTR) - trace_trans_restart_mark(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); - return ret ?: 1; -} - -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. + */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 0; overwrite < 2; overwrite++) { - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - ret = run_one_trigger(trans, i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - } - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - unsigned btree_id = 0; int ret = 0; - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. 
- */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) - return ret; + break; } - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; + return ret; } int __bch2_trans_commit(struct btree_trans *trans) @@ -981,62 +986,59 @@ int __bch2_trans_commit(struct btree_trans *trans) int ret = 0; if (!trans->nr_updates && - !trans->extra_journal_entry_u64s) + !trans->extra_journal_entries.nr) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - - trans->journal_u64s = trans->extra_journal_entry_u64s; - trans->journal_preres_u64s = 0; - - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; - if (trans->journal_transaction_names) - trans->journal_u64s += JSET_ENTRY_LOG_U64s; + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget(&c->writes))) { + unlikely(!percpu_ref_tryget_live(&c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) goto out_reset; } -#ifdef CONFIG_BCACHEFS_DEBUG - /* - * if BTREE_TRIGGER_NORUN is set, it means we're probably being called - * from the key cache flush code: - */ - trans_for_each_update(trans, i) - if (!i->cached && - !(i->flags & BTREE_TRIGGER_NORUN)) - bch2_btree_key_cache_verify_clean(trans, - 
i->btree_id, i->k->k.p); -#endif + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out; + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_preres_u64s = 0; + + /* For journalling transaction name: */ + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); - if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_trans_restart_upgrade(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); - ret = btree_trans_restart(trans); + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) goto out; - } BUG_ON(!btree_node_intent_locked(i->path, i->level)); + if (i->key_cache_already_flushed) + continue; + + /* we're going to journal the key being updated: */ u64s = jset_u64s(i->k->k.u64s); if (i->cached && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; trans->journal_u64s += u64s; + + /* and we're also going to log the overwrite: */ + trans->journal_u64s += jset_u64s(i->old_k.u64s); } if (trans->extra_journal_res) { @@ -1058,24 +1060,20 @@ retry: if (ret) goto err; + + trace_and_count(c, transaction_commit, trans, _RET_IP_); out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&c->writes); out_reset: - trans_for_each_update(trans, i) - bch2_path_put(trans, i->path, true); - - trans->extra_journal_res = 0; - trans->nr_updates = 0; - trans->hooks = NULL; - trans->extra_journal_entries = NULL; - trans->extra_journal_entry_u64s = 0; + bch2_trans_reset_updates(trans); if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; - memset(&trans->fs_usage_deltas->memset_start, 0, + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, 
(void *) &trans->fs_usage_deltas->memset_end - (void *) &trans->fs_usage_deltas->memset_start); } @@ -1089,7 +1087,7 @@ err: goto retry; } -static int check_pos_snapshot_overwritten(struct btree_trans *trans, +static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, enum btree_id id, struct bpos pos) { @@ -1098,12 +1096,6 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bkey_s_c k; int ret; - if (!btree_type_has_snapshots(id)) - return 0; - - if (!snapshot_t(c, pos.snapshot)->children[0]) - return 0; - bch2_trans_iter_init(trans, &iter, id, pos, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); @@ -1129,6 +1121,18 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, return ret; } +static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + pos.snapshot == U32_MAX || + !snapshot_t(trans->c, pos.snapshot)->children[0]) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); +} + int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, @@ -1146,7 +1150,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES| BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -1298,7 +1302,8 @@ nomerge1: goto out; } next: - k = bch2_btree_iter_next(&iter); + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -1376,9 +1381,42 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, } static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags) 
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip); + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) { + struct btree_path *btree_path; + int ret; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto err; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); +err: + bch2_path_put(trans, btree_path, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; BUG_ON(!path->should_be_locked); @@ -1394,7 +1432,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, .cached = path->cached, .path = path, .k = k, - .ip_allocated = _RET_IP_, + .ip_allocated = ip, }; #ifdef CONFIG_BCACHEFS_DEBUG @@ -1416,15 +1454,51 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); - *i = n; - } else + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->ip_allocated = n.ip_allocated; + } else { array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - __btree_path_get(n.path, true); + i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? 
i->old_k.u64s : 0; + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (unlikely(path->cached && bkey_deleted(&i->old_k))) + return flush_new_cached_update(trans, path, i, flags, ip); + return 0; } +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) +{ + return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); +} + int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { @@ -1446,6 +1520,9 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter k->k.type = KEY_TYPE_whiteout; } + /* + * Ensure that updates to cached btrees go to the key cache: + */ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && !path->cached && !path->level && @@ -1465,20 +1542,18 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter _THIS_IP_); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL); + BTREE_ITER_CACHED); if (unlikely(ret)) return ret; ck = (void *) iter->key_cache_path->l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); - btree_trans_restart(trans); - return -EINTR; + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, 
BCH_ERR_transaction_restart_key_cache_raced); } - iter->key_cache_path->should_be_locked = true; + btree_path_set_should_be_locked(iter->key_cache_path); } path = iter->key_cache_path; @@ -1524,8 +1599,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, __bch2_btree_insert(&trans, id, k)); } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) { struct bkey_i *k; @@ -1535,28 +1610,39 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k->k); k->k.p = iter->pos; + bch2_key_resize(&k->k, len); return bch2_trans_update(trans, iter, k, update_flags); } +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, - unsigned iter_flags, + unsigned update_flags, u64 *journal_seq) { + u32 restart_count = trans->restart_count; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); -retry: - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(&iter)).k) && - !(ret = bkey_err(k)) && - bkey_cmp(iter.pos, end) < 0) { + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + while ((k = bch2_btree_iter_peek(&iter)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(iter.pos, end) >= 0) + break; + bkey_init(&delete.k); /* @@ -1585,23 +1671,31 @@ retry: ret = bch2_extent_trim_atomic(trans, &iter, &delete); if (ret) - break; + goto err; } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + ret = bch2_trans_update(trans, &iter, &delete, 
update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; if (ret) break; } - - if (ret == -EINTR) { - ret = 0; - goto retry; - } - bch2_trans_iter_exit(trans, &iter); + + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; return ret; } @@ -1612,10 +1706,40 @@ retry: */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, - unsigned iter_flags, + unsigned update_flags, u64 *journal_seq) { - return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, - iter_flags, journal_seq)); + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +} + +int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) +{ + unsigned len = strlen(msg); + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + struct jset_entry_log *l; + int ret; + + ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); + if (ret) + return ret; + + l = (void *) &darray_top(trans->extra_journal_entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, msg, len); + while (len & 7) + l->d[len++] = '\0'; + + trans->extra_journal_entries.nr += jset_u64s(u64s); + return 0; } diff --git a/libbcachefs/buckets.c 
b/libbcachefs/buckets.c index eb0eaa9..116711f 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -7,6 +7,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "backpointers.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -88,20 +89,17 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, : ca->usage[journal_seq & JOURNAL_BUF_MASK]); } -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { struct bch_fs *c = ca->fs; - struct bch_dev_usage ret; unsigned seq, i, u64s = dev_usage_u64s(); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); + memcpy(usage, ca->usage_base, u64s * sizeof(u64)); for (i = 0; i < ARRAY_SIZE(ca->usage); i++) - acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; } static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, @@ -197,26 +195,26 @@ void bch2_fs_usage_to_text(struct printbuf *out, { unsigned i; - pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); + prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); - pr_buf(out, "hidden:\t\t\t\t%llu\n", + prt_printf(out, "hidden:\t\t\t\t%llu\n", fs_usage->u.hidden); - pr_buf(out, "data:\t\t\t\t%llu\n", + prt_printf(out, "data:\t\t\t\t%llu\n", fs_usage->u.data); - pr_buf(out, "cached:\t\t\t\t%llu\n", + prt_printf(out, "cached:\t\t\t\t%llu\n", fs_usage->u.cached); - pr_buf(out, "reserved:\t\t\t%llu\n", + prt_printf(out, "reserved:\t\t\t%llu\n", fs_usage->u.reserved); - pr_buf(out, "nr_inodes:\t\t\t%llu\n", + prt_printf(out, "nr_inodes:\t\t\t%llu\n", fs_usage->u.nr_inodes); - pr_buf(out, "online reserved:\t\t%llu\n", + prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); for (i = 0; i < 
ARRAY_SIZE(fs_usage->u.persistent_reserved); i++) { - pr_buf(out, "%u replicas:\n", i + 1); - pr_buf(out, "\treserved:\t\t%llu\n", + prt_printf(out, "%u replicas:\n", i + 1); + prt_printf(out, "\treserved:\t\t%llu\n", fs_usage->u.persistent_reserved[i]); } @@ -224,9 +222,9 @@ void bch2_fs_usage_to_text(struct printbuf *out, struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - pr_buf(out, "\t"); + prt_printf(out, "\t"); bch2_replicas_entry_to_text(out, e); - pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]); + prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); } } @@ -279,44 +277,22 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -static inline int is_unavailable_bucket(struct bucket_mark m) +void bch2_dev_usage_init(struct bch_dev *ca) { - return !is_available_bucket(m); + ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; } static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bucket_mark m) + struct bch_alloc_v4 a) { - return m.dirty_sectors - ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) : 0; } -static inline int is_stripe_data_bucket(struct bucket_mark m) -{ - return m.stripe && m.data_type != BCH_DATA_parity; -} - -static inline enum bch_data_type bucket_type(struct bucket_mark m) -{ - return m.cached_sectors && !m.dirty_sectors - ? 
BCH_DATA_cached - : m.data_type; -} - -static inline void account_bucket(struct bch_fs_usage *fs_usage, - struct bch_dev_usage *dev_usage, - enum bch_data_type type, - int nr, s64 size) -{ - if (type == BCH_DATA_sb || type == BCH_DATA_journal) - fs_usage->hidden += size; - - dev_usage->d[type].buckets += nr; -} - static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new, + struct bch_alloc_v4 old, + struct bch_alloc_v4 new, u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; @@ -324,32 +300,52 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - u = dev_usage_ptr(ca, journal_seq, gc); - if (bucket_type(old)) - account_bucket(fs_usage, u, bucket_type(old), - -1, -ca->mi.bucket_size); + if (data_type_is_hidden(old.data_type)) + fs_usage->hidden -= ca->mi.bucket_size; + if (data_type_is_hidden(new.data_type)) + fs_usage->hidden += ca->mi.bucket_size; - if (bucket_type(new)) - account_bucket(fs_usage, u, bucket_type(new), - 1, ca->mi.bucket_size); + u = dev_usage_ptr(ca, journal_seq, gc); - u->buckets_ec += (int) new.stripe - (int) old.stripe; - u->buckets_unavailable += - is_unavailable_bucket(new) - is_unavailable_bucket(old); + u->d[old.data_type].buckets--; + u->d[new.data_type].buckets++; + + u->buckets_ec -= (int) !!old.stripe; + u->buckets_ec += (int) !!new.stripe; u->d[old.data_type].sectors -= old.dirty_sectors; u->d[new.data_type].sectors += new.dirty_sectors; - u->d[BCH_DATA_cached].sectors += - (int) new.cached_sectors - (int) old.cached_sectors; + + u->d[BCH_DATA_cached].sectors += new.cached_sectors; + u->d[BCH_DATA_cached].sectors -= old.cached_sectors; u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); preempt_enable(); +} + +static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct 
bucket old, struct bucket new, + u64 journal_seq, bool gc) +{ + struct bch_alloc_v4 old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; + struct bch_alloc_v4 new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, + .cached_sectors = new.cached_sectors, + .stripe = new.stripe, + }; - if (!is_available_bucket(old) && is_available_bucket(new)) - bch2_wake_allocator(ca); + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); } static inline int __update_replicas(struct bch_fs *c, @@ -373,22 +369,22 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, { struct bch_fs_usage __percpu *fs_usage; int idx, ret = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; percpu_down_read(&c->mark_lock); + buf.atomic++; idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err(c, "no replicas entry\n" - " while marking %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { + fsck_err(c, "no replicas entry\n" + " while marking %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { percpu_up_read(&c->mark_lock); ret = bch2_mark_replicas(c, r); - if (ret) - return ret; - percpu_down_read(&c->mark_lock); + + if (ret) + goto err; idx = bch2_replicas_entry_idx(c, r); } if (idx < 0) { @@ -404,6 +400,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, err: fsck_err: percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } @@ -467,7 +464,8 @@ static inline void update_replicas_list(struct btree_trans *trans, n = (void *) d->d + d->used; n->delta = sectors; - memcpy(&n->r, r, replicas_entry_bytes(r)); + memcpy((void *) n + offsetof(struct replicas_delta, r), + r, replicas_entry_bytes(r)); bch2_replicas_entry_sort(&n->r); d->used += b; } @@ -482,31 +480,15 @@ static inline void update_cached_sectors_list(struct 
btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator) -{ - struct bucket *g = bucket(ca, b); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - new.owned_by_allocator = owned_by_allocator; - })); - - BUG_ON(owned_by_allocator == old.owned_by_allocator); -} - -static int bch2_mark_alloc(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); - struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); + struct bch_alloc_v4 old_a, new_a; struct bch_dev *ca; - struct bucket *g; - struct bucket_mark old_m, m; int ret = 0; /* @@ -516,11 +498,20 @@ static int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; + if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or bucket")) + return -EIO; + + ca = bch_dev_bkey_exists(c, new.k->p.inode); + + bch2_alloc_to_v4(old, &old_a); + bch2_alloc_to_v4(new, &new_a); + if ((flags & BTREE_TRIGGER_INSERT) && - !old_u.data_type != !new_u.data_type && - new.k->type == KEY_TYPE_alloc_v3) { - struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; - u64 old_journal_seq = le64_to_cpu(v->journal_seq); + data_type_is_empty(old_a.data_type) != + data_type_is_empty(new_a.data_type) && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; BUG_ON(!journal_seq); @@ -529,18 +520,20 @@ static int bch2_mark_alloc(struct btree_trans *trans, * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ - 
new_u.journal_seq = !new_u.data_type && - (journal_seq == old_journal_seq || - bch2_journal_noflush_seq(&c->journal, old_journal_seq)) + new_a.journal_seq = data_type_is_empty(new_a.data_type) && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ? 0 : journal_seq; - v->journal_seq = cpu_to_le64(new_u.journal_seq); + v->journal_seq = new_a.journal_seq; } - if (old_u.data_type && !new_u.data_type && new_u.journal_seq) { + if (!data_type_is_empty(old_a.data_type) && + data_type_is_empty(new_a.data_type) && + new_a.journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, - new_u.dev, new_u.bucket, - new_u.journal_seq); + new.k->p.inode, new.k->p.offset, + new_a.journal_seq); if (ret) { bch2_fs_fatal_error(c, "error setting bucket_needs_journal_commit: %i", ret); @@ -548,33 +541,27 @@ static int bch2_mark_alloc(struct btree_trans *trans, } } - ca = bch_dev_bkey_exists(c, new_u.dev); + percpu_down_read(&c->mark_lock); + if (!gc && new_a.gen != old_a.gen) + *bucket_gen(ca, new.k->p.offset) = new_a.gen; - if (new_u.bucket >= ca->mi.nbuckets) - return 0; + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); - percpu_down_read(&c->mark_lock); - if (!gc && new_u.gen != old_u.gen) - *bucket_gen(ca, new_u.bucket) = new_u.gen; - - g = __bucket(ca, new_u.bucket, gc); - - old_m = bucket_cmpxchg(g, m, ({ - m.gen = new_u.gen; - m.data_type = new_u.data_type; - m.dirty_sectors = new_u.dirty_sectors; - m.cached_sectors = new_u.cached_sectors; - m.stripe = new_u.stripe != 0; - })); - - bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); - - g->io_time[READ] = new_u.read_time; - g->io_time[WRITE] = new_u.write_time; - g->oldest_gen = new_u.oldest_gen; - g->gen_valid = 1; - g->stripe = new_u.stripe; - g->stripe_redundancy = new_u.stripe_redundancy; + if (gc) { + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; + g->gen = 
new_a.gen; + g->data_type = new_a.data_type; + g->stripe = new_a.stripe; + g->stripe_redundancy = new_a.stripe_redundancy; + g->dirty_sectors = new_a.dirty_sectors; + g->cached_sectors = new_a.cached_sectors; + + bucket_unlock(g); + } percpu_up_read(&c->mark_lock); /* @@ -583,40 +570,42 @@ static int bch2_mark_alloc(struct btree_trans *trans, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_m.cached_sectors) { + old_a.cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -old_m.cached_sectors, + -((s64) old_a.cached_sectors), journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); return ret; } - - trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), - old_m.cached_sectors); } + if (new_a.data_type == BCH_DATA_free && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a.data_type == BCH_DATA_need_discard && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a.data_type != BCH_DATA_cached && + new_a.data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a.data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + return 0; } -#define checked_add(a, b) \ -({ \ - unsigned _res = (unsigned) (a) + (b); \ - bool overflow = _res > U16_MAX; \ - if (overflow) \ - _res = U16_MAX; \ - (a) = _res; \ - overflow; \ -}) - -void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, struct gc_pos pos, - unsigned flags) +int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) { - struct bucket *g; - struct bucket_mark old, new; - bool overflow; + struct bucket old, new, *g; + int ret = 0; BUG_ON(!(flags 
& BTREE_TRIGGER_GC)); BUG_ON(data_type != BCH_DATA_sb && @@ -626,40 +615,42 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, * Backup superblock might be past the end of our normal usable space: */ if (b >= ca->mi.nbuckets) - return; + return 0; percpu_down_read(&c->mark_lock); g = gc_bucket(ca, b); - old = bucket_cmpxchg(g, new, ({ - new.data_type = data_type; - overflow = checked_add(new.dirty_sectors, sectors); - })); - - bch2_fs_inconsistent_on(old.data_type && - old.data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[data_type]); - - bch2_fs_inconsistent_on(overflow, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", - ca->dev_idx, b, new.gen, - bch2_data_types[old.data_type ?: data_type], - old.dirty_sectors, sectors); - - bch2_dev_usage_update(c, ca, old, new, 0, true); - percpu_up_read(&c->mark_lock); -} -static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -{ - EBUG_ON(sectors < 0); + bucket_lock(g); + old = *g; + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[g->data_type], + bch2_data_types[data_type])) { + ret = -EIO; + goto err; + } + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_types[g->data_type ?: data_type], + g->dirty_sectors, sectors)) { + ret = -EIO; + goto err; + } + - return p.crc.compression_type && - p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible - ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) - : sectors; + g->data_type = data_type; + g->dirty_sectors += sectors; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); + return ret; } static int check_bucket_ref(struct bch_fs *c, @@ -667,14 +658,22 @@ static int check_bucket_ref(struct bch_fs *c, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 b_gen, u8 bucket_data_type, - u16 dirty_sectors, u16 cached_sectors) + u32 dirty_sectors, u32 cached_sectors) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); u16 bucket_sectors = !ptr->cached ? dirty_sectors : cached_sectors; - char buf[200]; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (bucket_data_type == BCH_DATA_cached) + bucket_data_type = BCH_DATA_user; + + if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || + (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) + bucket_data_type = ptr_data_type = BCH_DATA_stripe; if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -683,8 +682,9 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { @@ -694,8 +694,10 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } if (b_gen != ptr->gen && !ptr->cached) { @@ -706,14 +708,19 @@ static int check_bucket_ref(struct bch_fs 
*c, *bucket_gen(ca, bucket_nr), bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if (b_gen != ptr->gen) - return 1; + if (b_gen != ptr->gen) { + ret = 1; + goto err; + } - if (bucket_data_type && ptr_data_type && + if (!data_type_is_empty(bucket_data_type) && + ptr_data_type && bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" @@ -721,22 +728,27 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { + if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], bucket_sectors, sectors, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - - return 0; +err: + printbuf_exit(&buf); + return ret; } static int mark_stripe_bucket(struct btree_trans *trans, @@ -753,9 +765,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g; - struct bucket_mark new, old; - char buf[200]; + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -763,40 +774,42 @@ static int mark_stripe_bucket(struct btree_trans *trans, /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); + buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); - if (g->mark.dirty_sectors || + if (g->dirty_sectors || (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EINVAL; goto err; } - old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, sectors, data_type, - new.gen, new.data_type, - new.dirty_sectors, new.cached_sectors); - if (ret) - goto err; + bucket_lock(g); + old = *g; - new.dirty_sectors += sectors; - if (data_type) - new.data_type = data_type; + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + g->gen, g->data_type, + g->dirty_sectors, g->cached_sectors); + if (ret) + goto err; - new.stripe = true; - })); + if (data_type) + g->data_type = data_type; + g->dirty_sectors += sectors; g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + new = *g; err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); percpu_up_read(&c->mark_lock); - - return 0; + printbuf_exit(&buf); + return ret; } static int __mark_pointer(struct btree_trans *trans, @@ -804,9 +817,9 @@ static int __mark_pointer(struct btree_trans *trans, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 
bucket_gen, u8 *bucket_data_type, - u16 *dirty_sectors, u16 *cached_sectors) + u32 *dirty_sectors, u32 *cached_sectors) { - u16 *dst_sectors = !ptr->cached + u32 *dst_sectors = !ptr->cached ? dirty_sectors : cached_sectors; int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, @@ -830,43 +843,31 @@ static int bch2_mark_pointer(struct btree_trans *trans, { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g; + struct bucket old, new, *g; u8 bucket_data_type; - u64 v; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); percpu_down_read(&c->mark_lock); g = PTR_GC_BUCKET(ca, &p.ptr); - - v = atomic64_read(&g->_mark.v); - do { - new.v.counter = old.v.counter = v; - bucket_data_type = new.data_type; - - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, new.gen, - &bucket_data_type, - &new.dirty_sectors, - &new.cached_sectors); - if (ret) - goto err; - - new.data_type = bucket_data_type; - - if (flags & BTREE_TRIGGER_NOATOMIC) { - g->_mark = new; - break; - } - } while ((v = atomic64_cmpxchg(&g->_mark.v, - old.v.counter, - new.v.counter)) != old.v.counter); - - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); -err: + bucket_lock(g); + old = *g; + + bucket_data_type = g->data_type; + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (!ret) + g->data_type = bucket_data_type; + + new = *g; + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); percpu_up_read(&c->mark_lock); return ret; @@ -913,13 +914,13 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, return 0; } -static int bch2_mark_extent(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + 
unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -983,10 +984,11 @@ static int bch2_mark_extent(struct btree_trans *trans, if (r.e.nr_devs) { ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); if (ret) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); return ret; } } @@ -994,9 +996,9 @@ static int bch2_mark_extent(struct btree_trans *trans, return 0; } -static int bch2_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; @@ -1015,13 +1017,16 @@ static int bch2_mark_stripe(struct btree_trans *trans, struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m || (old_s && !m->alive)) { - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf1), c, old); - bch2_bkey_val_to_text(&PBUF(buf2), c, new); + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" "old %s\n" - "new %s", idx, buf1, buf2); + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); bch2_inconsistent_error(c); return -1; } @@ -1086,10 +1091,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, ((s64) m->sectors * m->nr_redundant), 
journal_seq, gc); if (ret) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); return ret; } } @@ -1097,19 +1103,19 @@ static int bch2_mark_stripe(struct btree_trans *trans, return 0; } -static int bch2_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bch_fs_usage __percpu *fs_usage; u64 journal_seq = trans->journal_res.seq; if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; + struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v2); + BUG_ON(new.k->type != KEY_TYPE_inode_v3); v->bi_journal_seq = cpu_to_le64(journal_seq); } @@ -1128,12 +1134,12 @@ static int bch2_mark_inode(struct btree_trans *trans, return 0; } -static int bch2_mark_reservation(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; struct bch_fs_usage __percpu *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; @@ -1160,18 +1166,24 @@ static int bch2_mark_reservation(struct btree_trans *trans, return 0; } -static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, u64 *idx, unsigned flags, size_t r_idx) { + struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 next_idx = end; s64 ret = 0; + struct printbuf buf = PRINTBUF; if (r_idx >= c->reflink_gc_nr) goto not_found; r = genradix_ptr(&c->reflink_gc_table, r_idx); - if (*idx < r->offset - r->size) + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) goto not_found; BUG_ON((s64) r->refcount + add < 0); @@ -1180,37 +1192,37 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, *idx = r->offset; return 0; not_found: - *idx = U64_MAX; - ret = -EIO; - - /* - * XXX: we're replacing the entire reflink pointer with an error - * key, we should just be replacing the part that was missing: - */ - if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { + if (fsck_err(c, "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { struct bkey_i_error new; bkey_init(&new.k); new.k.type = KEY_TYPE_error; - new.k.p = p.k->p; - new.k.size = p.k->size; - ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); + new.k.p = bkey_start_pos(p.k); + new.k.p.offset += *idx - start; + bch2_key_resize(&new.k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); } + + *idx = next_idx; fsck_err: + printbuf_exit(&buf); return ret; } -static int bch2_mark_reflink_p(struct 
btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = le64_to_cpu(p.v->idx), start = idx; u64 end = le64_to_cpu(p.v->idx) + p.k->size; int ret = 0; @@ -1234,73 +1246,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, } while (idx < end && !ret) - ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); - - return ret; -} - -int bch2_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) -{ - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; - - switch (k.k->type) { - case KEY_TYPE_alloc: - case KEY_TYPE_alloc_v2: - case KEY_TYPE_alloc_v3: - return bch2_mark_alloc(trans, old, new, flags); - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return bch2_mark_extent(trans, old, new, flags); - case KEY_TYPE_stripe: - return bch2_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - return bch2_mark_inode(trans, old, new, flags); - case KEY_TYPE_reservation: - return bch2_mark_reservation(trans, old, new, flags); - case KEY_TYPE_reflink_p: - return bch2_mark_reflink_p(trans, old, new, flags); - case KEY_TYPE_snapshot: - return bch2_mark_snapshot(trans, old, new, flags); - default: - return 0; - } -} - -int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *new, unsigned flags) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - int ret; 
- - _deleted.p = path->pos; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(path->btree_id)) - return 0; - - old = bch2_btree_path_peek_slot(path, &unpacked); - - if (old.k->type == new->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } + ret = __bch2_mark_reflink_p(trans, p, start, end, + &idx, flags, l++); return ret; } @@ -1312,33 +1259,26 @@ void fs_usage_apply_warn(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - char buf[200]; + struct printbuf buf = PRINTBUF; bch_err(c, "disk usage increased %lli more than %u sectors reserved", should_not_have_added, disk_res_sectors); trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + pr_err("while inserting"); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - pr_err("%s", buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err(" %s", buf.buf); pr_err("overlapping with"); - - if (!i->cached) { - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); - } else { - struct bkey_cached *ck = (void *) i->path->l[0].b; - - if (ck->valid) { - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); - pr_err("%s", buf); - } - } + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, old); + pr_err(" %s", buf.buf); } + __WARN(); + printbuf_exit(&buf); } int bch2_trans_fs_usage_apply(struct btree_trans *trans, @@ -1419,53 +1359,44 @@ need_mark: /* trans_mark: */ -static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - const struct 
bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, - POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - *u = bch2_alloc_unpack(k); - return 0; -} - static int bch2_trans_mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + unsigned flags) { + bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct btree_iter iter; - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 *a; + struct bpos bucket_pos; + struct bch_backpointer bp; + s64 sectors; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); + sectors = bp.bucket_len; + if (!insert) + sectors = -sectors; - ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, - u.gen, &u.data_type, - &u.dirty_sectors, &u.cached_sectors); - if (ret) - goto out; + a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); + if (IS_ERR(a)) + return PTR_ERR(a); - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors); if (ret) - goto out; -out: + goto err; + + if (!p.ptr.cached) { + ret = insert + ? 
bch2_bucket_backpointer_add(trans, a, bp, k) + : bch2_bucket_backpointer_del(trans, a, bp, k); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1474,7 +1405,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_stripe *s; @@ -1490,16 +1420,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto err; if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); - bch2_inconsistent_error(c); ret = -EIO; goto err; } if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = -EIO; @@ -1528,10 +1457,15 @@ err: return ret; } -static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? 
old + : bkey_i_to_s_c(new); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1556,8 +1490,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, if (flags & BTREE_TRIGGER_OVERWRITE) disk_sectors = -disk_sectors; - ret = bch2_trans_mark_pointer(trans, k, p, - disk_sectors, data_type); + ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); if (ret < 0) return ret; @@ -1593,7 +1526,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct btree_iter iter; - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 *a; enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ? BCH_DATA_parity : 0; s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; @@ -1602,59 +1535,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (deleting) sectors = -sectors; - ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, - u.gen, u.data_type, - u.dirty_sectors, u.cached_sectors); + a->v.gen, a->v.data_type, + a->v.dirty_sectors, a->v.cached_sectors); if (ret) goto err; if (!deleting) { - if (bch2_fs_inconsistent_on(u.stripe || - u.stripe_redundancy, c, + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], - u.dirty_sectors, - u.stripe, s.k->p.offset)) { + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { ret = -EIO; goto err; } - if (bch2_fs_inconsistent_on(data_type && 
u.dirty_sectors, c, + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], - u.dirty_sectors, + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, s.k->p.offset)) { ret = -EIO; goto err; } - u.stripe = s.k->p.offset; - u.stripe_redundancy = s.v->nr_redundant; + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; } else { - if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || - u.stripe_redundancy != s.v->nr_redundant, c, + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, u.gen, - s.k->p.offset, u.stripe)) { + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { ret = -EIO; goto err; } - u.stripe = 0; - u.stripe_redundancy = 0; + a->v.stripe = 0; + a->v.stripe_redundancy = 0; } - u.dirty_sectors += sectors; + a->v.dirty_sectors += sectors; if (data_type) - u.data_type = !deleting ? data_type : 0; + a->v.data_type = !deleting ? 
data_type : 0; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto err; err: @@ -1662,66 +1595,69 @@ err: return ret; } -static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_trans_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { - struct bkey_s_c_stripe old_s = { .k = NULL }; - struct bkey_s_c_stripe new_s = { .k = NULL }; + const struct bch_stripe *old_s = NULL; + struct bch_stripe *new_s = NULL; struct bch_replicas_padded r; unsigned i, nr_blocks; int ret = 0; if (old.k->type == KEY_TYPE_stripe) - old_s = bkey_s_c_to_stripe(old); - if (new.k->type == KEY_TYPE_stripe) - new_s = bkey_s_c_to_stripe(new); + old_s = bkey_s_c_to_stripe(old).v; + if (new->k.type == KEY_TYPE_stripe) + new_s = &bkey_i_to_stripe(new)->v; /* * If the pointers aren't changing, we don't need to do anything: */ - if (new_s.k && old_s.k && - new_s.v->nr_blocks == old_s.v->nr_blocks && - new_s.v->nr_redundant == old_s.v->nr_redundant && - !memcmp(old_s.v->ptrs, new_s.v->ptrs, - new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - BUG_ON(new_s.k && old_s.k && - (new_s.v->nr_blocks != old_s.v->nr_blocks || - new_s.v->nr_redundant != old_s.v->nr_redundant)); + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); - nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks; + nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; - if (new_s.k) { - s64 sectors = le16_to_cpu(new_s.v->sectors); + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); - bch2_bkey_to_replicas(&r.e, new); - update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); + update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); } - if (old_s.k) { - s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); } for (i = 0; i < nr_blocks; i++) { - if (new_s.k && old_s.k && - !memcmp(&new_s.v->ptrs[i], - &old_s.v->ptrs[i], - sizeof(new_s.v->ptrs[i]))) + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) continue; - if (new_s.k) { - ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); + if (new_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_i_to_s_c_stripe(new), i, false); if (ret) break; } - if (old_s.k) { - ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); + if (old_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); if (ret) break; } @@ -1730,12 +1666,13 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -static int bch2_trans_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { - int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); if (nr) { struct replicas_delta_list *d = @@ -1746,9 +1683,15 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, return 0; } -static int 
bch2_trans_mark_reservation(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; @@ -1776,7 +1719,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), @@ -1796,19 +1739,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { - bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); - bch2_fs_inconsistent(c, + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf); + *idx, buf.buf); ret = -EIO; goto err; } if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); - bch2_fs_inconsistent(c, + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf); + *idx, buf.buf); ret = -EIO; goto err; } @@ -1830,11 +1773,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, le64_add_cpu(refcount, add); - if (!*refcount) { - n->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&n->k, 0); - } - bch2_btree_iter_set_pos_to_extent_start(&iter); ret = bch2_trans_update(trans, &iter, n, 0); if (ret) @@ -1843,12 +1781,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, *idx = k.k->p.offset; err: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } 
-static int bch2_trans_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx, end_idx; int ret = 0; @@ -1869,31 +1814,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_s_c new, unsigned flags) -{ - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return bch2_trans_mark_extent(trans, k, flags); - case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - return bch2_trans_mark_inode(trans, old, new, flags); - case KEY_TYPE_reservation: - return bch2_trans_mark_reservation(trans, k, flags); - case KEY_TYPE_reflink_p: - return bch2_trans_mark_reflink_p(trans, k, flags); - default: - return 0; - } -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, @@ -1901,11 +1821,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_alloc_unpacked u; - struct bch_extent_ptr ptr = { - .dev = ca->dev_idx, - .offset = bucket_to_sector(ca, b), - }; + struct bkey_i_alloc_v4 *a; int ret = 0; /* @@ -1914,26 +1830,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (b >= ca->mi.nbuckets) return 0; - ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, 
&iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); - if (u.data_type && u.data_type != type) { + if (a->v.data_type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], bch2_data_types[type], bch2_data_types[type]); ret = -EIO; goto out; } - u.data_type = type; - u.dirty_sectors = sectors; + a->v.data_type = type; + a->v.dirty_sectors = sectors; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto out; out: @@ -1946,7 +1862,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { - return __bch2_trans_do(trans, NULL, NULL, 0, + return commit_do(trans, NULL, NULL, 0, __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); } @@ -2024,8 +1940,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - __bch2_trans_mark_dev_sb(&trans, ca)); + return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); } /* Disk reservations: */ @@ -2085,7 +2000,7 @@ recalculate: ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_disk_reservation; } mutex_unlock(&c->sectors_available_lock); @@ -2096,16 +2011,6 @@ recalculate: /* Startup/shutdown: */ -static void buckets_free_rcu(struct rcu_head *rcu) -{ - struct bucket_array *buckets = - container_of(rcu, struct bucket_array, rcu); - - kvpfree(buckets, - sizeof(*buckets) + - buckets->nbuckets * sizeof(struct bucket)); -} - static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = @@ -2116,46 
+2021,19 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bucket_array *buckets = NULL, *old_buckets = NULL; struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; - alloc_heap alloc_heap; - - size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); - size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), - btree_reserve * 2); - bool resize = ca->buckets[0] != NULL; + bool resize = ca->bucket_gens != NULL; int ret = -ENOMEM; - unsigned i; - - memset(&free, 0, sizeof(free)); - memset(&free_inc, 0, sizeof(free_inc)); - memset(&alloc_heap, 0, sizeof(alloc_heap)); - if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + - nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO)) || - !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO)) || (c->opts.buckets_nouse && !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO))) || - !init_fifo(&free[RESERVE_MOVINGGC], - copygc_reserve, GFP_KERNEL) || - !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) + GFP_KERNEL|__GFP_ZERO)))) goto err; - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = nbuckets; bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -2167,15 +2045,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) percpu_down_write(&c->mark_lock); } - old_buckets = 
bucket_array(ca); old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(buckets->nbuckets, old_buckets->nbuckets); + size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - memcpy(buckets->b, - old_buckets->b, - n * sizeof(struct bucket)); memcpy(bucket_gens->b, old_bucket_gens->b, n); @@ -2185,47 +2059,25 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets[0], buckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); - buckets = old_buckets; bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); + nbuckets = ca->mi.nbuckets; + if (resize) { percpu_up_write(&c->mark_lock); + up_write(&ca->bucket_lock); up_write(&c->gc_lock); } - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - fifo_move(&free[i], &ca->free[i]); - swap(ca->free[i], free[i]); - } - fifo_move(&free_inc, &ca->free_inc); - swap(ca->free_inc, free_inc); - spin_unlock(&c->freelist_lock); - - /* with gc lock held, alloc_heap can't be in use: */ - swap(ca->alloc_heap, alloc_heap); - - nbuckets = ca->mi.nbuckets; - - if (resize) - up_write(&ca->bucket_lock); - ret = 0; err: - free_heap(&alloc_heap); - free_fifo(&free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); - if (buckets) - call_rcu(&buckets->rcu, buckets_free_rcu); return ret; } @@ -2234,17 +2086,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->alloc_heap); - free_fifo(&ca->free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), sizeof(struct bucket_gens) + ca->mi.nbuckets); - 
kvpfree(rcu_dereference_protected(ca->buckets[0], 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); for (i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); @@ -2265,5 +2110,5 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) return -ENOMEM; } - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 7c6c59c..56c06cc 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -9,58 +9,39 @@ #define _BUCKETS_H #include "buckets_types.h" +#include "extents.h" #include "super.h" #define for_each_bucket(_b, _buckets) \ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -#define bucket_cmpxchg(g, new, expr) \ -({ \ - struct bucket *_g = g; \ - u64 _v = atomic64_read(&(g)->_mark.v); \ - struct bucket_mark _old; \ - \ - do { \ - (new).v.counter = _old.v.counter = _v; \ - expr; \ - } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ - _old.v.counter, \ - (new).v.counter)) != _old.v.counter);\ - _old; \ -}) - -static inline struct bucket_array *__bucket_array(struct bch_dev *ca, - bool gc) +static inline void bucket_unlock(struct bucket *b) { - return rcu_dereference_check(ca->buckets[gc], - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || - lockdep_is_held(&ca->bucket_lock)); + smp_store_release(&b->lock, 0); } -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline void bucket_lock(struct bucket *b) { - return __bucket_array(ca, false); + while (xchg(&b->lock, 1)) + cpu_relax(); } -static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) +static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) { - struct bucket_array *buckets = __bucket_array(ca, gc); - - BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); - return 
buckets->b + b; + return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); } static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return __bucket(ca, b, true); -} + struct bucket_array *buckets = gc_bucket_array(ca); -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -{ - return __bucket(ca, b, false); + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) @@ -70,7 +51,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); - } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) @@ -81,20 +61,27 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. 
- */ - -static inline u8 bucket_gc_gen(struct bucket *g) +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - return g->mark.gen - g->oldest_gen; + return sector_to_bucket(ca, ptr->offset); } -static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, +static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, const struct bch_extent_ptr *ptr) { - return sector_to_bucket(ca, ptr->offset); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, + const struct bch_extent_ptr *ptr, + u32 *bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); } static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, @@ -106,13 +93,22 @@ static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, static inline enum bch_data_type ptr_data_type(const struct bkey *k, const struct bch_extent_ptr *ptr) { - if (k->type == KEY_TYPE_btree_ptr || - k->type == KEY_TYPE_btree_ptr_v2) + if (bkey_is_btree_ptr(k)) return BCH_DATA_btree; return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } +static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +{ + EBUG_ON(sectors < 0); + + return crc_is_compressed(p.crc) + ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; +} + static inline int gen_cmp(u8 a, u8 b) { return (s8) (a - b); @@ -141,62 +137,73 @@ static inline u8 ptr_stale(struct bch_dev *ca, return ret; } -/* bucket gc marks */ +/* Device usage: */ -static inline bool is_available_bucket(struct bucket_mark mark) +void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); +static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { - return !mark.dirty_sectors && !mark.stripe; -} + struct bch_dev_usage ret; -/* Device usage: */ + bch2_dev_usage_read_fast(ca, &ret); + return ret; +} -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +void bch2_dev_usage_init(struct bch_dev *); -static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage stats) +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) { - u64 total = ca->mi.nbuckets - ca->mi.first_bucket; - - if (WARN_ONCE(stats.buckets_unavailable > total, - "buckets_unavailable overflow (%llu > %llu)\n", - stats.buckets_unavailable, total)) - return 0; - - return total - stats.buckets_unavailable; + s64 reserved = 0; + + switch (reserve) { + case RESERVE_none: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case RESERVE_movinggc: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_btree: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_btree_movinggc: + break; + } + + return reserved; } -static inline u64 dev_buckets_available(struct bch_dev *ca) +static inline u64 dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage usage, + enum alloc_reserve reserve) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets - + ca->nr_open_buckets - + bch2_dev_buckets_reserved(ca, reserve)); } -static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, - struct bch_dev_usage stats) 
+static inline u64 __dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage usage, + enum alloc_reserve reserve) { - struct bch_fs *c = ca->fs; - s64 available = __dev_buckets_available(ca, stats); - unsigned i; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - available -= fifo_used(&ca->free[i]); - available -= fifo_used(&ca->free_inc); - available -= ca->nr_open_buckets; - spin_unlock(&c->freelist_lock); - - return max(available, 0LL); + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets + + usage.d[BCH_DATA_cached].buckets + + usage.d[BCH_DATA_need_gc_gens].buckets + + usage.d[BCH_DATA_need_discard].buckets + - ca->nr_open_buckets + - bch2_dev_buckets_reserved(ca, reserve)); } -static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_dev *ca, + enum alloc_reserve reserve) { - return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); } /* Filesystem usage: */ static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + READ_ONCE(c->replicas.nr); } @@ -224,18 +231,23 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); -void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - size_t, enum bch_data_type, unsigned, - struct gc_pos, unsigned); +int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); -int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int 
bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_path *, - struct bkey_i *, unsigned); +int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 2c73dc6..1dbba7d 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -7,32 +7,15 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 -struct bucket_mark { - union { - atomic64_t v; - - struct { - u8 gen; - u8 data_type:3, - owned_by_allocator:1, - stripe:1; - u16 dirty_sectors; - u16 cached_sectors; - }; - }; -}; - struct bucket { - union { - struct bucket_mark _mark; - const struct bucket_mark mark; - }; - - u64 io_time[2]; - u8 oldest_gen; - unsigned gen_valid:1; - u8 stripe_redundancy; - u32 stripe; + u8 lock; + u8 gen_valid:1; + u8 data_type:7; + u8 gen; + u8 stripe_redundancy; + u32 stripe; + u32 dirty_sectors; + u32 cached_sectors; }; struct bucket_array { @@ -51,7 +34,6 
@@ struct bucket_gens { struct bch_dev_usage { u64 buckets_ec; - u64 buckets_unavailable; struct { u64 buckets; @@ -111,9 +93,9 @@ struct copygc_heap_entry { u8 dev; u8 gen; u8 replicas; - u16 fragmentation; + u32 fragmentation; u32 sectors; - u64 offset; + u64 bucket; }; typedef HEAP(struct copygc_heap_entry) copygc_heap; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index aa26588..dbb7e5e 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -501,13 +501,12 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; - arg.ec_buckets = src.buckets_ec; - arg.ec_sectors = 0; + arg.buckets_ec = src.buckets_ec; for (i = 0; i < BCH_DATA_NR; i++) { - arg.buckets[i] = src.d[i].buckets; - arg.sectors[i] = src.d[i].sectors; + arg.d[i].buckets = src.d[i].buckets; + arg.d[i].sectors = src.d[i].sectors; + arg.d[i].fragmented = src.d[i].fragmented; } percpu_ref_put(&ca->ref); diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index a1d8992..3268e8d 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "checksum.h" +#include "errcode.h" #include "super.h" #include "super-io.h" @@ -93,9 +94,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - struct scatterlist *sg, size_t len) +static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; @@ -104,17 +105,51 @@ static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); - BUG_ON(ret); + 
if (ret) + pr_err("got error %i from crypto_skcipher_encrypt()", ret); + + return ret; } -static inline void do_encrypt(struct crypto_sync_skcipher *tfm, +static inline int do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { - struct scatterlist sg; + if (!is_vmalloc_addr(buf)) { + struct scatterlist sg; + + sg_init_table(&sg, 1); + sg_set_page(&sg, + is_vmalloc_addr(buf) + ? vmalloc_to_page(buf) + : virt_to_page(buf), + len, offset_in_page(buf)); + return do_encrypt_sg(tfm, nonce, &sg, len); + } else { + unsigned pages = buf_pages(buf, len); + struct scatterlist *sg; + size_t orig_len = len; + int ret, i; + + sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); + if (!sg) + return -ENOMEM; + + sg_init_table(sg, pages); + + for (i = 0; i < pages; i++) { + unsigned offset = offset_in_page(buf); + unsigned pg_len = min(len, PAGE_SIZE - offset); + + sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); + buf += pg_len; + len -= pg_len; + } - sg_init_one(&sg, buf, len); - do_encrypt_sg(tfm, nonce, &sg, len); + ret = do_encrypt_sg(tfm, nonce, sg, orig_len); + kfree(sg); + return ret; + } } int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, @@ -136,25 +171,29 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, goto err; } - do_encrypt(chacha20, nonce, buf, len); + ret = do_encrypt(chacha20, nonce, buf, len); err: crypto_free_sync_skcipher(chacha20); return ret; } -static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, - struct nonce nonce) +static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) { u8 key[POLY1305_KEY_SIZE]; + int ret; nonce.d[3] ^= BCH_NONCE_POLY; memset(key, 0, sizeof(key)); - do_encrypt(c->chacha20, nonce, key, sizeof(key)); + ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); + if (ret) + return ret; desc->tfm = c->poly1305; crypto_shash_init(desc); crypto_shash_update(desc, key, sizeof(key)); + return 0; } struct bch_csum 
bch2_checksum(struct bch_fs *c, unsigned type, @@ -196,13 +235,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, } } -void bch2_encrypt(struct bch_fs *c, unsigned type, +int bch2_encrypt(struct bch_fs *c, unsigned type, struct nonce nonce, void *data, size_t len) { if (!bch2_csum_type_is_encryption(type)) - return; + return 0; - do_encrypt(c->chacha20, nonce, data, len); + return do_encrypt(c->chacha20, nonce, data, len); } static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -277,23 +316,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, return __bch2_checksum_bio(c, type, nonce, bio, &iter); } -void bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) +int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) { struct bio_vec bv; struct bvec_iter iter; struct scatterlist sgl[16], *sg = sgl; size_t bytes = 0; + int ret = 0; if (!bch2_csum_type_is_encryption(type)) - return; + return 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); bio_for_each_segment(bv, bio, iter) { if (sg == sgl + ARRAY_SIZE(sgl)) { sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + + ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (ret) + return ret; nonce = nonce_add(nonce, bytes); bytes = 0; @@ -307,7 +350,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, } sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, @@ -383,8 +426,17 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, merged = bch2_checksum_bio(c, crc_old.csum_type, extent_nonce(version, crc_old), bio); - if (bch2_crc_cmp(merged, crc_old.csum)) + if (bch2_crc_cmp(merged, crc_old.csum)) { + bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx 
(old type %s new type %s)", + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo, + bch2_csum_types[crc_old.csum_type], + bch2_csum_types[new_csum_type]); return -EIO; + } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { if (i->crc) @@ -413,7 +465,7 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) const struct user_key_payload *ukp; int ret; - keyring_key = request_key(&key_type_logon, key_description, NULL); + keyring_key = request_key(&key_type_user, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -451,13 +503,15 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) int bch2_request_key(struct bch_sb *sb, struct bch_key *key) { - char key_description[60]; - char uuid[40]; + struct printbuf key_description = PRINTBUF; + int ret; - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(key_description, "bcachefs:%s", uuid); + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); - return __bch2_request_key(key_description, key); + ret = __bch2_request_key(key_description.buf, key); + printbuf_exit(&key_description); + return ret; } int bch2_decrypt_sb_key(struct bch_fs *c, @@ -474,7 +528,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { - bch_err(c, "error requesting encryption key: %i", ret); + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); goto err; } @@ -499,20 +553,24 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { + int ret; + if (!c->chacha20) c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); - if (IS_ERR(c->chacha20)) { - bch_err(c, "error requesting chacha20 module: %li", - PTR_ERR(c->chacha20)); - return PTR_ERR(c->chacha20); + ret = PTR_ERR_OR_ZERO(c->chacha20); + + if (ret) { + bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); + return ret; } if (!c->poly1305) c->poly1305 = 
crypto_alloc_shash("poly1305", 0, 0); - if (IS_ERR(c->poly1305)) { - bch_err(c, "error requesting poly1305 module: %li", - PTR_ERR(c->poly1305)); - return PTR_ERR(c->poly1305); + ret = PTR_ERR_OR_ZERO(c->poly1305); + + if (ret) { + bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); + return ret; } return 0; @@ -573,7 +631,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) if (keyed) { ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { - bch_err(c, "error requesting encryption key: %i", ret); + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); goto err; } @@ -625,9 +683,9 @@ int bch2_fs_encryption_init(struct bch_fs *c) pr_verbose_init(c->opts, ""); c->sha256 = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(c->sha256)) { - bch_err(c, "error requesting sha256 module"); - ret = PTR_ERR(c->sha256); + ret = PTR_ERR_OR_ZERO(c->sha256); + if (ret) { + bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); goto out; } diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index f5c1a60..c86c3c0 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); -void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, +int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, @@ -61,8 +61,8 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, struct bch_extent_crc_unpacked *, unsigned, unsigned, unsigned); -void bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); +int bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); diff --git 
a/libbcachefs/clock.c b/libbcachefs/clock.c index 4324cfe..f3ffdbc 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -161,7 +161,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) - pr_buf(out, "%ps:\t%li\n", + prt_printf(out, "%ps:\t%li\n", clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 8e4179d..2b7080b 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -197,9 +197,9 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, goto err; workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); - ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = ZSTD_decompressDCtx(ctx, + ret = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); @@ -333,8 +333,8 @@ static int attempt_compress(struct bch_fs *c, return strm.total_out; } case BCH_COMPRESSION_TYPE_zstd: { - ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, - ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, + zstd_cctx_workspace_bound(&c->zstd_params.cParams)); /* * ZSTD requires that when we decompress we pass in the exact @@ -347,11 +347,11 @@ static int attempt_compress(struct bch_fs *c, * factor (7 bytes) from the dst buffer size to account for * that. 
*/ - size_t len = ZSTD_compressCCtx(ctx, + size_t len = zstd_compress_cctx(ctx, dst + 4, dst_len - 4 - 7, src, src_len, - c->zstd_params); - if (ZSTD_isError(len)) + &c->zstd_params); + if (zstd_is_error(len)) return 0; *((__le32 *) dst) = cpu_to_le32(len); @@ -377,7 +377,7 @@ static unsigned __bio_compress(struct bch_fs *c, /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) - return 0; + return BCH_COMPRESSION_TYPE_incompressible; dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); @@ -546,7 +546,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0); + ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); struct { unsigned feature; unsigned type; @@ -558,8 +558,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, - ZSTD_CCtxWorkspaceBound(params.cParams), - ZSTD_DCtxWorkspaceBound() }, + zstd_cctx_workspace_bound(&params.cParams), + zstd_dctx_workspace_bound() }, }, *i; int ret = 0; diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c new file mode 100644 index 0000000..edd1b25 --- /dev/null +++ b/libbcachefs/counters.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "super-io.h" +#include "counters.h" + +/* BCH_SB_FIELD_counters */ + +const char * const bch2_counter_names[] = { +#define x(t, n, ...)
(#t), + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; + +static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) +{ + if (!ctrs) + return 0; + + return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; +}; + +static int bch2_sb_counters_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + return 0; +}; + +void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + for (i = 0; i < nr; i++) { + if (i < BCH_COUNTER_NR) + prt_printf(out, "%s ", bch2_counter_names[i]); + else + prt_printf(out, "(unknown)"); + + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); + prt_newline(out); + }; +}; + +int bch2_sb_counters_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + u64 val = 0; + + for (i = 0; i < BCH_COUNTER_NR; i++) + c->counters_on_mount[i] = 0; + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { + val = le64_to_cpu(ctrs->d[i]); + percpu_u64_set(&c->counters[i], val); + c->counters_on_mount[i] = val; + } + return 0; +}; + +int bch2_sb_counters_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ret; + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + if (nr < BCH_COUNTER_NR) { + ret = bch2_sb_resize_counters(&c->disk_sb, + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); + + if (ret) { + ctrs = ret; + nr = bch2_sb_counter_nr_entries(ctrs); + } + } + + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) + ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + return 0; +} + +void bch2_fs_counters_exit(struct bch_fs *c) +{ + free_percpu(c->counters); +} + +int 
bch2_fs_counters_init(struct bch_fs *c) +{ + c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); + if (!c->counters) + return -ENOMEM; + + return bch2_sb_counters_to_cpu(c); +} + +const struct bch_sb_field_ops bch_sb_field_ops_counters = { + .validate = bch2_sb_counters_validate, + .to_text = bch2_sb_counters_to_text, +}; diff --git a/libbcachefs/counters.h b/libbcachefs/counters.h new file mode 100644 index 0000000..4778aa1 --- /dev/null +++ b/libbcachefs/counters.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COUNTERS_H +#define _BCACHEFS_COUNTERS_H + +#include "bcachefs.h" +#include "super-io.h" + + +int bch2_sb_counters_to_cpu(struct bch_fs *); +int bch2_sb_counters_from_cpu(struct bch_fs *); + +void bch2_fs_counters_exit(struct bch_fs *); +int bch2_fs_counters_init(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_counters; + +#endif // _BCACHEFS_COUNTERS_H diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h new file mode 100644 index 0000000..519ab9b --- /dev/null +++ b/libbcachefs/darray.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H + +/* + * Dynamic arrays: + * + * Inspired by CCAN's darray + */ + +#include "util.h" +#include <linux/slab.h> + +#define DARRAY(type) \ +struct { \ + size_t nr, size; \ + type *data; \ +} + +typedef DARRAY(void) darray_void; + +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +{ + if (d->nr + more > d->size) { + size_t new_size = roundup_pow_of_two(d->nr + more); + void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} + +#define darray_make_room(_d, _more) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + +#define darray_top(_d) ((_d).data[(_d).nr]) + +#define darray_push(_d, _item) \ +({ \ + int _ret = darray_make_room((_d),
1); \ + \ + if (!_ret) \ + (_d)->data[(_d)->nr++] = (_item); \ + _ret; \ +}) + +#define darray_insert_item(_d, _pos, _item) \ +({ \ + size_t pos = (_pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ + _ret; \ +}) + +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + +#define darray_init(_d) \ +do { \ + (_d)->data = NULL; \ + (_d)->nr = (_d)->size = 0; \ +} while (0) + +#define darray_exit(_d) \ +do { \ + kfree((_d)->data); \ + darray_init(_d); \ +} while (0) + +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c new file mode 100644 index 0000000..b75ff07 --- /dev/null +++ b/libbcachefs/data_update.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "data_update.h" +#include "ec.h" +#include "extents.h" +#include "io.h" +#include "keylist.h" +#include "move.h" +#include "subvolume.h" + +#include <trace/events/bcachefs.h> + +static int insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; + snapshot_id_list s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + + darray_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; + + if (!snapshot_t(c, old_pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (bkey_cmp(old_pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; + + if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) + continue; + + update =
bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = new_pos; + update->k.p.snapshot = k.k->p.snapshot; + + bch2_trans_iter_init(trans, &update_iter, id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + break; + + ret = snapshot_list_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + darray_exit(&s); + + return ret; +} + +static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + ptr->cached = true; +} + +static int bch2_data_update_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct data_update *m = + container_of(op, struct data_update, op); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; + + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + bch2_trans_iter_init(&trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { + struct bkey_s_c k; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + struct bkey_i *insert; + struct bkey_i_extent *new; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bpos next_pos; + bool did_work = false; + bool should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + unsigned i; + + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if 
(ret) + goto err; + + new = bkey_i_to_extent(bch2_keylist_front(keys)); + + if (!bch2_extents_match(k, old)) + goto nomatch; + + bkey_reassemble(_insert.k, k); + insert = _insert.k; + + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); + bch2_cut_front(iter.pos, &new->k_i); + + bch2_cut_front(iter.pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); + + /* + * @old: extent that we read from + * @insert: key that we're going to update, initialized from + * extent currently in btree - same as @old unless we raced with + * other updates + * @new: extent with new pointers that we'll be adding to @insert + * + * First, drop rewrite_ptrs from @insert: + */ + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + if (((1U << i) & m->data_opts.rewrite_ptrs) && + bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { + /* + * If we're going to be adding a pointer to the + * same device, we have to drop the old one - + * otherwise, we can just mark it cached: + */ + if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) + bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); + else + bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); + } + i++; + } + + + /* Add new ptrs: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { + /* + * raced with another move op?
extent already + * has a pointer to the device we just wrote + * data to + */ + continue; + } + + bch2_extent_ptr_decoded_append(insert, &p); + did_work = true; + } + + if (!did_work) + goto nomatch; + + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + + ret = bch2_sum_sector_overwrites(&trans, &iter, insert, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; + + if (disk_sectors_delta > (s64) op->res.sectors) { + ret = bch2_disk_reservation_add(c, &op->res, + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto out; + } + + next_pos = insert->k.p; + + ret = insert_snapshot_whiteouts(&trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish(&new->k); + } +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; +next: + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; + } + continue; +nomatch: + if (m->ctxt) { + BUG_ON(k.k->p.offset <= iter.pos.offset); + atomic64_inc(&m->ctxt->stats->keys_raced); + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->stats->sectors_raced); + } + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size); + trace_move_extent_race(&new->k); + + bch2_btree_iter_advance(&iter); + goto next; + } +out: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); + BUG_ON(bch2_err_matches(ret, 
BCH_ERR_transaction_restart)); + return ret; +} + +void bch2_data_update_read_done(struct data_update *m, + struct bch_extent_crc_unpacked crc, + struct closure *cl) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->op.crc = crc; + m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + + closure_call(&m->op.cl, bch2_write, NULL, cl); +} + +void bch2_data_update_exit(struct data_update *update) +{ + struct bch_fs *c = update->op.c; + + bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); +} + +int bch2_data_update_init(struct bch_fs *c, struct data_update *m, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + struct data_update_opts data_opts, + enum btree_id btree_id, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + int ret; + + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); + m->btree_id = btree_id; + m->data_opts = data_opts; + + bch2_write_op_init(&m->op, c, io_opts); + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->version; + m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL| + m->data_opts.write_flags; + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + m->op.alloc_reserve = RESERVE_movinggc; + m->op.index_update_fn = bch2_data_update_index_update; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (((1U << i) & m->data_opts.rewrite_ptrs) && + p.ptr.cached) + BUG(); + + if (!((1U << i) & m->data_opts.rewrite_ptrs)) + bch2_dev_list_add_dev(&m->op.devs_have, 
p.ptr.dev); + + if (((1U << i) & m->data_opts.rewrite_ptrs) && + crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + /* + * op->csum_type is normally initialized from the fs/file's + * current options - but if an extent is encrypted, we require + * that it stays encrypted: + */ + if (bch2_csum_type_is_encryption(p.crc.csum_type)) { + m->op.nonce = p.crc.nonce + p.crc.offset; + m->op.csum_type = p.crc.csum_type; + } + + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + m->op.incompressible = true; + + i++; + } + + if (reserve_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, + m->data_opts.extra_replicas + ? 0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } + + m->op.nr_replicas = m->op.nr_replicas_required = + hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; + + BUG_ON(!m->op.nr_replicas); + return 0; +} + +void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { + opts->kill_ptrs |= 1U << i; + opts->rewrite_ptrs ^= 1U << i; + } + + i++; + } +} diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h new file mode 100644 index 0000000..6793aa5 --- /dev/null +++ b/libbcachefs/data_update.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H +#define _BCACHEFS_DATA_UPDATE_H + +#include "bkey_buf.h" +#include "io_types.h" + +struct moving_context; + +struct data_update_opts { + unsigned rewrite_ptrs; + unsigned kill_ptrs; + u16 target; + u8 extra_replicas; + unsigned btree_insert_flags; + unsigned write_flags; +}; + +struct data_update { + /* extent being updated: */ + enum btree_id btree_id; + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; + struct bch_write_op op; 
+}; + +void bch2_data_update_read_done(struct data_update *, + struct bch_extent_crc_unpacked, + struct closure *); + +void bch2_data_update_exit(struct data_update *); +int bch2_data_update_init(struct bch_fs *, struct data_update *, + struct write_point_specifier, + struct bch_io_opts, struct data_update_opts, + enum btree_id, struct bkey_s_c); +void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); + +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index ee5b7f6..57602c8 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -11,6 +11,7 @@ #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_locking.h" #include "btree_update.h" #include "buckets.h" #include "debug.h" @@ -24,6 +25,7 @@ #include #include #include +#include #include #include @@ -43,11 +45,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, if (!bch2_dev_get_ioref(ca, READ)) return false; - bio = bio_alloc_bioset(GFP_NOIO, - buf_pages(n_sorted, btree_bytes(c)), - &c->btree_bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_READ|REQ_META; + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_sorted, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOIO, + &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_sorted, btree_bytes(c)); @@ -169,10 +171,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) failed |= bch2_btree_verify_replica(c, b, p); if (failed) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + printbuf_exit(&buf); } out: mutex_unlock(&c->verify_lock); @@ -184,23 +187,24 @@ out: /* XXX: bch_fs refcounting */ struct dump_iter { - 
struct bpos from; - struct bch_fs *c; + struct bch_fs *c; enum btree_id id; + struct bpos from; + struct bpos prev_node; + u64 iter; - char buf[1 << 12]; - size_t bytes; /* what's currently in buf */ + struct printbuf buf; char __user *ubuf; /* destination user buffer */ size_t size; /* size of requested read */ ssize_t ret; /* bytes read so far */ }; -static int flush_buf(struct dump_iter *i) +static ssize_t flush_buf(struct dump_iter *i) { - if (i->bytes) { - size_t bytes = min(i->bytes, i->size); - int err = copy_to_user(i->ubuf, i->buf, bytes); + if (i->buf.pos) { + size_t bytes = min_t(size_t, i->buf.pos, i->size); + int err = copy_to_user(i->ubuf, i->buf.buf, bytes) ? -EFAULT : 0; /* copy_to_user() returns bytes NOT copied, not an errno; a positive value here would be mistaken for i->ret */ if (err) return err; @@ -208,11 +212,11 @@ static int flush_buf(struct dump_iter *i) i->ret += bytes; i->ubuf += bytes; i->size -= bytes; - i->bytes -= bytes; - memmove(i->buf, i->buf + bytes, i->bytes); + i->buf.pos -= bytes; + memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); } - return 0; + return i->size ? 0 : i->ret; } static int bch2_dump_open(struct inode *inode, struct file *file) @@ -226,15 +230,20 @@ static int bch2_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->from = POS_MIN; + i->iter = 0; i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); i->id = bd->id; + i->buf = PRINTBUF; return 0; } static int bch2_dump_release(struct inode *inode, struct file *file) { - kfree(file->private_data); + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); return 0; } @@ -245,48 +254,33 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int err; + ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - return i->ret; - bch2_trans_init(&trans, i->c, 0, 0); - bch2_trans_iter_init(&trans, &iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - k =
bch2_btree_iter_peek(&iter); - - while (k.k && !(err = bkey_err(k))) { - bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); - i->bytes = strlen(i->buf); - BUG_ON(i->bytes >= sizeof(i->buf)); - i->buf[i->bytes] = '\n'; - i->bytes++; - - k = bch2_btree_iter_next(&iter); - i->from = iter.pos; - - err = flush_buf(i); - if (err) + ret = for_each_btree_key2(&trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = flush_buf(i); + if (ret) break; - if (!i->size) - break; - } - bch2_trans_iter_exit(&trans, &iter); + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + 0; + })); + i->from = iter.pos; + + if (!ret) + ret = flush_buf(i); bch2_trans_exit(&trans); - return err < 0 ? err : i->ret; + return ret ?: i->ret; } static const struct file_operations btree_debug_ops = { @@ -303,44 +297,39 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct btree *b; - int err; + ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - err = flush_buf(i); - if (err) - return err; + ret = flush_buf(i); + if (ret) + return ret; - if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) + if (!bpos_cmp(SPOS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { - bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); - i->bytes = strlen(i->buf); - err = flush_buf(i); - if (err) + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { + ret = flush_buf(i); + if (ret) break; - /* - * can't easily correctly restart a btree node traversal across - * all nodes, meh - */ + bch2_btree_node_to_text(&i->buf, i->c, b); i->from = bpos_cmp(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; - - if (!i->size) - break; } bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return err < 0 ? 
err : i->ret; + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; } static const struct file_operations btree_format_debug_ops = { @@ -357,75 +346,398 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct btree *prev_node = NULL; - int err; + ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - return i->ret; + ret = flush_buf(i); + if (ret) + return ret; bch2_trans_init(&trans, i->c, 0, 0); - bch2_trans_iter_init(&trans, &iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((k = bch2_btree_iter_peek(&iter)).k && - !(err = bkey_err(k))) { + ret = for_each_btree_key2(&trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct btree_path_level *l = &iter.path->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (l->b != prev_node) { - bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); - i->bytes = strlen(i->buf); - err = flush_buf(i); - if (err) - break; + ret = flush_buf(i); + if (ret) + break; + + if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; } - prev_node = l->b; - bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); - i->bytes = strlen(i->buf); + bch2_bfloat_to_text(&i->buf, l->b, _k); + 0; + })); + i->from = iter.pos; + + bch2_trans_exit(&trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_bfloat_failed, +}; + +static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "%px btree=%s l=%u ", + b, + 
bch2_btree_ids[b->c.btree_id], + b->c.level); + prt_newline(out); + + printbuf_indent_add(out, 2); + + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); + + prt_printf(out, "flags: "); + prt_tab(out); + prt_bitflags(out, bch2_btree_node_flags, b->flags); + prt_newline(out); + + prt_printf(out, "pcpu read locks: "); + prt_tab(out); + prt_printf(out, "%u", b->c.lock.readers != NULL); + prt_newline(out); + + prt_printf(out, "written:"); + prt_tab(out); + prt_printf(out, "%u", b->written); + prt_newline(out); + + prt_printf(out, "writes blocked:"); + prt_tab(out); + prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); + prt_newline(out); + + prt_printf(out, "will make reachable:"); + prt_tab(out); + prt_printf(out, "%lx", b->will_make_reachable); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[0].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[0].journal.seq); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[1].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[1].journal.seq); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + ret = flush_buf(i); + if (ret) + return ret; + + rcu_read_lock(); + i->buf.atomic++; + tbl = rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } + --i->buf.atomic; + rcu_read_unlock(); + } while (!done); + + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); 
+ + return ret ?: i->ret; +} + +static const struct file_operations cached_btree_nodes_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_cached_btree_nodes_read, +}; + +static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + ret = flush_buf(i); + if (ret) + break; /* user buffer full or error: leave the loop so btree_trans_lock is released below instead of returning with it held */ + + bch2_btree_trans_to_text(&i->buf, trans); + + prt_printf(&i->buf, "backtrace:"); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + bch2_prt_backtrace(&i->buf, trans->locking_wait.task); + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + + i->iter = trans->locking_wait.task->pid; + } + mutex_unlock(&c->btree_trans_lock); + + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_transactions_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_transactions_read, +}; + +static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { err = flush_buf(i); if (err) + return err; + + if (!i->size) break; - bch2_btree_iter_advance(&iter); - i->from = iter.pos; + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); + i->iter++; + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations
journal_pins_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_journal_pins_read, +}; + +static int lock_held_stats_open(struct inode *inode, struct file *file) +{ + struct bch_fs *c = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + + if (!i) + return -ENOMEM; + + i->iter = 0; + i->c = c; + i->buf = PRINTBUF; + file->private_data = i; + + return 0; +} + +static int lock_held_stats_release(struct inode *inode, struct file *file) +{ + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); + + return 0; +} + +static ssize_t lock_held_stats_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + while (1) { + struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; err = flush_buf(i); if (err) - break; + return err; if (!i->size) break; + + if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || + !bch2_btree_transaction_fns[i->iter]) + break; + + prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + + mutex_lock(&s->lock); + + prt_printf(&i->buf, "Max mem used: %u", s->max_mem); + prt_newline(&i->buf); + + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { + prt_printf(&i->buf, "Lock hold times:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); + printbuf_indent_sub(&i->buf, 2); + } + + if (s->max_paths_text) { + prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + prt_str_indented(&i->buf, s->max_paths_text); + printbuf_indent_sub(&i->buf, 2); + } + + mutex_unlock(&s->lock); + + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + 
i->iter++; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + if (i->buf.allocation_failure) + return -ENOMEM; - return err < 0 ? err : i->ret; + return i->ret; } -static const struct file_operations bfloat_failed_debug_ops = { +static const struct file_operations lock_held_stats_op = { + .owner = THIS_MODULE, + .open = lock_held_stats_open, + .release = lock_held_stats_release, + .read = lock_held_stats_read, +}; + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (i->iter) + goto out; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + ret = flush_buf(i); + if (ret) + break; /* user buffer full or error: leave the loop so btree_trans_lock is released below instead of returning with it held */ + + bch2_check_for_deadlock(trans, &i->buf); + + i->iter = trans->locking_wait.task->pid; + } + mutex_unlock(&c->btree_trans_lock); +out: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, - .read = bch2_read_bfloat_failed, + .read = bch2_btree_deadlock_read, }; void bch2_fs_debug_exit(struct bch_fs *c) { - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove_recursive(c->debug); + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) + debugfs_remove_recursive(c->fs_debug_dir); } void bch2_fs_debug_init(struct bch_fs *c) @@ -437,29 +749,48 @@ void bch2_fs_debug_init(struct bch_fs *c) return; snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - c->debug = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->debug)) + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->fs_debug_dir)) + return; + +
debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, + c->btree_debug, &cached_btree_nodes_ops); + + debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, + c->btree_debug, &btree_transactions_ops); + + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, + c, &lock_held_stats_op); + + debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, + c->btree_debug, &btree_deadlock_ops); + + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; for (bd = c->btree_debug; bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], - 0400, c->debug, bd, - &btree_debug_ops); + debugfs_create_file(bch2_btree_ids[bd->id], + 0400, c->btree_debug_dir, bd, + &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", bch2_btree_ids[bd->id]); - bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, - &btree_format_debug_ops); + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", bch2_btree_ids[bd->id]); - bd->failed = debugfs_create_file(name, 0400, c->debug, bd, - &bfloat_failed_debug_ops); + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &bfloat_failed_debug_ops); } } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 6f699b7..288f46b 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -83,38 +83,58 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; - if (bkey_val_bytes(k.k) < 
sizeof(struct bch_dirent)) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*d.v)); + return -EINVAL; + } len = bch2_dirent_name_bytes(d); - if (!len) - return "empty name"; + if (!len) { + prt_printf(err, "empty name"); + return -EINVAL; + } - if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) - return "value too big"; + if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), dirent_val_u64s(len)); + return -EINVAL; + } - if (len > BCH_NAME_MAX) - return "dirent name too big"; + if (len > BCH_NAME_MAX) { + prt_printf(err, "dirent name too big (%u > %u)", + len, BCH_NAME_MAX); + return -EINVAL; + } - if (len == 1 && !memcmp(d.v->d_name, ".", 1)) - return "invalid name"; + if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { + prt_printf(err, "invalid name"); + return -EINVAL; + } - if (len == 2 && !memcmp(d.v->d_name, "..", 2)) - return "invalid name"; + if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { + prt_printf(err, "invalid name"); + return -EINVAL; + } - if (memchr(d.v->d_name, '/', len)) - return "invalid name"; + if (memchr(d.v->d_name, '/', len)) { + prt_printf(err, "invalid name"); + return -EINVAL; + } if (d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode) - return "dirent points to own directory"; + le64_to_cpu(d.v->d_inum) == d.k->p.inode) { + prt_printf(err, "dirent points to own directory"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, @@ -122,9 +142,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - bch_scnmemcpy(out, d.v->d_name, - bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", + prt_printf(out, "%.*s -> %llu type %s", + bch2_dirent_name_bytes(d), + d.v->d_name, d.v->d_type != DT_SUBVOL ? 
le64_to_cpu(d.v->d_inum) : le32_to_cpu(d.v->d_child_subvol), @@ -451,7 +471,7 @@ retry: ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, name, inum, 0); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret) bch2_trans_iter_exit(&trans, &iter); @@ -470,16 +490,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) if (ret) return ret; - for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir.inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode > dir.inum) - break; - + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir.inum, 0, snapshot), + POS(dir.inum, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; } - } bch2_trans_iter_exit(trans, &iter); return ret; @@ -503,11 +520,9 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { - if (k.k->p.inode > inum.inum) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), + POS(inum.inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; @@ -541,7 +556,7 @@ retry: } bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 1bb4d80..b146693 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; -const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent (struct bkey_ops) { \ diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 6c84297..6b81f35 
100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -39,13 +39,13 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, g = BCH_MEMBER_GROUP(m) - 1; if (g >= nr_groups) { - pr_buf(err, "disk %u has invalid label %u (have %u)", + prt_printf(err, "disk %u has invalid label %u (have %u)", i, g, nr_groups); return -EINVAL; } if (BCH_GROUP_DELETED(&groups->entries[g])) { - pr_buf(err, "disk %u has deleted label %u", i, g); + prt_printf(err, "disk %u has deleted label %u", i, g); return -EINVAL; } } @@ -61,7 +61,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, len = strnlen(g->label, sizeof(g->label)); if (!len) { - pr_buf(err, "label %u empty", i); + prt_printf(err, "label %u empty", i); return -EINVAL; } } @@ -76,8 +76,9 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, for (g = sorted; g + 1 < sorted + nr_groups; g++) if (!BCH_GROUP_DELETED(g) && !group_cmp(&g[0], &g[1])) { - pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g)); - bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label))); + prt_printf(err, "duplicate label %llu.%.*s", + BCH_GROUP_PARENT(g), + (int) sizeof(g->label), g->label); goto err; } @@ -100,12 +101,12 @@ static void bch2_sb_disk_groups_to_text(struct printbuf *out, g < groups->entries + nr_groups; g++) { if (g != groups->entries) - pr_buf(out, " "); + prt_printf(out, " "); if (BCH_GROUP_DELETED(g)) - pr_buf(out, "[deleted]"); + prt_printf(out, "[deleted]"); else - pr_buf(out, "[parent %llu name %s]", + prt_printf(out, "[parent %llu name %s]", BCH_GROUP_PARENT(g), g->label); } } @@ -275,7 +276,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, groups = bch2_sb_resize_disk_groups(sb, u64s); if (!groups) - return -ENOSPC; + return -BCH_ERR_ENOSPC_disk_label_add; nr_groups = disk_groups_nr(groups); } @@ -342,12 +343,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct 
printbuf *out, - struct bch_sb_handle *sb, - unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + bch2_sb_get_disk_groups(sb); struct bch_disk_group *g; unsigned nr = 0; u16 path[32]; @@ -376,43 +375,43 @@ void bch2_disk_path_to_text(struct printbuf *out, v = path[--nr]; g = groups->entries + v; - bch_scnmemcpy(out, g->label, - strnlen(g->label, sizeof(g->label))); - + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); if (nr) - pr_buf(out, "."); + prt_printf(out, "."); } return; inval: - pr_buf(out, "invalid group %u", v); + prt_printf(out, "invalid label %u", v); } -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; - int v = -1; - int ret = 0; - - mutex_lock(&c->sb_lock); + int ret, v = -1; if (!strlen(name) || !strcmp(name, "none")) - goto write_sb; + return 0; v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) { - mutex_unlock(&c->sb_lock); + if (v < 0) return v; - } ret = bch2_sb_disk_groups_to_cpu(c); if (ret) - goto unlock; -write_sb: + return ret; + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; SET_BCH_MEMBER_GROUP(mi, v + 1); + return 0; +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + int ret; - bch2_write_super(c); -unlock: + mutex_lock(&c->sb_lock); + ret = __bch2_dev_group_set(c, ca, name) ?: + bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; @@ -448,41 +447,57 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) return -EINVAL; } -void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) { struct target t = target_decode(v); switch (t.type) { case TARGET_NULL: - 
pr_buf(out, "none"); + prt_printf(out, "none"); break; - case TARGET_DEV: { - struct bch_dev *ca; - - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && percpu_ref_tryget(&ca->io_ref)) { - char b[BDEVNAME_SIZE]; - - pr_buf(out, "/dev/%s", - bdevname(ca->disk_sb.bdev, b)); - percpu_ref_put(&ca->io_ref); - } else if (ca) { - pr_buf(out, "offline device %u", t.dev); + case TARGET_DEV: + if (c) { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + prt_printf(out, "offline device %u", t.dev); + } else { + prt_printf(out, "invalid device %u", t.dev); + } + + rcu_read_unlock(); } else { - pr_buf(out, "invalid device %u", t.dev); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_member *m = mi->members + t.dev; + + if (bch2_dev_exists(sb, mi, t.dev)) { + prt_printf(out, "Device "); + pr_uuid(out, m->uuid.b); + prt_printf(out, " (%u)", t.dev); + } else { + prt_printf(out, "Bad device %u", t.dev); + } } - - rcu_read_unlock(); break; - } case TARGET_GROUP: - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, &c->disk_sb, t.group); - mutex_unlock(&c->sb_lock); + if (c) { + mutex_lock(&c->sb_lock); + bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); + mutex_unlock(&c->sb_lock); + } else { + bch2_disk_path_to_text(out, sb, t.group); + } break; default: BUG(); diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index 3d84f23..e4470c3 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -75,14 +75,14 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *); /* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -void bch2_disk_path_to_text(struct printbuf *, struct 
bch_sb_handle *, - unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); int bch2_sb_disk_groups_to_cpu(struct bch_fs *); +int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); const char *bch2_sb_validate_disk_groups(struct bch_sb *, diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 9b45640..dfe3796 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_buf.h" #include "bset.h" #include "btree_gc.h" @@ -102,24 +103,34 @@ struct ec_bio { /* Stripes btree keys: */ -const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - if (!bkey_cmp(k.k->p, POS_MIN)) - return "stripe at pos 0"; + if (!bkey_cmp(k.k->p, POS_MIN)) { + prt_printf(err, "stripe at POS_MIN"); + return -EINVAL; + } - if (k.k->p.inode) - return "invalid stripe key"; + if (k.k->p.inode) { + prt_printf(err, "nonzero inode field"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) < sizeof(*s)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) < sizeof(*s)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*s)); + return -EINVAL; + } - if (bkey_val_bytes(k.k) < sizeof(*s) || - bkey_val_u64s(k.k) < stripe_val_u64s(s)) - return "incorrect value size"; + if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { + prt_printf(err, "incorrect value size (%zu < %u)", + bkey_val_u64s(k.k), stripe_val_u64s(s)); + return -EINVAL; + } - return 
bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -128,7 +139,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned i; - pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", s->algorithm, le16_to_cpu(s->sectors), s->nr_blocks - s->nr_redundant, @@ -137,7 +148,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, 1U << s->csum_granularity_bits); for (i = 0; i < s->nr_blocks; i++) - pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, + prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); } @@ -286,14 +297,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - char buf2[200]; + struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); bch_err_ratelimited(c, "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", (void *) _RET_IP_, i, j, v->csum_type, - want.lo, got.lo, buf2); + want.lo, got.lo, buf2.buf); + printbuf_exit(&buf2); clear_bit(i, buf->valid); break; } @@ -401,7 +413,10 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, nr_iovecs << PAGE_SHIFT); struct ec_bio *ec_bio; - ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, + ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, + nr_iovecs, + rw, + GFP_KERNEL, &c->ec_bioset), struct ec_bio, bio); @@ -409,9 +424,6 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->buf = buf; ec_bio->idx = idx; - bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); - bio_set_op_attrs(&ec_bio->bio, rw, 0); - 
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; ec_bio->bio.bi_private = cl; @@ -561,18 +573,14 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { size_t idx = iter->pos.offset; - int ret = 0; if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) - return ret; + return 0; bch2_trans_unlock(trans); - ret = -EINTR; - if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) - return ret; - - return -ENOMEM; + return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: + bch2_trans_relock(trans); } static ssize_t stripe_idx_to_delete(struct bch_fs *c) @@ -715,7 +723,7 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { @@ -724,17 +732,18 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, continue; } - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_stripe_create; break; } if (bkey_deleted(k.k)) - goto found_slot; + break; } - goto err; -found_slot: - start_pos = iter.pos; + c->ec_stripe_hint = iter.pos.offset; + + if (ret) + goto err; ret = ec_stripe_mem_alloc(trans, &iter); if (ret) @@ -743,8 +752,6 @@ found_slot: stripe->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); - - c->ec_stripe_hint = start_pos.offset; err: bch2_trans_iter_exit(trans, &iter); @@ -811,78 +818,111 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, }; } -static int ec_stripe_update_ptrs(struct bch_fs *c, - struct ec_stripe_buf *s, - struct bkey *pos) +static int ec_stripe_update_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct ec_stripe_buf *s) { - struct btree_trans 
trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_extent e; - struct bkey_buf sk; - struct bpos next_pos; - int ret = 0, dev, block; + const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bkey_i *n; + int ret, dev, block; + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + return 0; + + ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? + */ + if (!ptr_c || ptr_c->cached) + return 0; + + dev = s->key.v.ptrs[block].dev; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); + BUG_ON(!ec_ptr); - /* XXX this doesn't support the reflink btree */ + extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - bkey_start_pos(pos), - BTREE_ITER_INTENT); + return bch2_trans_update(trans, iter, n, 0); +} + +static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, + unsigned block) +{ + struct bch_fs *c = trans->c; + struct bch_extent_ptr bucket = s->key.v.ptrs[block]; + struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; + u64 bp_offset = 0; + int ret = 0; retry: - while (bch2_trans_begin(&trans), - (k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k)) && - bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { - const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ptr, *ec_ptr = NULL; - - if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_advance(&iter); - continue; + while (1) { + bch2_trans_begin(trans); + + ret = 
bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, + &bp_offset, &bp, + BTREE_ITER_CACHED); + if (ret) + break; + if (bp_offset == U64_MAX) + break; + + if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) { + ret = -EIO; + break; } - ptr_c = bkey_matches_stripe(&s->key.v, k, &block); - /* - * It doesn't generally make sense to erasure code cached ptrs: - * XXX: should we be incrementing a counter? - */ - if (!ptr_c || ptr_c->cached) { - bch2_btree_iter_advance(&iter); + k = bch2_backpointer_get_key(trans, &iter, bucket_pos, bp_offset, bp); + ret = bkey_err(k); + if (ret) + break; + if (!k.k) continue; - } - dev = s->key.v.ptrs[block].dev; + ret = ec_stripe_update_extent(trans, &iter, k, s); + bch2_trans_iter_exit(trans, &iter); + if (ret) + break; - bch2_bkey_buf_reassemble(&sk, c, k); - e = bkey_i_to_s_extent(sk.k); + bp_offset++; + } - bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); - ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); - BUG_ON(!ec_ptr); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; - extent_stripe_ptr_add(e, s, ec_ptr, block); + return ret; +} - bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); - next_pos = sk.k->k.p; +static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) +{ + struct btree_trans trans; + struct bch_stripe *v = &s->key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret = 0; - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, sk.k, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - if (!ret) - bch2_btree_iter_set_pos(&iter, next_pos); + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr_data; i++) { + ret = ec_stripe_update_bucket(&trans, s, i); if (ret) break; } - if (ret == -EINTR) - goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); return ret; } @@ -894,7 +934,6 @@ static void ec_stripe_create(struct 
ec_stripe_new *s) { struct bch_fs *c = s->c; struct open_bucket *ob; - struct bkey_i *k; struct stripe *m; struct bch_stripe *v = &s->new_stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; @@ -928,7 +967,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) BUG_ON(!s->allocated); - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) goto err; ec_generate_ec(&s->new_stripe); @@ -954,13 +993,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - for_each_keylist_key(&s->keys, k) { - ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); - if (ret) { - bch_err(c, "error creating stripe: error %i updating pointers", ret); - break; - } - } + ret = ec_stripe_update_extents(c, &s->new_stripe); + if (ret) + bch_err(c, "error creating stripe: error updating pointers: %s", + bch2_err_str(ret)); spin_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); @@ -985,8 +1021,6 @@ err: } } - bch2_keylist_free(&s->keys, s->inline_keys); - ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); @@ -1069,30 +1103,6 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } -void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, - struct bkey *k) -{ - struct ec_stripe_new *ec = ob->ec; - - if (!ec) - return; - - mutex_lock(&ec->lock); - - if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, - ARRAY_SIZE(ec->inline_keys), - BKEY_U64s)) { - BUG(); - } - - bkey_init(&ec->keys.top->k); - ec->keys.top->k.p = k->p; - ec->keys.top->k.size = k->size; - bch2_keylist_push(&ec->keys); - - mutex_unlock(&ec->lock); -} - static int unsigned_cmp(const void *_l, const void *_r) { unsigned l = *((const unsigned *) _l); @@ -1185,8 +1195,6 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) BCH_BKEY_PTRS_MAX) - 
h->redundancy; s->nr_parity = h->redundancy; - bch2_keylist_init(&s->keys, s->inline_keys); - ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize); @@ -1294,9 +1302,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(nr_have_data > h->s->nr_data); BUG_ON(nr_have_parity > h->s->nr_parity); - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - buckets.nr = 0; if (nr_have_parity < h->s->nr_parity) { ret = bch2_bucket_alloc_set(c, &buckets, @@ -1306,8 +1311,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, &nr_have_parity, &have_cache, h->copygc - ? RESERVE_MOVINGGC - : RESERVE_NONE, + ? RESERVE_movinggc + : RESERVE_none, 0, cl); @@ -1323,7 +1328,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, } if (ret) - goto err; + return ret; } buckets.nr = 0; @@ -1335,8 +1340,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, &nr_have_data, &have_cache, h->copygc - ? RESERVE_MOVINGGC - : RESERVE_NONE, + ? 
RESERVE_movinggc + : RESERVE_none, 0, cl); @@ -1351,12 +1356,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, } if (ret) - goto err; + return ret; } -err: - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; + + return 0; } /* XXX: doesn't obey target: */ @@ -1402,10 +1405,8 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, int ret; idx = get_existing_stripe(c, h); - if (idx < 0) { - bch_err(c, "failed to find an existing stripe"); - return -ENOSPC; - } + if (idx < 0) + return -BCH_ERR_ENOSPC_stripe_reuse; h->s->have_existing_stripe = true; ret = get_stripe_key(c, idx, &h->s->existing_stripe); @@ -1443,21 +1444,9 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, struct ec_stripe_head *h) { - int ret; - - ret = bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); - - if (ret) { - /* - * This means we need to wait for copygc to - * empty out buckets from existing stripes: - */ - bch_err(c, "failed to reserve stripe"); - } - - return ret; + return bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); } struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, @@ -1499,8 +1488,10 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ret = __bch2_ec_stripe_head_reserve(c, h); if (ret && needs_stripe_new) ret = __bch2_ec_stripe_head_reuse(c, h); - if (ret) + if (ret) { + bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); goto err; + } if (!h->s->allocated) { ret = new_stripe_alloc_buckets(c, h, cl); @@ -1616,7 +1607,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); - pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, h->data[i].blocks_nonempty, m->nr_blocks - m->nr_redundant, 
m->nr_redundant); @@ -1631,11 +1622,11 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - pr_buf(out, "target %u algo %u redundancy %u:\n", + prt_printf(out, "target %u algo %u redundancy %u:\n", h->target, h->algo, h->redundancy); if (h->s) - pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", + prt_printf(out, "\tpending: blocks %u+%u allocated %u\n", h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, h->s->nr_data)); @@ -1644,7 +1635,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", + prt_printf(out, "\tin flight: blocks %u+%u pin %u\n", s->nr_data, s->nr_parity, atomic_read(&s->pin)); } @@ -1676,11 +1667,14 @@ void bch2_fs_ec_exit(struct bch_fs *c) bioset_exit(&c->ec_bioset); } -int bch2_fs_ec_init(struct bch_fs *c) +void bch2_fs_ec_init_early(struct bch_fs *c) { INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); +} +int bch2_fs_ec_init(struct bch_fs *c) +{ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 78d468c..3e2b22c 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -4,9 +4,9 @@ #include "ec_types.h" #include "buckets_types.h" -#include "keylist_types.h" -const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ + 
.trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ } static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) @@ -163,9 +165,6 @@ struct ec_stripe_new { open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; - struct keylist keys; - u64 inline_keys[BKEY_U64s * 8]; - struct ec_stripe_buf new_stripe; struct ec_stripe_buf existing_stripe; }; @@ -193,8 +192,6 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, - struct bkey *); void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); @@ -221,6 +218,7 @@ void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); +void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); #endif /* _BCACHEFS_EC_H */ diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c new file mode 100644 index 0000000..cc9ce0b --- /dev/null +++ b/libbcachefs/errcode.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "errcode.h" + +#include + +static const char * const bch2_errcode_strs[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, + BCH_ERRCODES() +#undef x + NULL +}; + +#define BCH_ERR_0 0 + +static unsigned bch2_errcode_parents[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, + BCH_ERRCODES() +#undef x +}; + +const char *bch2_err_str(int err) +{ + const char *errstr; + err = abs(err); + + BUG_ON(err >= BCH_ERR_MAX); + + if (err >= BCH_ERR_START) + errstr = bch2_errcode_strs[err - BCH_ERR_START]; + else if (err) + errstr = errname(err); + else + errstr = "(No error)"; + return errstr ?: "(Invalid error)"; +} 
+ +bool __bch2_err_matches(int err, int class) +{ + err = abs(err); + class = abs(class); + + BUG_ON(err >= BCH_ERR_MAX); + BUG_ON(class >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && err != class) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return err == class; +} + +int __bch2_err_class(int err) +{ + err = -err; + BUG_ON((unsigned) err >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return -err; +} diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index f7d1291..9f29304 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -2,11 +2,96 @@ #ifndef _BCACHEFS_ERRCODE_H #define _BCACHEFS_ERRCODE_H -enum { - /* Bucket allocator: */ - OPEN_BUCKETS_EMPTY = 2048, - FREELIST_EMPTY, /* Allocator thread not keeping up */ - INSUFFICIENT_DEVICES, +#define BCH_ERRCODES() \ + x(ENOSPC, ENOSPC_disk_reservation) \ + x(ENOSPC, ENOSPC_bucket_alloc) \ + x(ENOSPC, ENOSPC_disk_label_add) \ + x(ENOSPC, ENOSPC_stripe_create) \ + x(ENOSPC, ENOSPC_stripe_reuse) \ + x(ENOSPC, ENOSPC_inode_create) \ + x(ENOSPC, ENOSPC_str_hash_create) \ + x(ENOSPC, ENOSPC_snapshot_create) \ + x(ENOSPC, ENOSPC_subvolume_create) \ + x(ENOSPC, ENOSPC_sb) \ + x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_quota) \ + x(ENOSPC, ENOSPC_sb_replicas) \ + x(ENOSPC, ENOSPC_sb_members) \ + x(0, open_buckets_empty) \ + x(0, freelist_empty) \ + x(BCH_ERR_freelist_empty, no_buckets_found) \ + x(0, insufficient_devices) \ + x(0, transaction_restart) \ + x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ + 
x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ + x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ + x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ + x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ + x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ + x(BCH_ERR_no_btree_node, no_btree_node_drop) \ + x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ + x(BCH_ERR_no_btree_node, no_btree_node_up) \ + x(BCH_ERR_no_btree_node, no_btree_node_down) \ + x(BCH_ERR_no_btree_node, no_btree_node_init) \ + x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(0, backpointer_to_overwritten_btree_node) \ + x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) \ + x(0, fsck) \ + x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_ignore) \ + x(BCH_ERR_fsck, fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ + x(0, need_snapshot_cleanup) \ + x(0, need_topology_repair) + +enum bch_errcode { + 
BCH_ERR_START = 2048, +#define x(class, err) BCH_ERR_##err, + BCH_ERRCODES() +#undef x + BCH_ERR_MAX }; +const char *bch2_err_str(int); +bool __bch2_err_matches(int, int); + +static inline bool _bch2_err_matches(int err, int class) +{ + return err && __bch2_err_matches(err, class); +} + +#define bch2_err_matches(_err, _class) \ +({ \ + BUILD_BUG_ON(!__builtin_constant_p(_class)); \ + _bch2_err_matches(_err, _class); \ +}) + +int __bch2_err_class(int); + +static inline long bch2_err_class(long err) +{ + return err < 0 ? __bch2_err_class(err) : err; +} + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 8279a9b..2fb5102 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -68,103 +68,138 @@ void bch2_io_error(struct bch_dev *ca) #include "tools-util.h" #endif -enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, - const char *fmt, ...) +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { - struct fsck_err_state *s = NULL; - va_list args; - bool fix = false, print = true, suppressing = false; - char _buf[sizeof(s->buf)], *buf = _buf; - - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - if (c->opts.errors == BCH_ON_ERROR_continue) { - bch_err(c, "fixing"); - return FSCK_ERR_FIX; - } else { - bch2_inconsistent_error(c); - return FSCK_ERR_EXIT; - } - } + struct fsck_err_state *s; - mutex_lock(&c->fsck_error_lock); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return NULL; list_for_each_entry(s, &c->fsck_errors, list) - if (s->fmt == fmt) - goto found; + if (s->fmt == fmt) { + /* + * move it to the head of the list: repeated fsck errors + * are common + */ + list_move(&s->list, &c->fsck_errors); + return s; + } s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) { if (!c->fsck_alloc_err) bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); c->fsck_alloc_err = true; - buf = _buf; - goto print; + return NULL; } 
INIT_LIST_HEAD(&s->list); s->fmt = fmt; -found: - list_move(&s->list, &c->fsck_errors); - s->nr++; - if (c->opts.ratelimit_errors && - !(flags & FSCK_NO_RATELIMIT) && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - suppressing = true; - else - print = false; + s->buf = PRINTBUF; + list_add(&s->list, &c->fsck_errors); + return s; +} + +int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) +{ + struct fsck_err_state *s = NULL; + va_list args; + bool print = true, suppressing = false, inconsistent = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + + mutex_lock(&c->fsck_error_lock); + s = fsck_err_get(c, fmt); + if (s) { + if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } + + printbuf_reset(&s->buf); + out = &s->buf; + s->nr++; } - buf = s->buf; -print: + + if (!strncmp(fmt, "bcachefs:", 9)) + prt_printf(out, bch2_log_msg(c, "")); + va_start(args, fmt); - vscnprintf(buf, sizeof(_buf), fmt, args); + prt_vprintf(out, fmt, args); va_end(args); - if (c->opts.fix_errors == FSCK_OPT_EXIT) { - bch_err(c, "%s, exiting", buf); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); + inconsistent = true; + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + } else if (c->opts.fix_errors == FSCK_OPT_EXIT) { + prt_str(out, ", exiting"); + ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { if (c->opts.fix_errors == FSCK_OPT_ASK) { - printk(KERN_ERR "%s: fix?", buf); - fix = ask_yn(); + prt_str(out, ": fix?"); + bch2_print_string_as_lines(KERN_ERR, out->buf); + 
print = false; + ret = ask_yn() + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; } else if (c->opts.fix_errors == FSCK_OPT_YES || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - if (print) - bch_err(c, "%s, fixing", buf); - fix = true; + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; } else { - if (print) - bch_err(c, "%s, not fixing", buf); - fix = false; + prt_str(out, ", not fixing"); } } else if (flags & FSCK_NEED_FSCK) { - if (print) - bch_err(c, "%s (run fsck to correct)", buf); + prt_str(out, " (run fsck to correct)"); } else { - if (print) - bch_err(c, "%s (repair unimplemented)", buf); + prt_str(out, " (repair unimplemented)"); } - if (suppressing) + if (ret == -BCH_ERR_fsck_ignore && + (c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + + if (print) + bch2_print_string_as_lines(KERN_ERR, out->buf); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) + bch_err(c, "Unable to continue, halting"); + else if (suppressing) bch_err(c, "Ratelimiting new instances of previous error"); mutex_unlock(&c->fsck_error_lock); - if (fix) { + printbuf_exit(&buf); + + if (inconsistent) + bch2_inconsistent_error(c); + + if (ret == -BCH_ERR_fsck_fix) { set_bit(BCH_FS_ERRORS_FIXED, &c->flags); - return FSCK_ERR_FIX; } else { set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); set_bit(BCH_FS_ERROR, &c->flags); - return c->opts.fix_errors == FSCK_OPT_EXIT || - !(flags & FSCK_CAN_IGNORE) - ? 
FSCK_ERR_EXIT - : FSCK_ERR_IGNORE; } + + return ret; } void bch2_flush_fsck_errs(struct bch_fs *c) @@ -175,9 +210,10 @@ void bch2_flush_fsck_errs(struct bch_fs *c) list_for_each_entry_safe(s, n, &c->fsck_errors, list) { if (s->ratelimited) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf.buf); list_del(&s->list); + printbuf_exit(&s->buf); kfree(s); } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 9869382..bbf9b6d 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent_on(cond, c, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_fs_inconsistent(c, __VA_ARGS__); \ @@ -59,26 +59,38 @@ do { \ #define bch2_dev_inconsistent_on(cond, ca, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_dev_inconsistent(ca, __VA_ARGS__); \ _ret; \ }) +/* + * When a transaction update discovers or is causing a fs inconsistency, it's + * helpful to also dump the pending updates: + */ +#define bch2_trans_inconsistent(trans, ...) \ +({ \ + bch_err(trans->c, __VA_ARGS__); \ + bch2_inconsistent_error(trans->c); \ + bch2_dump_trans_updates(trans); \ +}) + +#define bch2_trans_inconsistent_on(cond, trans, ...) 
\ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_trans_inconsistent(trans, __VA_ARGS__); \ + _ret; \ +}) + /* * Fsck errors: inconsistency errors we detect at mount time, and should ideally * be able to repair: */ -enum { - BCH_FSCK_OK = 0, - BCH_FSCK_ERRORS_NOT_FIXED = 1, - BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, - BCH_FSCK_REPAIR_IMPOSSIBLE = 3, - BCH_FSCK_UNKNOWN_VERSION = 4, -}; - enum fsck_err_opts { FSCK_OPT_EXIT, FSCK_OPT_YES, @@ -86,19 +98,12 @@ enum fsck_err_opts { FSCK_OPT_ASK, }; -enum fsck_err_ret { - FSCK_ERR_IGNORE = 0, - FSCK_ERR_FIX = 1, - FSCK_ERR_EXIT = 2, - FSCK_ERR_START_TOPOLOGY_REPAIR = 3, -}; - struct fsck_err_state { struct list_head list; const char *fmt; u64 nr; bool ratelimited; - char buf[512]; + struct printbuf buf; }; #define FSCK_CAN_FIX (1 << 0) @@ -107,21 +112,20 @@ struct fsck_err_state { #define FSCK_NO_RATELIMIT (1 << 3) __printf(3, 4) __cold -enum fsck_err_ret bch2_fsck_err(struct bch_fs *, - unsigned, const char *, ...); +int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err(c, _flags, msg, ...) \ ({ \ - int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ + int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ \ - if (_fix == FSCK_ERR_EXIT) { \ - bch_err(c, "Unable to continue, halting"); \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) { \ + ret = _ret; \ goto fsck_err; \ } \ \ - _fix; \ + _ret == -BCH_ERR_fsck_fix; \ }) /* These macros return true if error should be fixed: */ @@ -129,7 +133,7 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ #define __fsck_err_on(cond, c, _flags, ...) \ - ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) #define need_fsck_err_on(cond, c, ...) 
\ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) @@ -164,7 +168,7 @@ do { \ #define bch2_fs_fatal_err_on(cond, c, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_fs_fatal_error(c, __VA_ARGS__); \ diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 58b2c96..2fd5d96 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - unsigned ret = 0; + unsigned ret = 0, lru = 0; bkey_extent_entry_for_each(ptrs, entry) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: + /* Might also be updating LRU btree */ + if (entry->ptr.cached) + lru++; + + fallthrough; case BCH_EXTENT_ENTRY_stripe_ptr: ret++; } } - return ret; + /* + * Updating keys in the alloc btree may also update keys in the + * freespace or discard btrees: + */ + return lru + ret * 2; } static int count_iters_for_insert(struct btree_trans *trans, diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 44c584e..9e2a4ed 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -26,6 +26,8 @@ #include +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -156,12 +158,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) - return "value too big"; + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), 
BCH_REPLICAS_MAX); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -170,35 +176,45 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) - return "value too small"; + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { + prt_printf(err, "value too small (%zu <= %zu)", + bkey_val_bytes(k.k), sizeof(*bp.v)); + return -EINVAL; + } - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + prt_printf(err, "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + return -EINVAL; + } if (c->sb.version < bcachefs_metadata_version_snapshot && - bp.v->min_key.snapshot) - return "invalid min_key.snapshot"; + bp.v->min_key.snapshot) { + prt_printf(err, "invalid min_key.snapshot (%u != 0)", + bp.v->min_key.snapshot); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) + struct bkey_s_c k) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx written %u min_key %s", + prt_printf(out, "seq %llx written %u min_key %s", le64_to_cpu(bp.v->seq), le16_to_cpu(bp.v->sectors_written), BTREE_PTR_RANGE_UPDATED(bp.v) ? 
"R " : ""); bch2_bpos_to_text(out, bp.v->min_key); - pr_buf(out, " "); + prt_printf(out, " "); bch2_bkey_ptrs_to_text(out, c, k); } @@ -220,17 +236,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, /* KEY_TYPE_extent: */ -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); @@ -287,7 +292,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= lp.crc.uncompressed_size) { /* can use left extent's crc entry */ - } else if (lp.crc.live_size <= rp.crc.offset ) { + } else if (lp.crc.live_size <= rp.crc.offset) { /* can use right extent's crc entry */ } else { /* check if checksums can be merged: */ @@ -305,8 +310,20 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc.uncompressed_size + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return false; } @@ -334,7 +351,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (crc_l.offset + crc_l.live_size + crc_r.live_size <= crc_l.uncompressed_size) { /* 
can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset ) { + } else if (crc_l.live_size <= crc_r.offset) { /* can use right extent's crc entry */ crc_r.offset -= crc_l.live_size; bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, @@ -363,17 +380,24 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(*r.v)); + return -EINVAL; + } - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { + prt_printf(err, "invalid nr_replicas (%u)", + r.v->nr_replicas); + return -EINVAL; + } - return NULL; + return 0; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -381,7 +405,7 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - pr_buf(out, "generation %u replicas %u", + prt_printf(out, "generation %u replicas %u", le32_to_cpu(r.v->generation), r.v->nr_replicas); } @@ -666,37 +690,6 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) return durability; } -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } - - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } -} - void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); @@ -800,8 +793,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) /* * Returns pointer to the next entry after the one being dropped: */ -union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; @@ -873,6 +866,14 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); + + if (ptr) + __bch2_bkey_drop_ptr(k, ptr); +} + const struct bch_extent_ptr * bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) { @@ -917,6 +918,44 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, return false; } +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) +{ + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == 
p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; +} + +bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, + struct bkey_s_c k2) +{ + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. * @@ -949,27 +988,37 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bkey_extent_entry_for_each(ptrs, entry) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? bch_dev_bkey_exists(c, ptr->dev) : NULL; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, + b, offset, ptr->gen, + ptr->cached ? 
" cached" : ""); + + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, @@ -979,11 +1028,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; - pr_buf(out, "ec: idx %llu block %u", + prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; } @@ -991,69 +1040,88 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; struct bch_dev *ca; - if (!bch2_dev_exists2(c, ptr->dev)) - return "pointer to invalid device"; + if (!bch2_dev_exists2(c, ptr->dev)) { + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -EINVAL; + } ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; + if (ptr != ptr2 && ptr->dev == ptr2->dev) { + prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); + return 
-EINVAL; + } - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; + if (bucket >= ca->mi.nbuckets) { + prt_printf(err, "pointer past last bucket (%llu > %llu)", + bucket, ca->mi.nbuckets); + return -EINVAL; + } - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { + prt_printf(err, "pointer before first bucket (%llu < %u)", + bucket, ca->mi.first_bucket); + return -EINVAL; + } - return NULL; + if (bucket_offset + size_ondisk > ca->mi.bucket_size) { + prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); + return -EINVAL; + } + + return 0; } -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_devs_list devs; const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; - const char *reason; unsigned nonce = UINT_MAX; - unsigned i; + unsigned nr_ptrs = 0; + int ret; - if (k.k->type == KEY_TYPE_btree_ptr || - k.k->type == KEY_TYPE_btree_ptr_v2) + if (bkey_is_btree_ptr(k.k)) size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { + prt_printf(err, "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + return -EINVAL; + } - if (k.k->type == KEY_TYPE_btree_ptr && - !extent_entry_is_ptr(entry)) - return "has non 
ptr field"; + if (bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry)) { + prt_printf(err, "has non ptr field"); + return -EINVAL; + } switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - reason = extent_ptr_invalid(c, k, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, + false, err); + if (ret) + return ret; + nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: @@ -1061,22 +1129,30 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); if (crc.offset + crc.live_size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; + crc.uncompressed_size) { + prt_printf(err, "checksum offset + key size > uncompressed size"); + return -EINVAL; + } size_ondisk = crc.compressed_size; - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; + if (!bch2_checksum_type_valid(c, crc.csum_type)) { + prt_printf(err, "invalid checksum type"); + return -EINVAL; + } - if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) - return "invalid compression type"; + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { + prt_printf(err, "invalid compression type"); + return -EINVAL; + } if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; + else if (nonce != crc.offset + crc.nonce) { + prt_printf(err, "incorrect nonce"); + return -EINVAL; + } } break; case BCH_EXTENT_ENTRY_stripe_ptr: @@ -1084,13 +1160,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) } } - devs = bch2_bkey_devs(k); - bubble_sort(devs.devs, devs.nr, u8_cmp); - for (i = 0; i + 1 < devs.nr; i++) - if (devs.devs[i] == devs.devs[i + 1]) - return "multiple ptrs to same device"; + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { + 
prt_str(err, "too many ptrs"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_ptr_swab(struct bkey_s k) diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 9c25672..3c17b81 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -367,13 +367,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -381,6 +380,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ @@ -388,25 +389,28 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } /* KEY_TYPE_extent: */ -const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent (struct bkey_ops) { \ - .key_invalid = 
bch2_extent_invalid, \ - .val_to_text = bch2_extent_to_text, \ + .key_invalid = bch2_bkey_ptrs_invalid, \ + .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } /* KEY_TYPE_reservation: */ -const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -414,6 +418,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ } /* Extent checksum entries: */ @@ -571,15 +577,10 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, - unsigned, unsigned); - void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, - struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -601,16 +602,20 @@ do { \ } while (0) void bch2_bkey_drop_device(struct bkey_s, unsigned); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); bool 
bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); +bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); +bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index d543480..1f2e1fc 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -204,12 +204,19 @@ int bch2_link_trans(struct btree_trans *trans, goto err; inode_u->bi_ctime = now; - bch2_inode_nlink_inc(inode_u); + ret = bch2_inode_nlink_inc(inode_u); + if (ret) + return ret; ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; + if (bch2_reinherit_attrs(inode_u, dir_u)) { + ret = -EXDEV; + goto err; + } + dir_u->bi_mtime = dir_u->bi_ctime = now; dir_hash = bch2_hash_info_init(c, dir_u); @@ -297,7 +304,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; } else { - bch2_inode_nlink_dec(inode_u); + bch2_inode_nlink_dec(trans, inode_u); } if (inode_u->bi_dir == dirent_iter.pos.inode && @@ -462,7 +469,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (mode == BCH_RENAME_OVERWRITE) - bch2_inode_nlink_dec(dst_inode_u); + bch2_inode_nlink_dec(trans, dst_inode_u); src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; @@ -480,11 +487,11 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: (src_dir.inum != dst_dir.inum ? 
bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) - : 0 ) ?: + : 0) ?: bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: (dst_inum.inum ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) - : 0 ); + : 0); err: bch2_trans_iter_exit(trans, &dst_inode_iter); bch2_trans_iter_exit(trans, &src_inode_iter); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 1d0871f..706180b 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -35,6 +35,15 @@ #include #include +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + static inline struct address_space *faults_disabled_mapping(void) { return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); @@ -142,7 +151,7 @@ static void bch2_quota_reservation_put(struct bch_fs *c, static int bch2_quota_reservation_add(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *res, - unsigned sectors, + u64 sectors, bool check_enospc) { int ret; @@ -223,7 +232,10 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); - BUG_ON((s64) inode->v.i_blocks + sectors < 0); + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA @@ -397,7 +409,7 @@ retry: offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -422,22 +434,20 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, { pgoff_t index = start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct pagevec pvec; + struct folio_batch fbatch; 
+ unsigned i, j; if (end <= start) return; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - do { - unsigned nr_pages, i, j; - - nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, - &index, end_index); - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - u64 pg_start = page->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; unsigned pg_offset = max(start, pg_start) - pg_start; unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; struct bch_page_state *s; @@ -446,8 +456,8 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, BUG_ON(pg_offset >= PAGE_SECTORS); BUG_ON(pg_offset + pg_len > PAGE_SECTORS); - lock_page(page); - s = bch2_page_state(page); + folio_lock(folio); + s = bch2_page_state(&folio->page); if (s) { spin_lock(&s->lock); @@ -456,10 +466,11 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, spin_unlock(&s->lock); } - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); - } while (index <= end_index); + folio_batch_release(&fbatch); + cond_resched(); + } } static void mark_pagecache_reserved(struct bch_inode_info *inode, @@ -468,23 +479,21 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; pgoff_t index = start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct pagevec pvec; + struct folio_batch fbatch; s64 i_sectors_delta = 0; + unsigned i, j; if (end <= start) return; - pagevec_init(&pvec); - - do { - unsigned nr_pages, i, j; + folio_batch_init(&fbatch); - nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, - &index, end_index); - for (i = 0; 
i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - u64 pg_start = page->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; unsigned pg_offset = max(start, pg_start) - pg_start; unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; struct bch_page_state *s; @@ -493,8 +502,8 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, BUG_ON(pg_offset >= PAGE_SECTORS); BUG_ON(pg_offset + pg_len > PAGE_SECTORS); - lock_page(page); - s = bch2_page_state(page); + folio_lock(folio); + s = bch2_page_state(&folio->page); if (s) { spin_lock(&s->lock); @@ -513,10 +522,11 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, spin_unlock(&s->lock); } - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); - } while (index <= end_index); + folio_batch_release(&fbatch); + cond_resched(); + } i_sectors_acct(c, inode, NULL, i_sectors_delta); } @@ -596,7 +606,7 @@ static void bch2_page_reservation_put(struct bch_fs *c, static int bch2_page_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, struct bch2_page_reservation *res, - unsigned offset, unsigned len, bool check_enospc) + unsigned offset, unsigned len) { struct bch_page_state *s = bch2_page_state_create(page, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; @@ -616,19 +626,14 @@ static int bch2_page_reservation_get(struct bch_fs *c, } if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, - disk_sectors, - !check_enospc - ? 
BCH_DISK_RESERVATION_NOFAIL - : 0); + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); if (unlikely(ret)) return ret; } if (quota_sectors) { ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, - check_enospc); + quota_sectors, true); if (unlikely(ret)) { struct disk_reservation tmp = { .sectors = disk_sectors @@ -812,7 +817,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) } } - if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { + if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; @@ -830,47 +835,22 @@ out: return ret; } -void bch2_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - if (offset || length < PAGE_SIZE) + if (offset || length < folio_size(folio)) return; - bch2_clear_page_bits(page); + bch2_clear_page_bits(&folio->page); } -int bch2_releasepage(struct page *page, gfp_t gfp_mask) +bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) { - if (PageDirty(page)) - return 0; - - bch2_clear_page_bits(page); - return 1; -} - -#ifdef CONFIG_MIGRATION -int bch2_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - int ret; - - EBUG_ON(!PageLocked(page)); - EBUG_ON(!PageLocked(newpage)); - - ret = migrate_page_move_mapping(mapping, newpage, page, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; + if (folio_test_dirty(folio) || folio_test_writeback(folio)) + return false; - if (PagePrivate(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - return MIGRATEPAGE_SUCCESS; + bch2_clear_page_bits(&folio->page); + return true; } -#endif /* readpage(s): */ @@ -1034,10 +1014,9 @@ retry: * read_extent -> io_time_reset may cause a transaction 
restart * without returning an error, we need to check for that here: */ - if (!bch2_trans_relock(trans)) { - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) break; - } bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -1090,7 +1069,7 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (ret) { @@ -1127,12 +1106,12 @@ void bch2_readahead(struct readahead_control *ractl) readpages_iter.idx, BIO_MAX_VECS); struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_NOFS, &c->bio_read), opts); readpages_iter.idx++; - bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); @@ -1164,20 +1143,6 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bch2_trans_exit(&trans); } -int bch2_readpage(struct file *file, struct page *page) -{ - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); - struct bch_read_bio *rbio; - - rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); - rbio->bio.bi_end_io = bch2_readpages_end_io; - - __bchfs_readpage(c, rbio, inode_inum(inode), page); - return 0; -} - static void bch2_read_single_page_end_io(struct bio *bio) { complete(bio->bi_private); @@ -1192,7 +1157,7 @@ static int bch2_read_single_page(struct page *page, int ret; DECLARE_COMPLETION_ONSTACK(done); - rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), io_opts(c, &inode->ei_inode)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ 
-1210,6 +1175,16 @@ static int bch2_read_single_page(struct page *page, return 0; } +int bch2_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + int ret; + + ret = bch2_read_single_page(page, page->mapping); + folio_unlock(folio); + return bch2_err_class(ret); +} + /* writepages: */ struct bch_writepage_state { @@ -1243,8 +1218,6 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio_vec *bvec; unsigned i; - up(&io->op.c->io_in_flight); - if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); @@ -1278,7 +1251,7 @@ static void bch2_writepage_io_done(struct closure *cl) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - WARN_ON(io->op.i_sectors_delta > 0); + WARN_ON_ONCE(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up @@ -1307,8 +1280,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) { struct bch_writepage_io *io = w->io; - down(&io->op.c->io_in_flight); - w->io = NULL; closure_call(&io->op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); @@ -1327,7 +1298,9 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, { struct bch_write_op *op; - w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_NOFS, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); @@ -1464,8 +1437,8 @@ do_io: sectors << 9, offset << 9)); /* Check for writing past i_size: */ - WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; @@ -1493,27 +1466,13 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc if 
(w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); - return ret; -} - -int bch2_writepage(struct page *page, struct writeback_control *wbc) -{ - struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); - int ret; - - ret = __bch2_writepage(page, wbc, &w); - if (w.io) - bch2_writepage_do_io(&w); - - return ret; + return bch2_err_class(ret); } /* buffered writes: */ int bch2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); @@ -1533,7 +1492,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(&inode->ei_pagecache_lock); - page = grab_cache_page_write_begin(mapping, index, flags); + page = grab_cache_page_write_begin(mapping, index); if (!page) goto err_unlock; @@ -1563,11 +1522,10 @@ out: if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); if (ret) - goto out; + goto err; } - ret = bch2_page_reservation_get(c, inode, page, res, - offset, len, true); + ret = bch2_page_reservation_get(c, inode, page, res, offset, len); if (ret) { if (!PageUptodate(page)) { /* @@ -1592,7 +1550,7 @@ err_unlock: bch2_pagecache_add_put(&inode->ei_pagecache_lock); kfree(res); *fsdata = NULL; - return ret; + return bch2_err_class(ret); } int bch2_write_end(struct file *file, struct address_space *mapping, @@ -1664,7 +1622,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, bch2_page_reservation_init(c, inode, &res); for (i = 0; i < nr_pages; i++) { - pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); + pages[i] = grab_cache_page_write_begin(mapping, index + i); if (!pages[i]) { nr_pages = i; if (!i) { @@ -1709,7 +1667,7 @@ static int __bch2_buffered_write(struct bch_inode_info 
*inode, } ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len, true); + pg_offset, pg_len); if (ret) goto out; @@ -1726,7 +1684,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, unsigned pg_len = min_t(unsigned, len - copied, PAGE_SIZE - pg_offset); unsigned pg_copied = copy_page_from_iter_atomic(page, - pg_offset, pg_len,iter); + pg_offset, pg_len, iter); if (!pg_copied) break; @@ -1808,11 +1766,11 @@ again: * to check that the address is actually valid, when atomic * usercopies are used, below. */ - if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { bytes = min_t(unsigned long, iov_iter_count(iter), PAGE_SIZE - offset); - if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ret = -EFAULT; break; } @@ -1870,7 +1828,7 @@ static void bch2_dio_read_complete(struct closure *cl) { struct dio_read *dio = container_of(cl, struct dio_read, cl); - dio->req->ki_complete(dio->req, dio->ret, 0); + dio->req->ki_complete(dio->req, dio->ret); bio_check_or_release(&dio->rbio.bio, dio->should_dirty); } @@ -1918,8 +1876,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); iter->count -= shorten; - bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_VECS), + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -1953,8 +1913,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { - bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_VECS), + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, &c->bio_read); bio->bi_end_io = bch2_direct_IO_read_split_endio; start: @@ -2012,7 
+1974,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos, iocb->ki_pos + count - 1); if (ret < 0) - return ret; + goto out; file_accessed(file); @@ -2027,8 +1989,8 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = generic_file_read_iter(iocb, iter); bch2_pagecache_add_put(&inode->ei_pagecache_lock); } - - return ret; +out: + return bch2_err_class(ret); } /* O_DIRECT writes */ @@ -2070,7 +2032,7 @@ retry: offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (err == -EINTR) + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -2096,12 +2058,10 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (dio->loop) goto loop; - down(&c->io_in_flight); - while (1) { iter_count = dio->iter.count; - if (kthread) + if (kthread && dio->mm) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -2111,7 +2071,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) dropped_locks = fdm_dropped_locks(); current->faults_disabled_mapping = NULL; - if (kthread) + if (kthread && dio->mm) kthread_unuse_mm(dio->mm); /* @@ -2177,8 +2137,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct iovec *iov = dio->inline_vecs; if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), - GFP_KERNEL); + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); if (unlikely(!iov)) { dio->sync = sync = true; goto do_io; @@ -2222,13 +2182,12 @@ loop: if (!dio->iter.count) break; - bio_reset(bio); + bio_reset(bio, NULL, REQ_OP_WRITE); reinit_completion(&dio->done); } ret = dio->op.error ?: ((long) dio->written << 9); err: - up(&c->io_in_flight); bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_quota_reservation_put(c, inode, &dio->quota_res); @@ -2243,8 +2202,11 @@ err: /* inode->i_dio_count is our ref on inode and thus bch_fs */ 
inode_dio_end(&inode->v); + if (ret < 0) + ret = bch2_err_class(ret); + if (!sync) { - req->ki_complete(req, ret, 0); + req->ki_complete(req, ret); ret = -EIOCBQUEUED; } return ret; @@ -2303,10 +2265,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) locked = false; } - bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_is_bvec(iter) - ? 0 - : iov_iter_npages(iter, BIO_MAX_VECS), + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); init_completion(&dio->done); @@ -2349,8 +2311,10 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) struct bch_inode_info *inode = file_bch_inode(file); ssize_t ret; - if (iocb->ki_flags & IOCB_DIRECT) - return bch2_direct_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(&inode->v); @@ -2377,8 +2341,8 @@ unlock: if (ret > 0) ret = generic_write_sync(iocb, ret); - - return ret; +out: + return bch2_err_class(ret); } /* fsync: */ @@ -2412,7 +2376,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret2 = sync_inode_metadata(&inode->v, 1); ret3 = bch2_flush_inode(c, inode_inum(inode)); - return ret ?: ret2 ?: ret3; + return bch2_err_class(ret ?: ret2 ?: ret3); } /* truncate: */ @@ -2446,7 +2410,7 @@ retry: start = iter.pos; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -2703,9 +2667,11 @@ int bch2_truncate(struct user_namespace *mnt_userns, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); - WARN_ON(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal)); - + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + 
!bch2_journal_error(&c->journal), c, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); if (unlikely(ret)) goto err; @@ -2716,7 +2682,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - return ret; + return bch2_err_class(ret); } /* fallocate: */ @@ -2747,7 +2713,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len truncate_pagecache_range(&inode->v, offset, end - 1); - if (block_start < block_end ) { + if (block_start < block_end) { s64 i_sectors_delta = 0; ret = bch2_fpunch(c, inode_inum(inode), @@ -2834,7 +2800,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bch2_trans_copy_iter(&dst, &src); bch2_trans_copy_iter(&del, &src); - while (ret == 0 || ret == -EINTR) { + while (ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; @@ -2902,13 +2869,7 @@ reassemble: next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; - if (copy.k->k.size == k.k->size) { - /* - * If we're moving the entire extent, we can skip - * running triggers: - */ - trigger_flags |= BTREE_TRIGGER_NORUN; - } else { + if (copy.k->k.size != k.k->size) { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); @@ -3042,14 +3003,14 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bkey_err: bch2_quota_reservation_put(c, inode, &quota_res); bch2_disk_reservation_put(c, &disk_res); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); - if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -3100,7 +3061,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, * so that the VFS cache i_size is consistent with the btree i_size: */ if (ret && - !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) return ret; if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) @@ -3128,13 +3089,17 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return -EROFS; inode_lock(&inode->v); inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); + ret = file_modified(file); + if (ret) + goto err; + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -3145,13 +3110,61 @@ long bch2_fallocate_dispatch(struct file *file, int mode, ret = 
bchfs_fcollapse_finsert(inode, offset, len, false); else ret = -EOPNOTSUPP; - - +err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); percpu_ref_put(&c->writes); - return ret; + return bch2_err_class(ret); +} + +static int quota_reserve_range(struct bch_inode_info *inode, + struct quota_res *res, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot; + u64 sectors = end - start; + u64 pos = start; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, pos, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && + !(ret = bkey_err(k))) { + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } + bch2_btree_iter_advance(&iter); + } + pos = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + + if (ret) + return ret; + + return bch2_quota_reservation_add(c, inode, res, sectors, true); } loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, @@ -3161,6 +3174,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *src = file_bch_inode(file_src); struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; + struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; u64 aligned_len; loff_t ret = 0; @@ -3181,8 +3195,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, 
src, dst); - file_update_time(file_dst); - inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -3199,6 +3211,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; + ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9, + (pos_dst + aligned_len) >> 9); + if (ret) + goto err; + + file_update_time(file_dst); + mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); @@ -3215,8 +3234,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, */ ret = min((u64) ret << 9, (u64) len); - /* XXX get a quota reservation */ - i_sectors_acct(c, dst, NULL, i_sectors_delta); + i_sectors_acct(c, dst, &quota_res, i_sectors_delta); spin_lock(&dst->v.i_lock); if (pos_dst + ret > dst->v.i_size) @@ -3227,9 +3245,10 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, IS_SYNC(file_inode(file_dst))) ret = bch2_flush_inode(c, inode_inum(dst)); err: + bch2_quota_reservation_put(c, dst, &quota_res); bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); - return ret; + return bch2_err_class(ret); } /* fseek: */ @@ -3251,36 +3270,40 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, loff_t end_offset) { - struct address_space *mapping = vinode->i_mapping; - struct page *page; + struct folio_batch fbatch; pgoff_t start_index = start_offset >> PAGE_SHIFT; pgoff_t end_index = end_offset >> PAGE_SHIFT; pgoff_t index = start_index; + unsigned i; loff_t ret; int offset; - while (index <= end_index) { - if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { - lock_page(page); + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - offset = page_data_offset(page, - page->index == start_index + folio_lock(folio); + + offset = page_data_offset(&folio->page, + folio->index == start_index ? 
start_offset & (PAGE_SIZE - 1) : 0); if (offset >= 0) { - ret = clamp(((loff_t) page->index << PAGE_SHIFT) + + ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + offset, start_offset, end_offset); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_batch_release(&fbatch); return ret; } - unlock_page(page); - put_page(page); - } else { - break; + folio_unlock(folio); } + folio_batch_release(&fbatch); + cond_resched(); } return end_offset; @@ -3322,7 +3345,7 @@ retry: } bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -3437,7 +3460,7 @@ retry: } bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -3452,18 +3475,26 @@ err: loff_t bch2_llseek(struct file *file, loff_t offset, int whence) { + loff_t ret; + switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: - return generic_file_llseek(file, offset, whence); + ret = generic_file_llseek(file, offset, whence); + break; case SEEK_DATA: - return bch2_seek_data(file, offset); + ret = bch2_seek_data(file, offset); + break; case SEEK_HOLE: - return bch2_seek_hole(file, offset); + ret = bch2_seek_hole(file, offset); + break; + default: + ret = -EINVAL; + break; } - return -EINVAL; + return bch2_err_class(ret); } void bch2_fs_fsio_exit(struct bch_fs *c) diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index b24efea..a883529 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -15,14 +15,13 @@ int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, loff_t, unsigned); -int bch2_writepage(struct page *, struct writeback_control *); -int bch2_readpage(struct file *, struct page *); +int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct 
readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page **, void **); + unsigned, struct page **, void **); int bch2_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); @@ -42,10 +41,8 @@ loff_t bch2_llseek(struct file *, loff_t, int); vm_fault_t bch2_page_fault(struct vm_fault *); vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidatepage(struct page *, unsigned int, unsigned int); -int bch2_releasepage(struct page *, gfp_t); -int bch2_migrate_page(struct address_space *, struct page *, - struct page *, enum migrate_mode); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); void bch2_fs_fsio_exit(struct bch_fs *); int bch2_fs_fsio_init(struct bch_fs *); diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 9f329a6..2bb6808 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -26,6 +26,9 @@ struct flags_set { unsigned flags; unsigned projid; + + bool set_projinherit; + bool projinherit; }; static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -50,6 +53,11 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; + if (s->set_projinherit) { + bi->bi_fields_set &= ~(1 << Inode_opt_project); + bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); + } + bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; @@ -107,6 +115,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, struct fsxattr fa = { 0 }; fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; return copy_to_user(arg, &fa, sizeof(fa)); @@ -138,6 +150,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if 
(copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; + s.set_projinherit = true; + s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; + fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); if (fa.fsx_xflags) return -EOPNOTSUPP; @@ -455,51 +471,67 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; switch (cmd) { case FS_IOC_GETFLAGS: - return bch2_ioc_getflags(inode, (int __user *) arg); + ret = bch2_ioc_getflags(inode, (int __user *) arg); + break; case FS_IOC_SETFLAGS: - return bch2_ioc_setflags(c, file, inode, (int __user *) arg); + ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); + break; case FS_IOC_FSGETXATTR: - return bch2_ioc_fsgetxattr(inode, (void __user *) arg); + ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); + break; + case FS_IOC_FSSETXATTR: - return bch2_ioc_fssetxattr(c, file, inode, - (void __user *) arg); + ret = bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + break; case BCHFS_IOC_REINHERIT_ATTRS: - return bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); + ret = bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); + break; case FS_IOC_GETVERSION: - return -ENOTTY; + ret = -ENOTTY; + break; + case FS_IOC_SETVERSION: - return -ENOTTY; + ret = -ENOTTY; + break; case FS_IOC_GOINGDOWN: - return bch2_ioc_goingdown(c, (u32 __user *) arg); + ret = bch2_ioc_goingdown(c, (u32 __user *) arg); + break; case BCH_IOCTL_SUBVOLUME_CREATE: { struct bch_ioctl_subvolume i; - if (copy_from_user(&i, (void __user *) arg, sizeof(i))) - return -EFAULT; - return bch2_ioctl_subvolume_create(c, file, i); + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? 
-EFAULT + : bch2_ioctl_subvolume_create(c, file, i); + break; } case BCH_IOCTL_SUBVOLUME_DESTROY: { struct bch_ioctl_subvolume i; - if (copy_from_user(&i, (void __user *) arg, sizeof(i))) - return -EFAULT; - return bch2_ioctl_subvolume_destroy(c, file, i); + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_destroy(c, file, i); + break; } default: - return bch2_fs_ioctl(c, cmd, (void __user *) arg); + ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); + break; } + + return bch2_err_class(ret); } #ifdef CONFIG_COMPAT diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 91fa189..186faa5 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -8,6 +8,7 @@ #include "buckets.h" #include "chardev.h" #include "dirent.h" +#include "errcode.h" #include "extents.h" #include "fs.h" #include "fs-common.h" @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -152,7 +154,7 @@ retry: bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -322,7 +324,7 @@ retry: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); err_before_quota: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; goto err_trans; } @@ -417,7 +419,7 @@ static int bch2_mknod(struct user_namespace *mnt_userns, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); d_instantiate(dentry, &inode->v); return 0; @@ -442,7 +444,7 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_link_trans(&trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, @@ -491,7 +493,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); 
bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, inode_inum(dir), &dir_u, @@ -526,8 +528,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns, inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); inode_lock(&inode->v); ret = page_symlink(&inode->v, symname, strlen(symname) + 1); @@ -613,7 +615,7 @@ static int bch2_rename2(struct user_namespace *mnt_userns, goto err; } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_rename_trans(&trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, @@ -753,7 +755,7 @@ retry: btree_err: bch2_trans_iter_exit(&trans, &inode_iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; @@ -767,7 +769,7 @@ err_trans: err: mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_err_class(ret); } static int bch2_getattr(struct user_namespace *mnt_userns, @@ -836,7 +838,7 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); d_mark_tmpfile(dentry, &inode->v); d_instantiate(dentry, &inode->v); @@ -933,9 +935,9 @@ retry: bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(ei->v.i_ino, start, snapshot), 0); - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k)) && - bkey_cmp(iter.pos, end) < 0) { + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek_upto(&iter, end)).k && + !(ret = bkey_err(k))) { enum btree_id data_btree = BTREE_ID_extents; if (!bkey_extent_is_data(k.k) && @@ -984,7 +986,7 @@ retry: start = 
iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret && have_extent) @@ -1111,18 +1113,17 @@ static const struct inode_operations bch_special_inode_operations = { }; static const struct address_space_operations bch_address_space_operations = { - .writepage = bch2_writepage, - .readpage = bch2_readpage, + .read_folio = bch2_read_folio, .writepages = bch2_writepages, .readahead = bch2_readahead, - .set_page_dirty = __set_page_dirty_nobuffers, + .dirty_folio = filemap_dirty_folio, .write_begin = bch2_write_begin, .write_end = bch2_write_end, - .invalidatepage = bch2_invalidatepage, - .releasepage = bch2_releasepage, + .invalidate_folio = bch2_invalidate_folio, + .release_folio = bch2_release_folio, .direct_IO = noop_direct_IO, #ifdef CONFIG_MIGRATION - .migratepage = bch2_migrate_page, + .migrate_folio = filemap_migrate_folio, #endif .error_remove_page = generic_error_remove_page, }; @@ -1335,7 +1336,7 @@ found: memcpy(name, d.v->d_name, name_len); name[name_len] = '\0'; err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter1); @@ -1452,7 +1453,7 @@ static int bch2_vfs_write_inode(struct inode *vinode, ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_err_class(ret); } static void bch2_evict_inode(struct inode *vinode) @@ -1476,7 +1477,7 @@ static void bch2_evict_inode(struct inode *vinode) } void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) + snapshot_id_list *s) { struct super_block *sb = c->vfs_sb; struct inode *inode; @@ -1556,6 +1557,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) static int bch2_sync_fs(struct super_block *sb, int wait) { struct bch_fs *c = sb->s_fs_info; + int ret; if (c->opts.journal_flush_disabled) return 0; @@ -1565,7 +1567,8 @@ static int bch2_sync_fs(struct 
super_block *sb, int wait) return 0; } - return bch2_journal_flush(&c->journal); + ret = bch2_journal_flush(&c->journal); + return bch2_err_class(ret); } static struct bch_fs *bch2_path_to_fs(const char *path) @@ -1621,7 +1624,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) ret = bch2_parse_mount_opts(c, &opts, data); if (ret) - return ret; + goto err; if (opts.read_only != c->opts.read_only) { down_write(&c->state_lock); @@ -1635,7 +1638,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (ret) { bch_err(c, "error going rw: %i", ret); up_write(&c->state_lock); - return -EINVAL; + ret = -EINVAL; + goto err; } sb->s_flags &= ~SB_RDONLY; @@ -1648,8 +1652,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; - - return ret; +err: + return bch2_err_class(ret); } static int bch2_show_devname(struct seq_file *seq, struct dentry *root) @@ -1674,7 +1678,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; enum bch_opt_id i; - char buf[512]; + struct printbuf buf = PRINTBUF; + int ret = 0; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; @@ -1686,13 +1691,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - bch2_opt_to_text(&PBUF(buf), c, opt, v, + printbuf_reset(&buf); + bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); seq_putc(seq, ','); - seq_puts(seq, buf); + seq_puts(seq, buf.buf); } - return 0; + if (buf.allocation_failure) + ret = -ENOMEM; + printbuf_exit(&buf); + return ret; } static void bch2_put_super(struct super_block *sb) @@ -1837,7 +1846,7 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); c->vfs_sb = sb; - strlcpy(sb->s_id, 
c->name, sizeof(sb->s_id)); + strscpy(sb->s_id, c->name, sizeof(sb->s_id)); ret = super_setup_bdi(sb); if (ret) @@ -1865,10 +1874,9 @@ got_sb: sb->s_shrink.seeks = 0; vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); - if (IS_ERR(vinode)) { - bch_err(c, "error mounting: error getting root inode %i", - (int) PTR_ERR(vinode)); - ret = PTR_ERR(vinode); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) { + bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); goto err_put_super; } @@ -1909,8 +1917,7 @@ MODULE_ALIAS_FS("bcachefs"); void bch2_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); - if (bch2_inode_cache) - kmem_cache_destroy(bch2_inode_cache); + kmem_cache_destroy(bch2_inode_cache); } int __init bch2_vfs_init(void) diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index b2211ec..9f4b57e 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -191,7 +191,7 @@ int bch2_setattr_nonsize(struct user_namespace *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); -void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); +void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -199,7 +199,7 @@ int bch2_vfs_init(void); #else static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) {} + snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index ced4d67..ca95d85 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" +#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -18,6 +19,10 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +/* + * XXX: this is handling transaction restarts without returning + * -BCH_ERR_transaction_restart_nested, this 
is not how we do things anymore: + */ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u32 snapshot) { @@ -135,9 +140,9 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: - if (ret && ret != -EINTR) - bch_err(trans->c, "error %i fetching inode %llu", - ret, inode_nr); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(trans->c, "error fetching inode %llu: %s", + inode_nr, bch2_err_str(ret)); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -163,9 +168,9 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - if (ret && ret != -EINTR) - bch_err(trans->c, "error %i fetching inode %llu:%u", - ret, inode_nr, *snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(trans->c, "error fetching inode %llu:%u: %s", + inode_nr, *snapshot, bch2_err_str(ret)); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -219,35 +224,39 @@ static int write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - int ret = __bch2_trans_do(trans, NULL, NULL, + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, __write_inode(trans, inode, snapshot)); if (ret) - bch_err(trans->c, "error in fsck: error %i updating inode", ret); + bch_err(trans->c, "error in fsck: error updating inode: %s", + bch2_err_str(ret)); return ret; } static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) { + struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; int ret; - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: 
- bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); + do { + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + } while (ret == -BCH_ERR_transaction_restart_nested); if (ret) goto err; retry: @@ -262,7 +271,7 @@ retry: goto err; if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(trans->c, + bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); ret = -EIO; @@ -272,11 +281,8 @@ retry: bch2_inode_unpack(k, &inode_u); /* Subvolume root? */ - if (inode_u.bi_subvol) { - ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); - if (ret) - goto err; - } + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; @@ -287,10 +293,10 @@ retry: BTREE_INSERT_NOFAIL); err: bch2_trans_iter_exit(trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - return ret; + return ret ?: -BCH_ERR_transaction_restart_nested; } static int __remove_dirent(struct btree_trans *trans, struct bpos pos) @@ -303,15 +309,19 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ret = lookup_first_inode(trans, pos.inode, &dir_inode); if (ret) - return ret; + goto err; dir_hash_info = bch2_hash_info_init(c, &dir_inode); bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, 0); + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 
bch2_trans_iter_exit(trans, &iter); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); return ret; } @@ -346,8 +356,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, goto create_lostfound; } - if (ret && ret != -EINTR) - bch_err(c, "error looking up lost+found: %i", ret); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); if (ret) return ret; @@ -369,8 +379,8 @@ create_lostfound: lostfound, &lostfound_str, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); - if (ret && ret != -EINTR) - bch_err(c, "error creating lost+found: %i", ret); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); return ret; } @@ -429,13 +439,13 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { - int ret = __bch2_trans_do(trans, NULL, NULL, + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, __reattach_inode(trans, inode, inode_snapshot)); if (ret) { - bch_err(trans->c, "error %i reattaching inode %llu", - ret, inode->bi_inum); + bch_err(trans->c, "error reattaching inode %llu: %s", + inode->bi_inum, bch2_err_str(ret)); return ret; } @@ -466,19 +476,82 @@ out: return ret; } -static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) +struct snapshots_seen_entry { + u32 id; + u32 equiv; +}; + +struct snapshots_seen { + struct bpos pos; + DARRAY(struct snapshots_seen_entry) ids; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + darray_exit(&s->ids); +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + struct 
snapshots_seen_entry *i, n = { id, id }; + int ret; + + darray_for_each(s->ids, i) { + if (n.equiv < i->equiv) + break; + + if (i->equiv == n.equiv) { + bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); + return -EINVAL; + } + } + + ret = darray_insert_item(&s->ids, i - s->ids.data, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + enum btree_id btree_id, struct bpos pos) { - pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + struct snapshots_seen_entry *i, n = { + .id = pos.snapshot, + .equiv = bch2_snapshot_equiv(c, pos.snapshot), + }; + int ret = 0; if (bkey_cmp(s->pos, pos)) - s->nr = 0; + s->ids.nr = 0; + + pos.snapshot = n.equiv; s->pos = pos; - /* Might get called multiple times due to lock restarts */ - if (s->nr && s->d[s->nr - 1] == pos.snapshot) - return 0; + darray_for_each(s->ids, i) + if (i->equiv == n.equiv) { + if (fsck_err_on(i->id != n.id, c, + "snapshot deletion did not run correctly:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", + bch2_btree_ids[btree_id], + pos.inode, pos.offset, + i->id, n.id, n.equiv)) + return -BCH_ERR_need_snapshot_cleanup; + + return 0; + } - return snapshots_seen_add(c, s, pos.snapshot); + ret = darray_push(&s->ids, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); +fsck_err: + return ret; } /** @@ -491,15 +564,15 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see u32 id, u32 ancestor) { ssize_t i; + u32 top = seen->ids.nr ? 
seen->ids.data[seen->ids.nr - 1].equiv : 0; BUG_ON(id > ancestor); - - id = snapshot_t(c, id)->equiv; - ancestor = snapshot_t(c, ancestor)->equiv; + BUG_ON(!bch2_snapshot_is_equiv(c, id)); + BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); - BUG_ON(seen->pos.snapshot != ancestor); + BUG_ON(ancestor != seen->pos.snapshot); + BUG_ON(ancestor != top); if (id == ancestor) return true; @@ -507,11 +580,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see if (!bch2_snapshot_is_ancestor(c, id, ancestor)) return false; - for (i = seen->nr - 2; - i >= 0 && seen->d[i] >= id; + for (i = seen->ids.nr - 2; + i >= 0 && seen->ids.data[i].equiv >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && - bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && + bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) return false; return true; @@ -536,27 +609,27 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, : bch2_snapshot_is_ancestor(c, src, dst); } -#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ + (_i)->snapshot <= (_snapshot); _i++) \ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) +struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; +}; + struct inode_walker { bool first_this_inode; u64 cur_inum; - size_t nr; - size_t size; - struct inode_walker_entry { - struct bch_inode_unpacked inode; - u32 snapshot; - u64 count; - } *d; + DARRAY(struct inode_walker_entry) inodes; }; static void inode_walker_exit(struct inode_walker *w) { - 
kfree(w->d); - w->d = NULL; + darray_exit(&w->inodes); } static struct inode_walker inode_walker_init(void) @@ -564,43 +637,17 @@ static struct inode_walker inode_walker_init(void) return (struct inode_walker) { 0, }; } -static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) -{ - if (w->nr == w->size) { - size_t new_size = max_t(size_t, 8UL, w->size * 2); - void *d = krealloc(w->d, new_size * sizeof(w->d[0]), - GFP_KERNEL); - if (!d) { - bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", - new_size); - return -ENOMEM; - } - - w->d = d; - w->size = new_size; - } - - return 0; -} - static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { struct bch_inode_unpacked u; - int ret; - - ret = inode_walker_realloc(c, w); - if (ret) - return ret; BUG_ON(bch2_inode_unpack(inode, &u)); - w->d[w->nr++] = (struct inode_walker_entry) { + return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, - }; - - return 0; + .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + })); } static int __walk_inode(struct btree_trans *trans, @@ -609,17 +656,18 @@ static int __walk_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned i, ancestor_pos; + u32 restart_count = trans->restart_count; + unsigned i; int ret; - pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); if (pos.inode == w->cur_inum) { w->first_this_inode = false; goto lookup_snapshot; } - w->nr = 0; + w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -636,27 +684,33 @@ static int __walk_inode(struct btree_trans *trans, w->cur_inum = pos.inode; w->first_this_inode = true; + + if (trans_was_restarted(trans, restart_count)) + return -BCH_ERR_transaction_restart_nested; + lookup_snapshot: - for (i = 0; 
i < w->nr; i++) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + for (i = 0; i < w->inodes.nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) goto found; return INT_MAX; found: - BUG_ON(pos.snapshot > w->d[i].snapshot); + BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + + if (pos.snapshot != w->inodes.data[i].snapshot) { + struct inode_walker_entry e = w->inodes.data[i]; + + e.snapshot = pos.snapshot; + e.count = 0; - if (pos.snapshot != w->d[i].snapshot) { - ancestor_pos = i; + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", + pos.inode, pos.snapshot, w->inodes.data[i].snapshot); - while (i && w->d[i - 1].snapshot > pos.snapshot) + while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) --i; - ret = inode_walker_realloc(c, w); + ret = darray_insert_item(&w->inodes, i, e); if (ret) return ret; - - array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); - w->d[i].snapshot = pos.snapshot; - w->d[i].count = 0; } return i; @@ -672,21 +726,23 @@ static int __get_visible_inodes(struct btree_trans *trans, struct bkey_s_c k; int ret; - w->nr = 0; + w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (k.k->p.offset != inum) break; - if (!bkey_is_inode(k.k)) + if (!ref_visible(c, s, s->pos.snapshot, equiv)) continue; - if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { + if (bkey_is_inode(k.k)) add_inode(c, w, k); - if (k.k->p.snapshot >= s->pos.snapshot) - break; - } + + if (equiv >= s->pos.snapshot) + break; } bch2_trans_iter_exit(trans, &iter); @@ -698,15 +754,16 @@ static int check_key_has_snapshot(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; - if 
(mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; fsck_err: + printbuf_exit(&buf); return ret; } @@ -715,9 +772,6 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { - bch_err(trans->c, "hash_redo_key() not implemented yet"); - return -EINVAL; -#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -735,8 +789,14 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); -#endif + bch2_hash_set_snapshot(trans, desc, hash_info, + (subvol_inum) { 0, k.k->p.inode }, + k.k->p.snapshot, tmp, + BCH_HASH_SET_MUST_CREATE, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } static int hash_check_key(struct btree_trans *trans, @@ -746,7 +806,7 @@ static int hash_check_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; - char buf[200]; + struct printbuf buf = PRINTBUF; struct bkey_s_c k; u64 hash; int ret = 0; @@ -762,16 +822,18 @@ static int hash_check_key(struct btree_trans *trans, if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash), - BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, desc.btree_id, + POS(hash_k.k->p.inode, hash), + BTREE_ITER_SLOTS, k, ret) { if (!bkey_cmp(k.k->p, hash_k.k->p)) break; if (fsck_err_on(k.k->type == desc.key_type && !desc.cmp_bkey(k, 
hash_k), c, "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - hash_k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + buf.buf))) { ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; break; } @@ -780,49 +842,49 @@ static int hash_check_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); goto bad_hash; } - } +out: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " "hashed to %llu\n%s", - desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, - (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) - return 0; - - ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - if (ret) { - bch_err(c, "hash_redo_key err %i", ret); - return ret; + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); + if (ret) { + bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + return ret; + } + ret = -BCH_ERR_transaction_restart_nested; } - return -EINTR; fsck_err: - return ret; + goto out; } static int check_inode(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_inode_unpacked *prev, + struct snapshots_seen *s, bool full) { struct bch_fs *c = trans->c; - struct bkey_s_c k; struct bch_inode_unpacked u; bool do_update = false; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); + ret = check_key_has_snapshot(trans, iter, k); + if (ret < 0) + goto err; if (ret) - return ret; + return 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) - return ret < 0 ? 
ret : 0; + goto err; /* * if snapshot id isn't a leaf node, skip it - deletion in @@ -861,8 +923,9 @@ static int check_inode(struct btree_trans *trans, bch2_fs_lazy_rw(c); ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); - if (ret) - bch_err(c, "error in fsck: error %i while deleting inode", ret); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error in fsck: error while deleting inode: %s", + bch2_err_str(ret)); return ret; } @@ -885,7 +948,8 @@ static int check_inode(struct btree_trans *trans, POS(u.bi_inum, U64_MAX), 0, NULL); if (ret) { - bch_err(c, "error in fsck: error %i truncating inode", ret); + bch_err(c, "error in fsck: error truncating inode: %s", + bch2_err_str(ret)); return ret; } @@ -910,8 +974,8 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { - bch_err(c, "error in fsck: error %i recounting inode sectors", - (int) sectors); + bch_err(c, "error in fsck: error recounting inode sectors: %s", + bch2_err_str(sectors)); return sectors; } @@ -928,12 +992,15 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = write_inode(trans, &u, iter->pos.snapshot); + ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) - bch_err(c, "error in fsck: error %i " - "updating inode", ret); + bch_err(c, "error in fsck: error updating inode: %s", + bch2_err_str(ret)); } +err: fsck_err: + if (ret) + bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); return ret; } @@ -943,86 +1010,23 @@ static int check_inodes(struct bch_fs *c, bool full) struct btree_trans trans; struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = __bch2_trans_do(&trans, NULL, NULL, - 
BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_inode(&trans, &iter, &prev, full)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - -static int check_subvol(struct btree_trans *trans, - struct btree_iter *iter) -{ + struct snapshots_seen s; struct bkey_s_c k; - struct bkey_s_c_subvolume subvol; - int ret; - - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol = bkey_s_c_to_subvolume(k); - - if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret && ret != -EINTR) - bch_err(trans->c, "error deleting subvolume %llu: %i", - iter->pos.offset, ret); - if (ret) - return ret; - } - - return 0; -} - -noinline_for_stack -static int check_subvols(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter iter; int ret; + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, - POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); - - do { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_inode(&trans, &iter, k, &prev, &s, full)); bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + if (ret) + bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); return ret; } @@ -1114,7 +1118,7 @@ static int inode_backpointer_exists(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); ret = bkey_err(d.s_c); if (ret) - return ret; + return 
ret == -ENOENT ? 0 : ret; ret = dirent_points_to_inode(d, inode); bch2_trans_iter_exit(trans, &iter); @@ -1125,15 +1129,15 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; - int ret = 0, ret2 = 0; + u32 restart_count = trans->restart_count; + int ret = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_sectors == i->count) continue; - count2 = lockrestart_do(trans, - bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); + count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); if (i->count != count2) { bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", @@ -1146,53 +1150,55 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", w->cur_inum, i->snapshot, - i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) - continue; - - i->inode.bi_sectors = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); - if (ret) - break; - ret2 = -EINTR; + i->inode.bi_sectors, i->count)) { + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + } } fsck_err: - return ret ?: ret2; + if (ret) + bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; } static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct inode_walker *inode, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c k; struct inode_walker_entry *i; - char buf[200]; + struct printbuf buf = PRINTBUF; + struct bpos equiv; int ret = 0; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; 
- ret = check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + if (ret) { + ret = ret < 0 ? ret : 0; + goto out; + } + + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = snapshots_seen_update(c, s, k.k->p); + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) - return ret; + goto err; if (k.k->type == KEY_TYPE_whiteout) - return 0; + goto out; if (inode->cur_inum != k.k->p.inode) { ret = check_i_sectors(trans, inode); if (ret) - return ret; + goto err; } + + BUG_ON(!iter->path->should_be_locked); #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { char buf1[200]; @@ -1201,59 +1207,95 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); bch2_bkey_val_to_text(&PBUF(buf2), c, k); - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { + ret = fix_overlapping_extent(trans, k, prev.k->k.p) + ?: -BCH_ERR_transaction_restart_nested; + goto out; + } } #endif - ret = __walk_inode(trans, inode, k.k->p); + ret = __walk_inode(trans, inode, equiv); if (ret < 0) - return ret; + goto err; if (fsck_err_on(ret == INT_MAX, c, "extent in missing inode:\n %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } - if (ret == INT_MAX) - return 0; + if (ret == INT_MAX) { + ret = 0; + goto out; + } - i = inode->d + ret; + i = inode->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && !S_ISLNK(i->inode.bi_mode), c, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return 
bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } + + /* + * Check inodes in reverse order, from oldest snapshots to newest, so + * that we emit the fewest number of whiteouts necessary: + */ + for (i = inode->inodes.data + inode->inodes.nr - 1; + i >= inode->inodes.data; + --i) { + if (i->snapshot > equiv.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + continue; + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + bch2_btree_iter_set_snapshot(&iter2, i->snapshot); + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_btree_delete_at(trans, &iter2, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter2); + if (ret) + goto err; - if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { - for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, - k.k->p.snapshot), - POS(k.k->p.inode, U64_MAX), - 0, NULL) ?: -EINTR; + if (i->snapshot != equiv.snapshot) { + ret = snapshots_seen_add(c, s, i->snapshot); + if (ret) + goto err; } } } if (bkey_extent_is_allocation(k.k)) 
- for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) + for_each_visible_inode(c, s, inode, equiv.snapshot, i) i->count += k.k->size; #if 0 bch2_bkey_buf_reassemble(&prev, c, k); #endif +out: +err: fsck_err: + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); return ret; } @@ -1268,6 +1310,7 @@ static int check_extents(struct bch_fs *c) struct snapshots_seen s; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret = 0; #if 0 @@ -1280,21 +1323,12 @@ static int check_extents(struct bch_fs *c) bch_verbose(c, "checking extents"); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent(&trans, &iter, &w, &s)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_extent(&trans, &iter, k, &w, &s)); #if 0 bch2_bkey_buf_exit(&prev, c); #endif @@ -1302,6 +1336,8 @@ static int check_extents(struct bch_fs *c) bch2_trans_exit(&trans); snapshots_seen_exit(&s); + if (ret) + bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); return ret; } @@ -1309,10 +1345,11 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; - int ret = 0, ret2 = 0; + u32 restart_count = trans->restart_count; + int ret = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_nlink == i->count) continue; @@ -1335,11 +1372,14 @@ static int 
check_subdir_count(struct btree_trans *trans, struct inode_walker *w) ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; - ret2 = -EINTR; } } fsck_err: - return ret ?: ret2; + if (ret) + bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; } static int check_dirent_target(struct btree_trans *trans, @@ -1351,7 +1391,7 @@ static int check_dirent_target(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_i_dirent *n; bool backpointer_exists = true; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (!target->bi_dir && @@ -1377,15 +1417,13 @@ static int check_dirent_target(struct btree_trans *trans, "directory %llu with multiple links", target->bi_inum)) { ret = __remove_dirent(trans, d.k->p); - if (ret) - goto err; - return 0; + goto out; } if (fsck_err_on(backpointer_exists && !target->bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - target->bi_inum)) { + "inode %llu type %s has multiple links but i_nlink 0", + target->bi_inum, bch2_d_types[d.v->d_type])) { target->bi_nlink++; target->bi_flags &= ~BCH_INODE_UNLINKED; @@ -1416,18 +1454,19 @@ static int check_dirent_target(struct btree_trans *trans, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), - (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) - return ret; + goto err; d = dirent_i_to_s_c(n); } @@ -1441,94 +1480,110 @@ static int check_dirent_target(struct btree_trans *trans, n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = 
PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, d.s_c); n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) - return ret; + goto err; d = dirent_i_to_s_c(n); } +out: err: fsck_err: + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); return ret; } static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_hash_info *hash_info, struct inode_walker *dir, struct inode_walker *target, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c k; struct bkey_s_c_dirent d; struct inode_walker_entry *i; - char buf[200]; - int ret; - - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; + struct printbuf buf = PRINTBUF; + struct bpos equiv; + int ret = 0; ret = check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + if (ret) { + ret = ret < 0 ? 
ret : 0; + goto out; + } - ret = snapshots_seen_update(c, s, k.k->p); + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) - return ret; + goto err; if (k.k->type == KEY_TYPE_whiteout) - return 0; + goto out; if (dir->cur_inum != k.k->p.inode) { ret = check_subdir_count(trans, dir); if (ret) - return ret; + goto err; } - ret = __walk_inode(trans, dir, k.k->p); + BUG_ON(!iter->path->should_be_locked); + + ret = __walk_inode(trans, dir, equiv); if (ret < 0) - return ret; + goto err; if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } - if (ret == INT_MAX) - return 0; + if (ret == INT_MAX) { + ret = 0; + goto out; + } - i = dir->d + ret; + i = dir->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %s:\n%s", bch2_d_type_str(inode_d_type(&i->inode)), - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, 0); + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) - return ret; - if (ret) /* dirent has been deleted */ - return 0; + goto err; + if (ret) { + /* dirent has been deleted */ + ret = 0; + goto out; + } if (k.k->type != KEY_TYPE_dirent) - return 0; + goto out; d = bkey_s_c_to_dirent(k); @@ -1541,24 +1596,27 @@ static int check_dirent(struct btree_trans *trans, struct 
btree_iter *iter, ret = __subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && ret != -ENOENT) - return ret; + goto err; if (fsck_err_on(ret, c, "dirent points to missing subvolume %llu", - le64_to_cpu(d.v->d_child_subvol))) - return __remove_dirent(trans, d.k->p); + le64_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); + goto err; + } ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && ret != -ENOENT) - return ret; + goto err; if (fsck_err_on(ret, c, "subvolume %u points to missing subvolume root %llu", target_subvol, target_inum)) { bch_err(c, "repair not implemented yet"); - return -EINVAL; + ret = -EINVAL; + goto err; } if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, @@ -1568,40 +1626,48 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, subvol_root.bi_subvol = target_subvol; ret = __write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } else { ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) - return ret; + goto err; - if (fsck_err_on(!target->nr, c, - "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { + if (fsck_err_on(!target->inodes.nr, c, + "dirent points to missing inode: (equiv %u)\n%s", + equiv.snapshot, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { ret = __remove_dirent(trans, d.k->p); if (ret) - return ret; + goto err; } - for (i = target->d; i < target->d + target->nr; i++) { + darray_for_each(target->inodes, i) { ret = check_dirent_target(trans, iter, d, &i->inode, i->snapshot); if (ret) - return ret; + goto err; } } if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) i->count++; +out: +err: 
fsck_err: + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); return ret; } @@ -1618,6 +1684,7 @@ static int check_dirents(struct bch_fs *c) struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret = 0; bch_verbose(c, "checking dirents"); @@ -1625,46 +1692,32 @@ static int check_dirents(struct bch_fs *c) snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_dirent(&trans, &iter, &hash_info, - &dir, &target, &s)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); bch2_trans_exit(&trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); + + if (ret) + bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); return ret; } static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_hash_info *hash_info, struct inode_walker *inode) { struct bch_fs *c = trans->c; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - ret = check_key_has_snapshot(trans, iter, k); if (ret) return ret; @@ -1684,10 +1737,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = 0; if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, 
&inode->d[0].inode); + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); return ret; } @@ -1701,30 +1756,25 @@ static int check_xattrs(struct bch_fs *c) struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret = 0; bch_verbose(c, "checking xattrs"); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_xattr(&trans, &iter, &hash_info, - &inode)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_xattr(&trans, &iter, k, &hash_info, &inode)); bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); return ret; } @@ -1751,12 +1801,12 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = __bch2_trans_do(trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); if (ret) { - bch_err(c, "error writing root subvol: %i", ret); + bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); goto err; } @@ -1775,7 +1825,7 @@ static int check_root_trans(struct btree_trans *trans) ret = __write_inode(trans, 
&root_inode, snapshot); if (ret) - bch_err(c, "error writing root inode: %i", ret); + bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); } err: fsck_err: @@ -1794,21 +1844,18 @@ static int check_root(struct bch_fs *c) check_root_trans(&trans)); } -struct pathbuf { - size_t nr; - size_t size; - - struct pathbuf_entry { - u64 inum; - u32 snapshot; - } *entries; +struct pathbuf_entry { + u64 inum; + u32 snapshot; }; -static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +typedef DARRAY(struct pathbuf_entry) pathbuf; + +static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { struct pathbuf_entry *i; - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; @@ -1816,29 +1863,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, struct pathbuf *p, +static int path_down(struct bch_fs *c, pathbuf *p, u64 inum, u32 snapshot) { - if (p->nr == p->size) { - size_t new_size = max_t(size_t, 256UL, p->size * 2); - void *n = krealloc(p->entries, - new_size * sizeof(p->entries[0]), - GFP_KERNEL); - if (!n) { - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - new_size); - return -ENOMEM; - } - - p->entries = n; - p->size = new_size; - }; - - p->entries[p->nr++] = (struct pathbuf_entry) { + int ret = darray_push(p, ((struct pathbuf_entry) { .inum = inum, .snapshot = snapshot, - }; - return 0; + })); + + if (ret) + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + p->size); + return ret; } /* @@ -1847,14 +1883,14 @@ static int path_down(struct bch_fs *c, struct pathbuf *p, * XXX: we should also be verifying that inodes are in the right subvolumes */ static int check_path(struct btree_trans *trans, - struct pathbuf *p, + pathbuf *p, struct bch_inode_unpacked *inode, u32 snapshot) { struct bch_fs *c = trans->c; int ret = 0; - snapshot = snapshot_t(c, snapshot)->equiv; 
+ snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; while (!(inode->bi_inum == BCACHEFS_ROOT_INO && @@ -1921,14 +1957,14 @@ static int check_path(struct btree_trans *trans, /* XXX print path */ bch_err(c, "directory structure loop"); - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); if (!fsck_err(c, "directory structure loop")) return 0; - ret = __bch2_trans_do(trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, remove_backpointer(trans, inode)); @@ -1942,7 +1978,7 @@ static int check_path(struct btree_trans *trans, } fsck_err: if (ret) - bch_err(c, "%s: err %i", __func__, ret); + bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); return ret; } @@ -1958,7 +1994,7 @@ static int check_directory_structure(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; - struct pathbuf path = { 0, 0, NULL }; + pathbuf path = { 0, }; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -1986,9 +2022,7 @@ static int check_directory_structure(struct bch_fs *c) } bch2_trans_iter_exit(&trans, &iter); - BUG_ON(ret == -EINTR); - - kfree(path.entries); + darray_exit(&path); bch2_trans_exit(&trans); return ret; @@ -2010,7 +2044,8 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, { if (t->nr == t->size) { size_t new_size = max_t(size_t, 128UL, t->size * 2); - void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); + void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); + if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); @@ -2139,7 +2174,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = snapshots_seen_update(c, &s, k.k->p); + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); if (ret) 
break; @@ -2151,7 +2186,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links d.v->d_type != DT_SUBVOL) inc_link(c, &s, links, range_start, range_end, le64_to_cpu(d.v->d_inum), - d.k->p.snapshot); + bch2_snapshot_equiv(c, d.k->p.snapshot)); break; } } @@ -2165,6 +2200,47 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links return ret; } +static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct nlink_table *links, + size_t *idx, u64 range_end) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct nlink *link = &links->d[*idx]; + int ret = 0; + + if (k.k->p.offset >= range_end) + return 1; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (S_ISDIR(le16_to_cpu(u.bi_mode))) + return 0; + + if (!u.bi_nlink) + return 0; + + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { + BUG_ON(*idx == links->nr); + link = &links->d[++*idx]; + } + + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], + bch2_inode_nlink_get(&u), link->count)) { + bch2_inode_nlink_set(&u, link->count); + ret = __write_inode(trans, &u, k.k->p.snapshot); + } +fsck_err: + return ret; +} + noinline_for_stack static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, @@ -2173,56 +2249,25 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bch_inode_unpacked u; - struct nlink *link = links->d; + size_t idx = 0; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->p.offset >= 
range_end) - break; - - if (!bkey_is_inode(k.k)) - continue; - - BUG_ON(bch2_inode_unpack(k, &u)); - - if (S_ISDIR(le16_to_cpu(u.bi_mode))) - continue; + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); - if (!u.bi_nlink) - continue; - - while ((cmp_int(link->inum, k.k->p.offset) ?: - cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { - link++; - BUG_ON(link >= links->d + links->nr); - } - - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, - "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", - u.bi_inum, mode_to_type(u.bi_mode), - bch2_inode_nlink_get(&u), link->count)) { - bch2_inode_nlink_set(&u, link->count); - - ret = write_inode(&trans, &u, k.k->p.snapshot); - if (ret) - bch_err(c, "error in fsck: error %i updating inode", ret); - } - } -fsck_err: - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - if (ret) + if (ret < 0) { bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + return ret; + } - return ret; + return 0; } noinline_for_stack @@ -2262,21 +2307,13 @@ static int check_nlinks(struct bch_fs *c) return ret; } -static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { - struct bkey_s_c k; struct bkey_s_c_reflink_p p; struct bkey_i_reflink_p *u; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - if (k.k->type != KEY_TYPE_reflink_p) return 0; @@ -2312,20 +2349,11 @@ static int fix_reflink_p(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - 
BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->type == KEY_TYPE_reflink_p) { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter)); - if (ret) - break; - } - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter, k)); bch2_trans_exit(&trans); return ret; @@ -2337,9 +2365,12 @@ static int fix_reflink_p(struct bch_fs *c) */ int bch2_fsck_full(struct bch_fs *c) { - return bch2_fs_snapshots_check(c) ?: + int ret; +again: + ret = bch2_fs_check_snapshots(c) ?: + bch2_fs_check_subvols(c) ?: + bch2_delete_dead_snapshots(c) ?: check_inodes(c, true) ?: - check_subvols(c) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: @@ -2347,9 +2378,19 @@ int bch2_fsck_full(struct bch_fs *c) check_directory_structure(c) ?: check_nlinks(c) ?: fix_reflink_p(c); + + if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + goto again; + } + + return ret; } int bch2_fsck_walk_inodes_only(struct bch_fs *c) { - return check_inodes(c, false); + return bch2_fs_check_snapshots(c) ?: + bch2_fs_check_subvols(c) ?: + bch2_delete_dead_snapshots(c) ?: + check_inodes(c, false); } diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 78e2db6..1a0d260 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -60,11 +60,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -void bch2_inode_pack(struct bch_fs *c, - struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - struct bkey_i_inode_v2 *k = &packed->inode; + struct bkey_i_inode_v3 *k = &packed->inode; u8 *out = 
k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; @@ -72,13 +71,17 @@ void bch2_inode_pack(struct bch_fs *c, unsigned bytes; int ret; - bkey_inode_v2_init(&packed->inode.k_i); + bkey_inode_v3_init(&packed->inode.k_i); packed->inode.k.p.offset = inode->bi_inum; packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); packed->inode.v.bi_hash_seed = inode->bi_hash_seed; packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); + packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); + packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); + SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); + SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); + #define x(_name, _bits) \ nr_fields++; \ @@ -99,7 +102,7 @@ void bch2_inode_pack(struct bch_fs *c, *out++ = 0; \ } - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x BUG_ON(out > end); @@ -110,7 +113,7 @@ void bch2_inode_pack(struct bch_fs *c, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODEv2_NR_FIELDS(&k->v, nr_fields); + SET_INODEv3_NR_FIELDS(&k->v, nr_fields); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; @@ -120,16 +123,25 @@ void bch2_inode_pack(struct bch_fs *c, BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_sectors != inode->bi_sectors); + BUG_ON(unpacked.bi_size != inode->bi_size); + BUG_ON(unpacked.bi_version != inode->bi_version); BUG_ON(unpacked.bi_mode != inode->bi_mode); #define x(_name, _bits) if (unpacked._name != inode->_name) \ panic("unpacked %llu should be %llu", \ (u64) unpacked._name, (u64) inode->_name); - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x } } +void bch2_inode_pack(struct 
bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bch2_inode_pack_inlined(packed, inode); +} + static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, struct bch_inode_unpacked *unpacked) { @@ -141,9 +153,9 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, #define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ - memset(&unpacked->_name, 0, \ - sizeof(*unpacked) - \ - offsetof(struct bch_inode_unpacked, _name)); \ + unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ + memset((void *) unpacked + offset, 0, \ + sizeof(*unpacked) - offset); \ return 0; \ } \ \ @@ -157,7 +169,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, unpacked->_name = field[1]; \ in += ret; - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v2() #undef x /* XXX: signal if there were more fields than expected? */ @@ -196,15 +208,66 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, return -1; \ fieldnr++; - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v2() #undef x /* XXX: signal if there were more fields than expected? 
*/ return 0; } -int bch2_inode_unpack(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) +static int bch2_inode_unpack_v3(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); + unpacked->bi_size = le64_to_cpu(inode.v->bi_size); + unpacked->bi_version = le64_to_cpu(inode.v->bi_version); + unpacked->bi_mode = INODEv3_MODE(inode.v); + +#define x(_name, _bits) \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS_v3() +#undef x + + /* XXX: signal if there were more fields than expected? 
*/ + return 0; +} + +static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) { switch (k.k->type) { case KEY_TYPE_inode: { @@ -243,6 +306,14 @@ int bch2_inode_unpack(struct bkey_s_c k, } } +int bch2_inode_unpack(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + if (likely(k.k->type == KEY_TYPE_inode_v3)) + return bch2_inode_unpack_v3(k, unpacked); + return bch2_inode_unpack_slowpath(k, unpacked); +} + int bch2_inode_peek(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -288,124 +359,192 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack(trans->c, inode_p, inode); + bch2_inode_pack_inlined(inode_p, inode); inode_p->inode.k.p.snapshot = iter->snapshot; return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); } -const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) +struct bkey_s_c bch2_inode_to_v3(struct btree_trans *trans, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bch_inode_unpacked u; + struct bkey_inode_buf *inode_p; + int ret; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return bkey_s_c_err(PTR_ERR(inode_p)); + + ret = bch2_inode_unpack(k, &u); + if (ret) + return bkey_s_c_err(ret); - if (k.k->p.inode) - return "nonzero k.p.inode"; + bch2_inode_pack(inode_p, &u); + return bkey_i_to_s_c(&inode_p->inode.k_i); +} - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - return "incorrect value size"; +static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) +{ + struct bch_inode_unpacked unpacked; - if (k.k->p.offset < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; + if (k.k->p.inode) { + prt_printf(err, "nonzero k.p.inode"); + return -EINVAL; + } - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; + if 
(k.k->p.offset < BLOCKDEV_INODE_MAX) { + prt_printf(err, "fs inode in blockdev range"); + return -EINVAL; + } - if (bch2_inode_unpack(k, &unpacked)) - return "invalid variable length fields"; + if (bch2_inode_unpack(k, &unpacked)) { + prt_printf(err, "invalid variable length fields"); + return -EINVAL; + } - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) - return "invalid data checksum type"; + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { + prt_printf(err, "invalid data checksum type (%u >= %u", + unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); + return -EINVAL; + } - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) - return "invalid data checksum type"; + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { + prt_printf(err, "invalid data checksum type (%u >= %u)", + unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); + return -EINVAL; + } if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) - return "flagged as unlinked but bi_nlink != 0"; + unpacked.bi_nlink != 0) { + prt_printf(err, "flagged as unlinked but bi_nlink != 0"); + return -EINVAL; + } - if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) - return "subvolume root but not a directory"; + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { + prt_printf(err, "subvolume root but not a directory"); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - struct bch_inode_unpacked unpacked; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - if (k.k->p.inode) - return "nonzero k.p.inode"; + if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*inode.v)); + return -EINVAL; + } - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - 
return "incorrect value size"; + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -EINVAL; + } - if (k.k->p.offset < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; + return __bch2_inode_invalid(k, err); +} - if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; +int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - if (bch2_inode_unpack(k, &unpacked)) - return "invalid variable length fields"; + if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*inode.v)); + return -EINVAL; + } - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) - return "invalid data checksum type"; + if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -EINVAL; + } - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) - return "invalid data checksum type"; + return __bch2_inode_invalid(k, err); +} - if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) - return "flagged as unlinked but bi_nlink != 0"; +int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + + if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*inode.v)); + return -EINVAL; + } + + if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { + prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", + INODEv3_FIELDS_START(inode.v), + INODEv3_FIELDS_START_INITIAL, + bkey_val_u64s(inode.k)); + 
return -EINVAL; + } - if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) - return "subvolume root but not a directory"; + if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -EINVAL; + } - return NULL; + return __bch2_inode_invalid(k, err); } -static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +static void __bch2_inode_unpacked_to_text(struct printbuf *out, + struct bch_inode_unpacked *inode) { - pr_buf(out, "mode %o flags %x journal_seq %llu", + prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", inode->bi_mode, inode->bi_flags, - inode->bi_journal_seq); + inode->bi_journal_seq, + inode->bi_size, + inode->bi_sectors, + inode->bi_version); #define x(_name, _bits) \ - pr_buf(out, " "#_name " %llu", (u64) inode->_name); - BCH_INODE_FIELDS() + prt_printf(out, " "#_name " %llu", (u64) inode->_name); + BCH_INODE_FIELDS_v3() #undef x } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - pr_buf(out, "inum: %llu ", inode->bi_inum); + prt_printf(out, "inum: %llu ", inode->bi_inum); __bch2_inode_unpacked_to_text(out, inode); } -void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_inode_unpacked inode; if (bch2_inode_unpack(k, &inode)) { - pr_buf(out, "(unpack error)"); + prt_printf(out, "(unpack error)"); return; } __bch2_inode_unpacked_to_text(out, &inode); } -const char *bch2_inode_generation_invalid(const struct bch_fs *c, - struct bkey_s_c k) +int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (k.k->p.inode) - return "nonzero k.p.inode"; + if (k.k->p.inode) { + prt_printf(err, "nonzero k.p.inode"); + return -EINVAL; + } - if 
(bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, @@ -413,7 +552,7 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); - pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); + prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } void bch2_inode_init_early(struct bch_fs *c, @@ -549,7 +688,7 @@ again: } if (!ret && start == min) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_inode_create; if (ret) { bch2_trans_iter_exit(trans, iter); @@ -606,12 +745,12 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); ret = bkey_err(k); if (ret) goto err; - if (!k.k || iter.pos.inode != inum.inum) + if (!k.k) break; bkey_init(&delete.k); @@ -621,7 +760,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } @@ -692,7 +831,7 @@ retry: BTREE_INSERT_NOFAIL); err: bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -718,3 +857,36 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inum, inode)); } + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & 
BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else { + if (bi->bi_nlink == U32_MAX) + return -EINVAL; + + bi->bi_nlink++; + } + + return 0; +} + +void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) +{ + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { + bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", + bi->bi_inum); + return; + } + + if (bi->bi_flags & BCH_INODE_UNLINKED) { + bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); + return; + } + + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; +} diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 77957cc..2915f4f 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -2,34 +2,47 @@ #ifndef _BCACHEFS_INODE_H #define _BCACHEFS_INODE_H +#include "bkey.h" #include "opts.h" extern const char * const bch2_inode_opts[]; -const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ } #define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ +} + +#define bch2_bkey_ops_inode_v3 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v3_invalid, \ + .val_to_text = bch2_inode_to_text, \ + 
.trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ } static inline bool bkey_is_inode(const struct bkey *k) { return k->type == KEY_TYPE_inode || - k->type == KEY_TYPE_inode_v2; + k->type == KEY_TYPE_inode_v2 || + k->type == KEY_TYPE_inode_v3; } -const char *bch2_inode_generation_invalid(const struct bch_fs *, - struct bkey_s_c); -void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ .key_invalid = bch2_inode_generation_invalid, \ @@ -48,25 +61,28 @@ struct bch_inode_unpacked { u64 bi_inum; u64 bi_journal_seq; __le64 bi_hash_seed; + u64 bi_size; + u64 bi_sectors; + u64 bi_version; u32 bi_flags; u16 bi_mode; #define x(_name, _bits) u##_bits _name; - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x }; struct bkey_inode_buf { - struct bkey_i_inode_v2 inode; + struct bkey_i_inode_v3 inode; #define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[0 + BCH_INODE_FIELDS()]; + u8 _pad[0 + BCH_INODE_FIELDS_v3()]; #undef x } __attribute__((packed, aligned(8))); -void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, - const struct bch_inode_unpacked *); +void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); +struct bkey_s_c bch2_inode_to_v3(struct btree_trans *, struct bkey_s_c); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); @@ -161,23 +177,6 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 
2 : 1; } -static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -{ - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; - else - bi->bi_nlink++; -} - -static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) -{ - BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; -} - static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) { return bi->bi_flags & BCH_INODE_UNLINKED @@ -197,4 +196,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, } } +int bch2_inode_nlink_inc(struct bch_inode_unpacked *); +void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); + #endif /* _BCACHEFS_INODE_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 10f8b3a..5971569 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -242,8 +242,7 @@ int bch2_extent_update(struct btree_trans *trans, s64 *i_sectors_delta_total, bool check_enospc) { - struct btree_iter inode_iter; - struct bch_inode_unpacked inode_u; + struct btree_iter inode_iter = { NULL }; struct bpos next_pos; bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -283,36 +282,71 @@ int bch2_extent_update(struct btree_trans *trans, return ret; } - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, - BTREE_ITER_INTENT); - if (ret) - return ret; + if (new_i_size || i_sectors_delta) { + struct bkey_s_c k; + struct bkey_s_c_inode_v3 inode; + struct bkey_i_inode_v3 *new_inode; + bool i_size_update; + + bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inum.inum, iter->snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&inode_iter); + ret = bkey_err(k); + if (unlikely(ret)) + goto err; + + ret = bkey_is_inode(k.k) ? 
0 : -ENOENT; + if (unlikely(ret)) + goto err; + + if (unlikely(k.k->type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = bkey_err(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_s_c_to_inode_v3(k); + i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode.v->bi_size); - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) - inode_u.bi_size = new_i_size; + if (!i_sectors_delta && !i_size_update) + goto no_inode_update; - inode_u.bi_sectors += i_sectors_delta; + new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new_inode); + if (unlikely(ret)) + goto err; + bkey_reassemble(&new_inode->k_i, k); + + if (i_size_update) + new_inode->v.bi_size = cpu_to_le64(new_i_size); + + le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); + ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0); + if (unlikely(ret)) + goto err; + } +no_inode_update: ret = bch2_trans_update(trans, iter, k, 0) ?: - bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); - bch2_trans_iter_exit(trans, &inode_iter); - - if (ret) - return ret; + if (unlikely(ret)) + goto err; if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; bch2_btree_iter_set_pos(iter, next_pos); - - return 0; +err: + bch2_trans_iter_exit(trans, &inode_iter); + return ret; } /* - * Returns -EINTR if we had to drop locks: + * Returns -BCH_ERR_transaction_restart if we had to drop locks: */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, subvol_inum inum, u64 end, @@ -325,7 +359,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, int ret = 0, ret2 = 0; u32 snapshot; - while (!ret || ret == -EINTR) { + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0); struct bkey_i delete; @@ -384,14 +419,16 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return ret == -EINTR ? 0 : ret; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; } int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; - struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; @@ -415,7 +452,7 @@ int bch2_write_index_default(struct bch_write_op *op) ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &sk.k->k.p.snapshot); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; @@ -430,14 +467,11 @@ int bch2_write_index_default(struct bch_write_op *op) op->flags & BCH_WRITE_CHECK_ENOSPC); bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (ec_ob) - bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); - if (bkey_cmp(iter.pos, k->k.p) >= 0) bch2_keylist_pop_front(&op->insert_keys); else @@ -470,8 +504,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ca = bch_dev_bkey_exists(c, ptr->dev); if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, - &ca->replica_set)); + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOIO, &ca->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -531,17 +565,11 @@ static void bch2_write_done(struct closure *cl) } } -/** - * bch_write_index - after a write, update index to point to new data - */ -static void __bch2_write_index(struct bch_write_op *op) +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { - struct bch_fs *c = op->c; 
struct keylist *keys = &op->insert_keys; struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n, *k; - unsigned dev; - int ret; + struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); @@ -550,10 +578,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { - ret = -EIO; - goto err; - } + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; } if (dst != src) @@ -562,6 +588,25 @@ static void __bch2_write_index(struct bch_write_op *op) } keys->top = dst; + return 0; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; + int ret; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } /* * probably not the ideal place to hook this in, but I don't @@ -580,14 +625,14 @@ static void __bch2_write_index(struct bch_write_op *op) u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); - BUG_ON(ret == -EINTR); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); if (ret) { bch_err_inum_ratelimited(c, op->pos.inode, - "write error %i from btree update", ret); + "write error while doing btree update: %s", bch2_err_str(ret)); op->error = ret; } } @@ -636,8 +681,10 @@ static void bch2_write_endio(struct bio *bio) op->pos.inode, op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } if 
(wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); @@ -701,7 +748,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, pages = min(pages, BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOIO, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; /* copy WRITE_SYNC flag */ @@ -764,6 +812,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) struct bch_fs *c = op->c; struct nonce nonce = extent_nonce(op->version, op->crc); struct bch_csum csum; + int ret; if (!bch2_csum_type_is_encryption(op->crc.csum_type)) return 0; @@ -778,10 +827,10 @@ static int bch2_write_decrypt(struct bch_write_op *op) if (bch2_crc_cmp(op->crc.csum, csum)) return -EIO; - bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); op->crc.csum_type = 0; op->crc.csum = (struct bch_csum) { 0, 0 }; - return 0; + return ret; } static enum prep_encoded_ret { @@ -911,8 +960,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, saved_iter = dst->bi_iter; do { - struct bch_extent_crc_unpacked crc = - (struct bch_extent_crc_unpacked) { 0 }; + struct bch_extent_crc_unpacked crc = { 0 }; struct bversion version = op->version; size_t dst_len, src_len; @@ -964,6 +1012,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; /* * Note: when we're using rechecksum(), we need to be * checksumming @src because it has all the data our @@ -982,6 +1032,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bio_sectors(src) - (src_len >> 9), op->csum_type)) goto csum_err; + /* + * bch2_rechecksum_bio() sets compression_type on crc from op->crc, + * this isn't
always correct as sometimes we're changing + * an extent from uncompressed to incompressible. + */ + crc.compression_type = compression_type; + crc.nonce = nonce; } else { if ((op->flags & BCH_WRITE_DATA_ENCODED) && bch2_rechecksum_bio(c, src, version, op->crc, @@ -996,8 +1053,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.live_size = src_len >> 9; swap(dst->bi_iter.bi_size, dst_len); - bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + crc.csum = bch2_checksum_bio(c, op->csum_type, extent_nonce(version, crc), dst); crc.csum_type = op->csum_type; @@ -1038,8 +1098,7 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while " - "rewriting existing data (memory corruption?)"); + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1055,7 +1114,7 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; - struct bio *bio; + struct bio *bio = NULL; bool skip_put = true; unsigned nofs_flags; int ret; @@ -1080,12 +1139,6 @@ again: BKEY_EXTENT_U64s_MAX)) goto flush_io; - if ((op->flags & BCH_WRITE_FROM_INTERNAL) && - percpu_ref_is_dying(&c->writes)) { - ret = -EROFS; - goto err; - } - /* * The copygc thread is now global, which means it's no longer * freeing up space on specific disks, which means that @@ -1104,8 +1157,8 @@ again: BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); EBUG_ON(!wp); - if (unlikely(IS_ERR(wp))) { - if (unlikely(PTR_ERR(wp) != -EAGAIN)) { + if (IS_ERR(wp)) { + if (unlikely(wp != ERR_PTR(-EAGAIN))) { ret = PTR_ERR(wp); goto err; } @@ -1279,11 +1332,12 @@ void bch2_write(struct closure *cl) } if (c->opts.nochanges || - !percpu_ref_tryget(&c->writes)) { + !percpu_ref_tryget_live(&c->writes)) { op->error = -EROFS; goto err; } + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -1319,7 +1373,7 @@ struct promote_op { struct rhash_head hash; struct bpos pos; - struct migrate_write write; + struct data_update write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; @@ -1375,17 +1429,16 @@ static void promote_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + bch2_data_update_exit(&op->write); promote_free(c, op); } static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) { - struct bch_fs *c = rbio->c; struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; - trace_promote(&rbio->bio); + trace_and_count(op->write.op.c, read_promote, &rbio->bio); /* we now own pages: */ BUG_ON(!rbio->bounce); @@ -1395,10 +1448,8 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) sizeof(struct bio_vec) * rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - bch2_migrate_read_done(&op->write, rbio); - closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); + bch2_data_update_read_done(&op->write, rbio->pick.crc, cl); closure_return_with_destructor(cl, promote_done); } @@ -1416,7 +1467,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return NULL; 
op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); @@ -1437,7 +1488,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, goto err; rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_NOIO)) @@ -1452,15 +1503,15 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, goto err; bio = &op->write.op.wbio.bio; - bio_init(bio, bio->bi_inline_vecs, pages); + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_migrate_write_init(c, &op->write, + ret = bch2_data_update_init(c, &op->write, writepoint_hashed((unsigned long) current), opts, - DATA_PROMOTE, - (struct data_opts) { + (struct data_update_opts) { .target = opts.promote_target, - .nr_replicas = 1, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); BUG_ON(ret); @@ -1653,7 +1704,7 @@ static void bch2_rbio_retry(struct work_struct *work) }; struct bch_io_failures failed = { .nr = 0 }; - trace_read_retry(&rbio->bio); + trace_and_count(c, read_retry, &rbio->bio); if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); @@ -1772,6 +1823,7 @@ static void __bch2_read_endio(struct work_struct *work) struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; struct bch_csum csum; + int ret; nofs_flags = memalloc_nofs_save(); @@ -1806,7 +1858,10 @@ static void __bch2_read_endio(struct work_struct *work) crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc_is_compressed(crc)) { - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; } else { @@ -1817,7 +1872,9 @@ static void __bch2_read_endio(struct work_struct *work) BUG_ON(src->bi_iter.bi_size < 
dst_iter.bi_size); src->bi_iter.bi_size = dst_iter.bi_size; - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -1830,7 +1887,10 @@ static void __bch2_read_endio(struct work_struct *work) * Re encrypt data we decrypted, so it's consistent with * rbio->crc: */ - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + promote_start(rbio->promote, rbio); rbio->promote = NULL; } @@ -1855,9 +1915,9 @@ csum_err: } bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, crc.csum_type); + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: @@ -1865,6 +1925,11 @@ decompression_err: "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); goto out; +decrypt_err: + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; } static void bch2_read_endio(struct bio *bio) @@ -1895,7 +1960,7 @@ static void bch2_read_endio(struct bio *bio) if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ptr_stale(ca, &rbio->pick.ptr)) { - atomic_long_inc(&c->read_realloc_races); + trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); @@ -1905,6 +1970,7 @@ static void bch2_read_endio(struct bio *bio) } if (rbio->narrow_crcs || + rbio->promote || crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = 
RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; @@ -1960,24 +2026,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); struct btree_iter iter; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), + PTR_BUCKET_POS(c, &ptr), BTREE_ITER_CACHED); + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (ret) - return; + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch_err(c, "%s", buf); - bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); } int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, @@ -2021,7 +2095,14 @@ retry_pick: ca = bch_dev_bkey_exists(c, pick.ptr.dev); - if (!pick.ptr.cached && + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && unlikely(ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, k, pick.ptr); bch2_mark_io_failure(failed, &pick); @@ -2105,8 +2186,10 @@ get_bio: } else if (bounce) { unsigned sectors = pick.crc.compressed_size; - rbio = 
rbio_init(bio_alloc_bioset(GFP_NOIO, + rbio = rbio_init(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOIO, &c->bio_read_split), orig->opts); @@ -2122,8 +2205,8 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, - &c->bio_read_split), + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO, + &c->bio_read_split), orig->opts); rbio->bio.bi_iter = iter; rbio->split = true; @@ -2165,8 +2248,9 @@ get_bio: rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) - trace_read_bounce(&rbio->bio); + trace_and_count(c, read_bounce, &rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* @@ -2179,7 +2263,7 @@ get_bio: if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + trace_and_count(c, read_split, &orig->bio); } if (!rbio->pick.idx) { @@ -2288,10 +2372,9 @@ retry: * read_extent -> io_time_reset may cause a transaction restart * without returning an error, we need to check for that here: */ - if (!bch2_trans_relock(&trans)) { - ret = -EINTR; + ret = bch2_trans_relock(&trans); + if (ret) break; - } bch2_btree_iter_set_pos(&iter, POS(inum.inum, bvec_iter.bi_sector)); @@ -2345,7 +2428,9 @@ retry: err: bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) goto retry; bch2_trans_exit(&trans); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 1aa422d..3ae3175 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -40,6 +40,7 @@ enum bch_write_flags { BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), BCH_WRITE_DONE = (1 << 12), + BCH_WRITE_IO_ERROR = (1 << 13), }; static inline u64 
*op_journal_seq(struct bch_write_op *op) @@ -50,7 +51,7 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { - return op->alloc_reserve == RESERVE_MOVINGGC + return op->alloc_reserve == RESERVE_movinggc ? op->c->copygc_wq : op->c->btree_update_wq; } @@ -79,7 +80,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; - op->alloc_reserve = RESERVE_NONE; + op->alloc_reserve = RESERVE_none; op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 158df42..95c2922 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -15,23 +15,26 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "super-io.h" #include -static u64 last_unwritten_seq(struct journal *j) -{ - union journal_res_state s = READ_ONCE(j->reservations); +#define x(n) #n, +static const char * const bch2_journal_watermarks[] = { + JOURNAL_WATERMARKS() + NULL +}; - lockdep_assert_held(&j->lock); - - return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); -} +static const char * const bch2_journal_errors[] = { + JOURNAL_ERRORS() + NULL +}; +#undef x static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { - return seq >= last_unwritten_seq(j); + return seq > j->seq_ondisk; } static bool __journal_entry_is_open(union journal_res_state state) @@ -39,6 +42,11 @@ static bool __journal_entry_is_open(union journal_res_state state) return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + static bool 
journal_entry_is_open(struct journal *j) { return __journal_entry_is_open(j->reservations); @@ -50,8 +58,6 @@ journal_seq_to_buf(struct journal *j, u64 seq) struct journal_buf *buf = NULL; EBUG_ON(seq > journal_cur_seq(j)); - EBUG_ON(seq == journal_cur_seq(j) && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); if (journal_seq_unwritten(j, seq)) { buf = j->buf + (seq & JOURNAL_BUF_MASK); @@ -69,54 +75,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) p->devs.nr = 0; } -static void journal_pin_new_entry(struct journal *j) -{ - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - journal_pin_list_init(fifo_push_ref(&j->pin), 1); -} - -static void bch2_journal_buf_init(struct journal *j) -{ - struct journal_buf *buf = journal_cur_buf(j); - - bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(journal_cur_seq(j)); - buf->data->u64s = 0; -} - -void bch2_journal_halt(struct journal *j) -{ - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; - - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - /* - * XXX: we're not using j->lock here because this can be called from - * interrupt context, this can race with journal_write_done() - */ - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - closure_wake_up(&journal_cur_buf(j)->wait); -} - /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j) @@ -132,7 +90,7 @@ void __bch2_journal_buf_put(struct journal *j) * We don't close a journal_buf until the next journal_buf is finished 
writing, * and can be opened again - this also initializes the next journal_buf: */ -static bool __journal_entry_close(struct journal *j) +static void __journal_entry_close(struct journal *j, unsigned closed_val) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -140,34 +98,24 @@ static bool __journal_entry_close(struct journal *j) u64 v = atomic64_read(&j->reservations.counter); unsigned sectors; + BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && + closed_val != JOURNAL_ENTRY_ERROR_VAL); + lockdep_assert_held(&j->lock); do { old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return true; + new.cur_entry_offset = closed_val; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { - /* this entry will never be written: */ - closure_wake_up(&buf->wait); - return true; - } - - if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { - set_bit(JOURNAL_NEED_WRITE, &j->flags); - j->need_write_time = local_clock(); - } - - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; - new.idx++; - - if (new.idx == new.unwritten_idx) - return false; - - BUG_ON(journal_state_count(new, new.idx)); + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || + old.cur_entry_offset == new.cur_entry_offset) + return; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + if (!__journal_entry_is_open(old)) + return; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -197,36 +145,42 @@ static bool __journal_entry_close(struct journal *j) */ buf->last_seq = journal_last_seq(j); buf->data->last_seq = cpu_to_le64(buf->last_seq); + BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); - /* Initialize new buffer: */ - journal_pin_new_entry(j); - - bch2_journal_buf_init(j); - cancel_delayed_work(&j->write_work); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); bch2_journal_space_available(j); 
bch2_journal_buf_put(j, old.idx); - return true; +} + +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); } static bool journal_entry_want_write(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - bool ret = false; + bool ret = !journal_entry_is_open(j) || + journal_cur_seq(j) == journal_last_unwritten_seq(j); - /* - * Don't close it yet if we already have a write in flight, but do set - * NEED_WRITE: - */ - if (s.idx != s.unwritten_idx) - set_bit(JOURNAL_NEED_WRITE, &j->flags); - else - ret = __journal_entry_close(j); + /* Don't close it yet if we already have a write in flight: */ + if (ret) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + else if (nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + } return ret; } @@ -255,34 +209,71 @@ static bool journal_entry_close(struct journal *j) static int journal_entry_open(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = journal_cur_buf(j); + struct journal_buf *buf = j->buf + + ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; u64 v; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return cur_entry_blocked; + return JOURNAL_ERR_blocked; if (j->cur_entry_error) return j->cur_entry_error; + if (bch2_journal_error(j)) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + + if (!fifo_free(&j->pin)) + return JOURNAL_ERR_journal_pin_full; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) + return JOURNAL_ERR_max_in_flight; + BUG_ON(!j->cur_entry_sectors); + buf->expires = + 
(journal_cur_seq(j) == j->flushed_seq_ondisk + ? jiffies + : j->last_flush_write) + + msecs_to_jiffies(c->opts.journal_flush_delay); + buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); u64s = (int) (buf->sectors << 9) / sizeof(u64) - journal_entry_overhead(j); - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - if (u64s <= le32_to_cpu(buf->data->u64s)) - return cur_entry_journal_full; + if (u64s <= 0) + return JOURNAL_ERR_journal_full; + + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); + + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); + + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; /* * Must be set before marking the journal entry as open: @@ -293,14 +284,14 @@ static int journal_entry_open(struct journal *j) do { old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return cur_entry_insufficient_devices; + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + new.idx++; + BUG_ON(journal_state_count(new, new.idx)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); - EBUG_ON(journal_state_count(new, new.idx)); journal_state_inc(&new); + new.cur_entry_offset = 0; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -318,8 +309,7 @@ static int 
journal_entry_open(struct journal *j) static bool journal_quiesced(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); + bool ret = atomic64_read(&j->seq) == j->seq_ondisk; if (!ret) journal_entry_close(j); @@ -334,8 +324,21 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + long delta; + + spin_lock(&j->lock); + if (!__journal_entry_is_open(j->reservations)) + goto unlock; + + delta = journal_cur_buf(j)->expires - jiffies; - journal_entry_close(j); + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); +unlock: + spin_unlock(&j->lock); } static int __journal_res_get(struct journal *j, struct journal_res *res, @@ -364,13 +367,12 @@ retry: return 0; } - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { /* * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = cur_entry_journal_full; + ret = JOURNAL_ERR_journal_full; goto unlock; } @@ -385,23 +387,16 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - if (journal_entry_is_open(j) && - !__journal_entry_close(j)) { - /* - * We failed to get a reservation on the current open journal - * entry because it's full, and we can't close it because - * there's still a previous one in flight: - */ - trace_journal_entry_full(c); - ret = cur_entry_blocked; - } else { - ret = journal_entry_open(j); - } + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + ret = journal_entry_open(j); + + if (ret == JOURNAL_ERR_max_in_flight) + trace_and_count(c, journal_entry_full, 
c); unlock: - if ((ret && ret != cur_entry_insufficient_devices) && + if ((ret && ret != JOURNAL_ERR_insufficient_devices) && !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; - trace_journal_full(c); + trace_and_count(c, journal_full, c); } can_discard = j->can_discard; @@ -410,23 +405,24 @@ unlock: if (!ret) goto retry; - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !can_discard && - j->reservations.idx == j->reservations.unwritten_idx && - (flags & JOURNAL_RES_GET_RESERVED)) { - char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); - - bch_err(c, "Journal stuck!"); - if (journal_debug_buf) { - bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); - bch_err(c, "%s", journal_debug_buf); - - bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); - bch_err(c, "Journal pins:\n%s", journal_debug_buf); - kfree(journal_debug_buf); - } + !nr_unwritten_journal_entries(j) && + (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { + struct printbuf buf = PRINTBUF; + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)", + bch2_journal_errors[ret]); + + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + + printbuf_exit(&buf); bch2_fatal_error(c); dump_stack(); } @@ -435,8 +431,8 @@ unlock: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -449,7 +445,7 @@ unlock: } } - return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; + return ret == JOURNAL_ERR_insufficient_devices ? 
-EROFS : -EAGAIN; } /* @@ -528,7 +524,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -573,12 +569,15 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, } /* if seq was written, but not flushed - flush a newer one instead */ - seq = max(seq, last_unwritten_seq(j)); + seq = max(seq, journal_last_unwritten_seq(j)); recheck_need_open: - if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + if (seq > journal_cur_seq(j)) { struct journal_res res = { 0 }; + if (journal_entry_is_open(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + spin_unlock(&j->lock); ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); @@ -588,7 +587,11 @@ recheck_need_open: seq = res.seq; buf = j->buf + (seq & JOURNAL_BUF_MASK); buf->must_flush = true; - set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } if (parent && !closure_wait(&buf->wait, parent)) BUG(); @@ -640,69 +643,18 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) return ret ?: ret2 < 0 ? 
ret2 : 0; } -int bch2_journal_meta(struct journal *j) -{ - struct journal_buf *buf; - struct journal_res res; - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); - if (ret) - return ret; - - buf = j->buf + (res.seq & JOURNAL_BUF_MASK); - buf->must_flush = true; - set_bit(JOURNAL_NEED_WRITE, &j->flags); - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq); -} - /* * bch2_journal_flush_async - if there is an open journal entry, or a journal * still being written, write it and wait for the write to complete */ void bch2_journal_flush_async(struct journal *j, struct closure *parent) { - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return; - } - spin_unlock(&j->lock); - - bch2_journal_flush_seq_async(j, seq, parent); + bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); } int bch2_journal_flush(struct journal *j) { - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return 0; - } - spin_unlock(&j->lock); - - return bch2_journal_flush_seq(j, seq); + return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); } /* @@ -725,13 +677,13 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (seq <= c->journal.flushed_seq_ondisk) goto out; - for (unwritten_seq = last_unwritten_seq(j); + for (unwritten_seq = journal_last_unwritten_seq(j); unwritten_seq < seq; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) + if (unwritten_seq == journal_last_unwritten_seq(j) && 
!buf->noflush) goto out; buf->noflush = true; @@ -743,6 +695,64 @@ out: return ret; } +int bch2_journal_meta(struct journal *j) +{ + struct journal_buf *buf; + struct journal_res res; + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) +{ + struct jset_entry_log *entry; + struct journal_res res = { 0 }; + unsigned msglen, u64s; + va_list args; + int ret; + + va_start(args, fmt); + msglen = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); + + ret = bch2_journal_res_get(j, &res, u64s, 0); + if (ret) + return ret; + + entry = container_of(journal_res_entry(j, &res), + struct jset_entry_log, entry); + memset(entry, 0, u64s * sizeof(u64)); + entry->entry.type = BCH_JSET_ENTRY_log; + entry->entry.u64s = u64s - 1; + + va_start(args, fmt); + vsnprintf(entry->d, INT_MAX, fmt, args); + va_end(args); + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -770,28 +780,55 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; u64 *new_bucket_seq = NULL, *new_buckets = NULL; + struct open_bucket **ob = NULL; + long *bu = NULL; + unsigned i, nr_got = 0, nr_want = nr - ja->nr; + unsigned old_nr = ja->nr; + unsigned old_discard_idx = ja->discard_idx; + unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; + unsigned old_dirty_idx = ja->dirty_idx; + unsigned old_cur_idx = 
ja->cur_idx; int ret = 0; - /* don't handle reducing nr of buckets yet: */ - if (nr <= ja->nr) - return 0; + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + } - new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); - new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) { + bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); + ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); + new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); + new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); + if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; - goto err; + goto err_unblock; } - journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) { - ret = -ENOSPC; - goto err; + for (nr_got = 0; nr_got < nr_want; nr_got++) { + if (new_fs) { + bu[nr_got] = bch2_bucket_alloc_new_fs(ca); + if (bu[nr_got] < 0) { + ret = -BCH_ERR_ENOSPC_bucket_alloc; + break; + } + } else { + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, + false, cl); + if (IS_ERR(ob[nr_got])) { + ret = cl + ? -EAGAIN + : -BCH_ERR_ENOSPC_bucket_alloc; + break; + } + + bu[nr_got] = ob[nr_got]->bucket; + } } + if (!nr_got) + goto err_unblock; + /* * We may be called from the device add path, before the new device has * actually been added to the running filesystem: @@ -804,51 +841,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - if (!new_fs) - spin_unlock(&c->journal.lock); - - while (ja->nr < nr) { - struct open_bucket *ob = NULL; - unsigned pos; - long b; - - if (new_fs) { - b = bch2_bucket_alloc_new_fs(ca); - if (b < 0) { - ret = -ENOSPC; - goto err; - } - } else { - rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, - false, cl); - rcu_read_unlock(); - if (IS_ERR(ob)) { - ret = cl ? 
-EAGAIN : -ENOSPC; - goto err; - } - - b = ob->bucket; - } - - if (c) - spin_lock(&c->journal.lock); - - /* - * XXX - * For resize at runtime, we should be writing the new - * superblock before inserting into the journal array - */ + for (i = 0; i < nr_got; i++) { + unsigned pos = ja->discard_idx ?: ja->nr; + long b = bu[i]; - pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); - __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; ja->buckets[pos] = b; ja->bucket_seq[pos] = 0; - journal_buckets->buckets[pos] = cpu_to_le64(b); if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -858,29 +860,56 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + } - if (c) - spin_unlock(&c->journal.lock); + ret = bch2_journal_buckets_to_sb(c, ca); + if (ret) { + /* Revert: */ + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = old_nr; + ja->discard_idx = old_discard_idx; + ja->dirty_idx_ondisk = old_dirty_idx_ondisk; + ja->dirty_idx = old_dirty_idx; + ja->cur_idx = old_cur_idx; + } - if (!new_fs) { - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_trans_mark_metadata_bucket(&trans, ca, - b, BCH_DATA_journal, - ca->mi.bucket_size)); + if (!new_fs) + spin_unlock(&c->journal.lock); - bch2_open_bucket_put(c, ob); + if (c) + bch2_journal_unblock(&c->journal); + + if (ret) + goto err; - if (ret) + if (!new_fs) { + for (i = 0; i < nr_got; i++) { + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_journal, + ca->mi.bucket_size)); + if (ret) { + bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); goto err; + } } } err: - bch2_sb_resize_journal(&ca->disk_sb, - ja->nr + sizeof(*journal_buckets) / sizeof(u64)); + if (ob 
&& !new_fs) + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); + kfree(new_bucket_seq); kfree(new_buckets); + kfree(ob); + kfree(bu); return ret; +err_unblock: + if (c) + bch2_journal_unblock(&c->journal); + goto err; } /* @@ -893,11 +922,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, struct journal_device *ja = &ca->journal; struct closure cl; unsigned current_nr; - int ret; + int ret = 0; + + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + return 0; closure_init_stack(&cl); - do { + while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { struct disk_reservation disk_res = { 0, 0 }; closure_sync(&cl); @@ -912,10 +945,11 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * reservation to ensure we'll actually be able to allocate: */ - if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) { mutex_unlock(&c->sb_lock); - return -ENOSPC; + return ret; } ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); @@ -925,7 +959,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (ja->nr != current_nr) bch2_write_super(c); mutex_unlock(&c->sb_lock); - } while (ret == -EAGAIN); + } return ret; } @@ -933,6 +967,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, int bch2_dev_journal_alloc(struct bch_dev *ca) { unsigned nr; + int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; @@ -949,24 +984,31 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + if (ca->fs) + mutex_lock(&ca->fs->sb_lock); + + ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + + if (ca->fs) + mutex_unlock(&ca->fs->sb_lock); + + return ret; } /* startup/shutdown: */ static bool 
bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { - union journal_res_state state; bool ret = false; - unsigned i; + u64 seq; spin_lock(&j->lock); - state = READ_ONCE(j->reservations); - i = state.idx; + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j) && !ret; + seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, seq); - while (i != state.unwritten_idx) { - i = (i - 1) & JOURNAL_BUF_MASK; - if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) + if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); @@ -981,6 +1023,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); wait_event(j->wait, journal_entry_close(j)); @@ -995,24 +1038,30 @@ void bch2_fs_journal_stop(struct journal *j) BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_REPLAY_DONE, &j->flags) && - (journal_entry_is_open(j) || - j->last_empty_seq + 1 != journal_cur_seq(j))); + j->last_empty_seq != journal_cur_seq(j)); cancel_delayed_work_sync(&j->write_work); - bch2_journal_reclaim_stop(j); } -int bch2_fs_journal_start(struct journal *j, u64 cur_seq, - struct list_head *journal_entries) +int bch2_fs_journal_start(struct journal *j, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; - struct journal_replay *i; + struct journal_replay *i, **_i; + struct genradix_iter iter; + bool had_entries = false; + unsigned ptr; u64 last_seq = cur_seq, nr, seq; - if (!list_empty(journal_entries)) - last_seq = le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, list)->j.last_seq); + genradix_for_each_reverse(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + last_seq = le64_to_cpu(i->j.last_seq); + break; + } nr = cur_seq - last_seq; @@ -1029,18 +1078,19 @@ int 
bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); - if (list_empty(journal_entries)) - j->last_empty_seq = cur_seq - 1; - fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); - list_for_each_entry(i, journal_entries, list) { - unsigned ptr; + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); @@ -1056,9 +1106,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, p->devs.nr = 0; for (ptr = 0; ptr < i->nr_ptrs; ptr++) bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + + had_entries = true; } - if (list_empty(journal_entries)) + if (!had_entries) j->last_empty_seq = cur_seq; spin_lock(&j->lock); @@ -1066,11 +1118,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); j->last_flush_write = jiffies; - journal_pin_new_entry(j); - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - - bch2_journal_buf_init(j); + j->reservations.unwritten_idx++; c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1098,25 +1147,49 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_get_journal(sb); - unsigned i; + struct bch_sb_field_journal_v2 *journal_buckets_v2 = + bch2_sb_get_journal_v2(sb); + unsigned i, nr_bvecs; + + ja->nr = 0; - ja->nr = bch2_nr_journal_buckets(journal_buckets); + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + + for (i = 0; i < nr; i++) + ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); + } else if (journal_buckets) { + ja->nr = bch2_nr_journal_buckets(journal_buckets); + } 
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) return -ENOMEM; - ca->journal.bio = bio_kmalloc(GFP_KERNEL, - DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); + nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + + ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!ca->journal.bio) return -ENOMEM; + bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) return -ENOMEM; - for (i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + unsigned j, dst = 0; + + for (i = 0; i < nr; i++) + for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + ja->buckets[dst++] = + le64_to_cpu(journal_buckets_v2->d[i].start) + j; + } else if (journal_buckets) { + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + } return 0; } @@ -1182,68 +1255,94 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) union journal_res_state s; struct bch_dev *ca; unsigned long now = jiffies; + u64 seq; unsigned i; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + out->atomic++; + rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); - pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); - pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); - pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); - pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); - pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - pr_buf(out, "nr 
direct reclaim:\t%llu\n", j->nr_direct_reclaim); - pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); - pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); + prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); - pr_buf(out, "current entry:\t\t"); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(out, "error\n"); + prt_printf(out, "error"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(out, "closed\n"); + prt_printf(out, "closed"); break; default: - pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); + prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); + prt_newline(out); + + for (seq = journal_cur_seq(j); + seq >= journal_last_unwritten_seq(j); + --seq) { + i = seq & JOURNAL_BUF_MASK; + + prt_printf(out, "unwritten entry:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); - i = s.idx; - while (i != s.unwritten_idx) { - i = (i - 1) & JOURNAL_BUF_MASK; + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); - pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", - i, journal_state_count(s, i), j->buf[i].sectors); + prt_printf(out, "sectors:"); + prt_tab(out); + prt_printf(out, "%u", j->buf[i].sectors); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); } - pr_buf(out, - "need write:\t\t%i\n" + prt_printf(out, "replay done:\t\t%i\n", - test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - pr_buf(out, "space:\n"); - pr_buf(out, "\tdiscarded\t%u:%u\n", + 
prt_printf(out, "space:\n"); + prt_printf(out, "\tdiscarded\t%u:%u\n", j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); - pr_buf(out, "\tclean ondisk\t%u:%u\n", + prt_printf(out, "\tclean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); - pr_buf(out, "\tclean\t\t%u:%u\n", + prt_printf(out, "\tclean\t\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); - pr_buf(out, "\ttotal\t\t%u:%u\n", + prt_printf(out, "\ttotal\t\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); @@ -1257,17 +1356,19 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - pr_buf(out, "dev %u:\n", i); - pr_buf(out, "\tnr\t\t%u\n", ja->nr); - pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); - pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); - pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + prt_printf(out, "dev %u:\n", i); + prt_printf(out, "\tnr\t\t%u\n", ja->nr); + prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, 
ja->bucket_seq[ja->cur_idx]); } rcu_read_unlock(); + + --out->atomic; } void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) @@ -1277,27 +1378,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) spin_unlock(&j->lock); } -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - u64 i; spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - pr_buf(out, "%llu: count %u\n", - i, atomic_read(&pin_list->count)); + *seq = max(*seq, j->pin.front); - list_for_each_entry(pin, &pin_list->list, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); + prt_newline(out); + printbuf_indent_add(out, 2); - if (!list_empty(&pin_list->flushed)) - pr_buf(out, "flushed:\n"); + list_for_each_entry(pin, &pin_list->list, list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } + + list_for_each_entry(pin, &pin_list->key_cache_list, list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } + + if (!list_empty(&pin_list->flushed)) { + prt_printf(out, "flushed:"); + prt_newline(out); + } - list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + list_for_each_entry(pin, &pin_list->flushed, list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); } + + printbuf_indent_sub(out, 2); + + --out->atomic; spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; } diff --git a/libbcachefs/journal.h 
b/libbcachefs/journal.h index b298873..9428f42 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -110,6 +110,7 @@ */ #include +#include #include "journal_types.h" @@ -141,7 +142,10 @@ static inline u64 journal_cur_seq(struct journal *j) return j->pin.back - 1; } -void bch2_journal_set_has_inum(struct journal *, u64, u64); +static inline u64 journal_last_unwritten_seq(struct journal *j) +{ + return j->seq_ondisk + 1; +} static inline int journal_state_count(union journal_res_state s, int idx) { @@ -196,9 +200,9 @@ journal_res_entry(struct journal *j, struct journal_res *res) return vstruct_idx(j->buf[res->idx].data, res->offset); } -static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, +static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, enum btree_id id, unsigned level, - const void *data, unsigned u64s) + unsigned u64s) { entry->u64s = cpu_to_le16(u64s); entry->btree_id = id; @@ -207,32 +211,33 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type entry->pad[0] = 0; entry->pad[1] = 0; entry->pad[2] = 0; - memcpy_u64s_small(entry->_data, data, u64s); - return jset_u64s(u64s); } -static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, - unsigned type, enum btree_id id, - unsigned level, +static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, const void *data, unsigned u64s) { - unsigned actual = journal_entry_set(journal_res_entry(j, res), - type, id, level, data, u64s); + unsigned ret = journal_entry_init(entry, type, id, level, u64s); + + memcpy_u64s_small(entry->_data, data, u64s); + return ret; +} + +static inline struct jset_entry * +bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, unsigned u64s) +{ + struct jset_entry *entry = journal_res_entry(j, res); + unsigned actual = 
journal_entry_init(entry, type, id, level, u64s); EBUG_ON(!res->ref); EBUG_ON(actual > res->u64s); res->offset += actual; res->u64s -= actual; -} - -static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, - enum btree_id id, unsigned level, - const struct bkey_i *k) -{ - bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, - id, level, k, k->k.u64s); + return entry; } static inline bool journal_entry_empty(struct jset *j) @@ -261,9 +266,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) .buf3_count = idx == 3, }).v, &j->reservations.counter); - EBUG_ON(((s.idx - idx) & 3) > - ((s.idx - s.unwritten_idx) & 3)); - if (!journal_state_count(s, idx) && idx == s.unwritten_idx) __bch2_journal_buf_put(j); } @@ -283,7 +285,7 @@ static inline void bch2_journal_res_put(struct journal *j, while (res->u64s) bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, - 0, 0, NULL, 0); + 0, 0, 0); bch2_journal_buf_put(j, res->idx); @@ -293,9 +295,9 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); -#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -#define JOURNAL_RES_GET_CHECK (1 << 1) -#define JOURNAL_RES_GET_RESERVED (1 << 2) +/* First two bits for JOURNAL_WATERMARK: */ +#define JOURNAL_RES_GET_NONBLOCK (1 << 2) +#define JOURNAL_RES_GET_CHECK (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -303,24 +305,34 @@ static inline int journal_res_get_fast(struct journal *j, { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); + unsigned u64s, offset; do { old.v = new.v = v; + /* + * Round up the end of the journal reservation to the next + * cacheline boundary: + */ + u64s = res->u64s; + offset = sizeof(struct jset) / sizeof(u64) + + new.cur_entry_offset + u64s; + u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1; + + /* * Check if there is still 
room in the current journal * entry: */ - if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) + if (new.cur_entry_offset + u64s > j->cur_entry_u64s) return 0; EBUG_ON(!journal_state_count(new, new.idx)); - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) return 0; - new.cur_entry_offset += res->u64s; + new.cur_entry_offset += u64s; journal_state_inc(&new); /* @@ -337,8 +349,15 @@ static inline int journal_res_get_fast(struct journal *j, res->ref = true; res->idx = old.idx; + res->u64s = u64s; res->offset = old.cur_entry_offset; res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + + offset = res->offset; + while (offset < res->offset + res->u64s) { + prefetchw(vstruct_idx(j->buf[res->idx].data, offset)); + offset += SMP_CACHE_BYTES / sizeof(u64); + } return 1; } @@ -370,23 +389,27 @@ out: /* journal_preres: */ -static inline bool journal_check_may_get_unreserved(struct journal *j) +static inline void journal_set_watermark(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved < s.remaining && - fifo_free(&j->pin) > 8; - - lockdep_assert_held(&j->lock); - - if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - if (ret) { - set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - journal_wake(j); - } else { - clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - } - } - return ret; + unsigned watermark = JOURNAL_WATERMARK_any; + + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (fifo_free(&j->pin) < j->pin.size / 8) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (s.reserved > s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (!s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (watermark == j->watermark) + return; + + swap(watermark, 
j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static inline void bch2_journal_preres_put(struct journal *j, @@ -406,12 +429,8 @@ static inline void bch2_journal_preres_put(struct journal *j, closure_wake_up(&j->preres_wait); } - if (s.reserved <= s.remaining && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - spin_lock(&j->lock); - journal_check_may_get_unreserved(j); - spin_unlock(&j->lock); - } + if (s.reserved <= s.remaining && j->watermark) + journal_set_watermark(j); } int __bch2_journal_preres_get(struct journal *, @@ -432,8 +451,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, old.v = new.v = v; ret = 0; - if ((flags & JOURNAL_RES_GET_RESERVED) || - test_bit(JOURNAL_NOCHANGES, &j->flags) || + if ((flags & JOURNAL_WATERMARK_reserved) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; @@ -479,6 +497,7 @@ int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); +int bch2_journal_log_msg(struct journal *, const char *, ...); void bch2_journal_halt(struct journal *); @@ -502,6 +521,7 @@ void bch2_journal_block(struct journal *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); @@ -510,7 +530,7 @@ int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64, struct list_head *); +int bch2_fs_journal_start(struct journal *, u64); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct 
bch_dev *, struct bch_sb *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index b5c204e..c4922c6 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" @@ -16,12 +17,39 @@ #include -static void __journal_replay_free(struct journal_replay *i) +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +static bool jset_csum_good(struct bch_fs *c, struct jset *j) +{ + return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && + !bch2_crc_cmp(j->csum, + csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); +} + +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) { - list_del(&i->list); + return (seq - c->journal_entries_base_seq) & (~0U >> 1); +} + +static void __journal_replay_free(struct bch_fs *c, + struct journal_replay *i) +{ + struct journal_replay **p = + genradix_ptr(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); + + BUG_ON(*p != i); + *p = NULL; kvpfree(i, offsetof(struct journal_replay, j) + vstruct_bytes(&i->j)); - } static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) @@ -29,13 +57,13 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) i->ignore = true; if (!c->opts.read_entire_journal) - __journal_replay_free(i); + __journal_replay_free(c, i); } struct journal_list { struct closure cl; + u64 last_seq; struct mutex lock; - struct list_head *head; int ret; }; @@ -47,94 +75,105 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct bch_extent_ptr entry_ptr, - struct journal_list *jlist, struct jset *j, - bool bad) + struct 
journal_ptr entry_ptr, + struct journal_list *jlist, struct jset *j) { - struct journal_replay *i, *pos, *dup = NULL; - struct bch_extent_ptr *ptr; - struct list_head *where; + struct genradix_iter iter; + struct journal_replay **_i, *i, *dup; + struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); - u64 last_seq = 0; + u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; int ret = JOURNAL_ENTRY_ADD_OK; - list_for_each_entry_reverse(i, jlist->head, list) { - if (!JSET_NO_FLUSH(&i->j)) { - last_seq = le64_to_cpu(i->j.last_seq); - break; - } - } - /* Is this entry older than the range we need? */ if (!c->opts.read_entire_journal && - le64_to_cpu(j->seq) < last_seq) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; - } + le64_to_cpu(j->seq) < jlist->last_seq) + return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + + /* + * genradixes are indexed by a ulong, not a u64, so we can't index them + * by sequence number directly: Assume instead that they will all fall + * within the range of +-2billion of the first one we find. + */ + if (!c->journal_entries_base_seq) + c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); /* Drop entries we don't need anymore */ - if (!JSET_NO_FLUSH(j)) { - list_for_each_entry_safe(i, pos, jlist->head, list) { - if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { + genradix_for_each_from(&c->journal_entries, iter, _i, + journal_entry_radix_idx(c, jlist->last_seq)) { + i = *_i; + + if (!i || i->ignore) + continue; + + if (le64_to_cpu(i->j.seq) >= last_seq) break; journal_replay_free(c, i); } } - list_for_each_entry_reverse(i, jlist->head, list) { - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } - } - - where = jlist->head; -add: - dup = where->next != jlist->head - ? 
container_of(where->next, struct journal_replay, list) - : NULL; + jlist->last_seq = max(jlist->last_seq, last_seq); - if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) - dup = NULL; + _i = genradix_ptr_alloc(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(j->seq)), + GFP_KERNEL); + if (!_i) + return -ENOMEM; /* * Duplicate journal entries? If so we want the one that didn't have a * checksum error: */ + dup = *_i; if (dup) { - if (dup->bad) { - /* we'll replace @dup: */ - } else if (bad) { + if (bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes)) { i = dup; goto found; - } else { - fsck_err_on(bytes != vstruct_bytes(&dup->j) || - memcmp(j, &dup->j, bytes), c, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); + } + + if (!entry_ptr.csum_good) { i = dup; goto found; } - } - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) { - ret = -ENOMEM; - goto out; + if (!dup->csum_good) + goto replace; + + fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + i = dup; + goto found; } +replace: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; - i->nr_ptrs = 0; - i->bad = bad; + i->nr_ptrs = 0; + i->csum_good = entry_ptr.csum_good; i->ignore = false; memcpy(&i->j, j, bytes); + i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - i->nr_ptrs = dup->nr_ptrs; - memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); - __journal_replay_free(dup); + if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; + } + + /* The first ptr should represent the jset we kept: */ + memcpy(i->ptrs + i->nr_ptrs, + dup->ptrs, + sizeof(dup->ptrs[0]) * dup->nr_ptrs); + i->nr_ptrs += dup->nr_ptrs; + __journal_replay_free(c, dup); } - list_add(&i->list, where); + *_i = i; + return 0; found: for (ptr 
= i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { if (ptr->dev == ca->dev_idx) { @@ -156,16 +195,6 @@ fsck_err: return ret; } -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - /* this fills in a range with empty jset_entries: */ static void journal_entry_null_range(void *start, void *end) { @@ -179,66 +208,84 @@ static void journal_entry_null_range(void *start, void *end) #define JOURNAL_ENTRY_NONE 6 #define JOURNAL_ENTRY_BAD 7 -#define journal_entry_err(c, msg, ...) \ +static void journal_entry_err_msg(struct printbuf *out, + struct jset *jset, + struct jset_entry *entry) +{ + prt_str(out, "invalid journal entry "); + if (entry) + prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); + + if (!jset) + prt_printf(out, "in superblock"); + else if (!entry) + prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); + else + prt_printf(out, "at offset %zi/%u seq %llu", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); + prt_str(out, ": "); +} + +#define journal_entry_err(c, jset, entry, msg, ...) \ ({ \ + struct printbuf buf = PRINTBUF; \ + \ + journal_entry_err_msg(&buf, jset, entry); \ + prt_printf(&buf, msg, ##__VA_ARGS__); \ + \ switch (write) { \ case READ: \ - mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ + mustfix_fsck_err(c, "%s", buf.buf); \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write:\n" \ - msg, ##__VA_ARGS__); \ + bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ if (bch2_fs_inconsistent(c)) { \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ } \ + \ + printbuf_exit(&buf); \ true; \ }) -#define journal_entry_err_on(cond, c, msg, ...) \ - ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ + ((cond) ? 
journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 -static int journal_validate_key(struct bch_fs *c, const char *where, +static int journal_validate_key(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned level, enum btree_id btree_id, - struct bkey_i *k, const char *type, + struct bkey_i *k, unsigned version, int big_endian, int write) { void *next = vstruct_next(entry); - const char *invalid; + struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in %s entry offset %zi/%u: k->u64s 0", - type, where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s))) { + if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } if (journal_entry_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), c, - "invalid %s in %s entry offset %zi/%u: extends past end of journal entry", - type, where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s))) { + (void *) vstruct_next(entry), + c, jset, entry, + "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } - if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in %s entry offset %zi/%u: bad format %u", - type, where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s), - k->k.format)) { + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, + c, jset, entry, + "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); @@ -249,21 +296,29 @@ static int journal_validate_key(struct bch_fs *c, const char *where, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, 
bkey_to_packed(k)); - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id)); - if (invalid) { - char buf[160]; + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", + bch2_jset_entry_types[entry->type], + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + prt_newline(&buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", - type, where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s), - invalid, buf); + mustfix_fsck_err(c, "%s", buf.buf); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); + + printbuf_exit(&buf); return FSCK_DELETED_KEY; } @@ -271,21 +326,22 @@ static int journal_validate_key(struct bch_fs *c, const char *where, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: + printbuf_exit(&buf); return ret; } static int journal_entry_btree_keys_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { struct bkey_i *k = entry->start; while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, where, entry, + int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, - k, "key", version, big_endian, write); + k, version, big_endian, write); if (ret == FSCK_DELETED_KEY) continue; @@ -303,17 +359,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs vstruct_for_each(entry, k) { if 
(!first) { - printbuf_newline(out); - pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_newline(out); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); } - pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } } static int journal_entry_btree_root_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -321,7 +377,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, c, + le16_to_cpu(entry->u64s) != k->k.u64s, + c, jset, entry, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -334,8 +391,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - return journal_validate_key(c, where, entry, 1, entry->btree_id, k, - "btree root", version, big_endian, write); + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, + version, big_endian, write); fsck_err: return ret; } @@ -347,7 +404,7 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -361,13 +418,14 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { int ret = 0; - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, + c, jset, entry, "invalid journal seq 
blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -381,18 +439,19 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs struct jset_entry_blacklist *bl = container_of(entry, struct jset_entry_blacklist, entry); - pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); + prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); } static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, + c, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -401,7 +460,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > - le64_to_cpu(bl_entry->end), c, + le64_to_cpu(bl_entry->end), + c, jset, entry, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -416,13 +476,13 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ struct jset_entry_blacklist_v2 *bl = container_of(entry, struct jset_entry_blacklist_v2, entry); - pr_buf(out, "start=%llu end=%llu", + prt_printf(out, "start=%llu end=%llu", le64_to_cpu(bl->start), le64_to_cpu(bl->end)); } static int journal_entry_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -432,7 +492,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u), - c, + c, jset, entry, "invalid journal entry usage: bad size")) { 
journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -448,13 +508,13 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - pr_buf(out, "type=%s v=%llu", + prt_printf(out, "type=%s v=%llu", bch2_fs_usage_types[u->entry.btree_id], le64_to_cpu(u->v)); } static int journal_entry_data_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -465,7 +525,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, - c, + c, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -482,11 +542,11 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_data_usage, entry); bch2_replicas_entry_to_text(out, &u->r); - pr_buf(out, "=%llu", le64_to_cpu(u->v)); + prt_printf(out, "=%llu", le64_to_cpu(u->v)); } static int journal_entry_clock_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -496,13 +556,13 @@ static int journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, "invalid journal entry clock: bad size")) { + c, jset, entry, "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, "invalid journal entry clock: bad rw")) { + c, jset, entry, "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -517,11 +577,11 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - 
pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); + prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); } static int journal_entry_dev_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -533,7 +593,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, "invalid journal entry dev usage: bad size (%u < %u)", + c, jset, entry, "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -542,13 +602,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, "invalid journal entry dev usage: bad dev")) { + c, jset, entry, "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, "invalid journal entry dev usage: bad pad")) { + c, jset, entry, "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -564,26 +624,24 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_dev_usage, entry); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { if (i < BCH_DATA_NR) - pr_buf(out, " %s", bch2_data_types[i]); + prt_printf(out, " %s", bch2_data_types[i]); else - pr_buf(out, " (unknown data type %u)", i); - pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + prt_printf(out, " (unknown data type %u)", i); + prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } - pr_buf(out, " buckets_ec: %llu buckets_unavailable: 
%llu", - le64_to_cpu(u->buckets_ec), - le64_to_cpu(u->buckets_unavailable)); + prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); } static int journal_entry_log_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -596,11 +654,25 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - bch_scnmemcpy(out, l->d, strnlen(l->d, bytes)); + prt_printf(out, "%.*s", bytes, l->d); +} + +static int journal_entry_overwrite_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write); +} + +static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); } struct jset_entry_ops { - int (*validate)(struct bch_fs *, const char *, + int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -615,12 +687,13 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { #undef x }; -int bch2_journal_entry_validate(struct bch_fs *c, const char *where, +int bch2_journal_entry_validate(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, where, entry, + ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, version, big_endian, write) : 0; } @@ -629,34 +702,28 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { if (entry->type < BCH_JSET_ENTRY_NR) { - pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); } else { - pr_buf(out, "(unknown type %u)", entry->type); + prt_printf(out, "(unknown type %u)", entry->type); } } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { - char buf[100]; struct jset_entry *entry; int ret = 0; vstruct_for_each(jset, entry) { - scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", - le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s)); - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, + vstruct_last(jset), c, jset, entry, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } - ret = bch2_journal_entry_validate(c, buf, entry, + ret = bch2_journal_entry_validate(c, jset, entry, le32_to_cpu(jset->version), JSET_BIG_ENDIAN(jset), write); if (ret) @@ -669,12 +736,8 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read, int write) { - size_t bytes = vstruct_bytes(jset); - struct bch_csum csum; unsigned version; int ret = 0; @@ -684,70 +747,80 @@ static int jset_validate(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, c, + version >= bcachefs_metadata_version_max, + c, jset, NULL, "%s sector %llu seq %llu: unknown journal entry version %u", ca ? 
ca->name : c->name, sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ - return EINVAL; + return -EINVAL; } - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, - "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), bytes)) { - ret = JOURNAL_ENTRY_BAD; - le32_add_cpu(&jset->u64s, - -((bytes - (bucket_sectors_left << 9)) / 8)); - } - - if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), + c, jset, NULL, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) { + JSET_CSUM_TYPE(jset))) ret = JOURNAL_ENTRY_BAD; - goto csum_done; - } - if (write) - goto csum_done; - - csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, - "%s sector %llu seq %llu: journal checksum bad", - ca ? 
ca->name : c->name, - sector, le64_to_cpu(jset->seq))) - ret = JOURNAL_ENTRY_BAD; - - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); -csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && - le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), + c, jset, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } + + ret = jset_validate_entries(c, jset, write); fsck_err: return ret; } -static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) +static int jset_validate_early(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read) { - unsigned sectors = vstruct_sectors(jset, c->block_bits); + size_t bytes = vstruct_bytes(jset); + unsigned version; + int write = READ; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; - return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: - jset_validate_entries(c, jset, WRITE); + version = le32_to_cpu(jset->version); + if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + c, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + version)) { + /* don't try to continue: */ + return -EINVAL; + } + + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, jset, NULL, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); +fsck_err: + return ret; } struct journal_read_buf { @@ -786,7 +859,7 @@ static int journal_read_bucket(struct bch_dev *ca, unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; - bool saw_bad = false; + bool saw_bad = false, csum_good; int ret = 0; pr_debug("reading %u", bucket); @@ -794,20 +867,20 @@ static int journal_read_bucket(struct bch_dev *ca, while (offset < end) { if (!sectors_read) { struct bio *bio; + unsigned nr_bvecs; reread: sectors_read = min_t(unsigned, end - offset, buf->size >> 9); + nr_bvecs = buf_pages(buf->data, sectors_read << 9); - bio = bio_kmalloc(GFP_KERNEL, - buf_pages(buf->data, - sectors_read << 9)); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + + bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); ret = submit_bio_wait(bio); - bio_put(bio); + kfree(bio); if (bch2_dev_io_err_on(ret, ca, "journal read error: sector %llu", @@ -825,11 +898,10 @@ reread: j = buf->data; } - ret = jset_validate(c, ca, j, offset, - end - offset, sectors_read, - READ); + ret = jset_validate_early(c, ca, j, offset, + end - offset, sectors_read); switch (ret) { - case BCH_FSCK_OK: + case 0: sectors = vstruct_sectors(j, c->block_bits); break; case JOURNAL_ENTRY_REREAD: @@ -843,17 +915,13 @@ reread: case JOURNAL_ENTRY_NONE: if (!saw_bad) return 0; - sectors = block_sectors(c); - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading * again at next block boundary: */ sectors = block_sectors(c); - break; + goto next_block; 
default: return ret; } @@ -869,11 +937,25 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + csum_good = jset_csum_good(c, j); + if (!csum_good) + saw_bad = true; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); + mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { - .dev = ca->dev_idx, - .offset = offset, - }, jlist, j, ret != 0); + ret = journal_entry_add(c, ca, (struct journal_ptr) { + .csum_good = csum_good, + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, + }, jlist, j); mutex_unlock(&jlist->lock); switch (ret) { @@ -902,6 +984,8 @@ static void bch2_journal_read_device(struct closure *cl) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); + struct journal_replay *r, **_r; + struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; u64 min_seq = U64_MAX; unsigned i; @@ -937,11 +1021,42 @@ static void bch2_journal_read_device(struct closure *cl) * allocate */ while (ja->bucket_seq[ja->cur_idx] > min_seq && - ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[ja->cur_idx] == ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = 0; + ja->sectors_free = ca->mi.bucket_size; + + mutex_lock(&jlist->lock); + genradix_for_each(&c->journal_entries, iter, _r) { + r = *_r; + + if (!r) + continue; + + for (i = 0; i < r->nr_ptrs; i++) { + if (r->ptrs[i].dev == ca->dev_idx && + sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + vstruct_sectors(&r->j, c->block_bits); + + ja->sectors_free = min(ja->sectors_free, + ca->mi.bucket_size - wrote); + } + } + } + mutex_unlock(&jlist->lock); + + if 
(ja->bucket_seq[ja->cur_idx] && + ja->sectors_free == ca->mi.bucket_size) { + bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } + ja->sectors_free = 0; + } /* * Set dirty_idx to indicate the entire journal is full and needs to be @@ -963,8 +1078,8 @@ err: goto out; } -static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) { unsigned i; @@ -972,23 +1087,26 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); u64 offset; - div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); + div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); if (i) - pr_buf(out, " "); - pr_buf(out, "%u:%llu (offset %llu)", + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", j->ptrs[i].dev, - (u64) j->ptrs[i].offset, offset); + j->ptrs[i].bucket, + j->ptrs[i].bucket_offset, + j->ptrs[i].sector); } } -int bch2_journal_read(struct bch_fs *c, struct list_head *list, - u64 *blacklist_seq, u64 *start_seq) +int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, *t; + struct journal_replay *i, **_i, *prev = NULL; + struct genradix_iter radix_iter; struct bch_dev *ca; unsigned iter; + struct printbuf buf = PRINTBUF; size_t keys = 0, entries = 0; bool degraded = false; u64 seq, last_seq = 0; @@ -996,11 +1114,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, closure_init_stack(&jlist.cl); mutex_init(&jlist.lock); - jlist.head = list; + jlist.last_seq = 0; jlist.ret = 0; for_each_member_device(ca, c, iter) { - if 
(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + if (!c->opts.fsck && !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; @@ -1020,23 +1138,30 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (jlist.ret) return jlist.ret; - if (list_empty(list)) { - bch_info(c, "journal read done, but no entries found"); - return 0; - } - - i = list_last_entry(list, struct journal_replay, list); - *start_seq = le64_to_cpu(i->j.seq) + 1; + *start_seq = 0; /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: */ - list_for_each_entry_safe_reverse(i, t, list, list) { - if (i->ignore) + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; + if (!*start_seq) + *start_seq = le64_to_cpu(i->j.seq) + 1; + if (!JSET_NO_FLUSH(&i->j)) { + int write = READ; + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + last_seq = le64_to_cpu(i->j.last_seq); *blacklist_seq = le64_to_cpu(i->j.seq) + 1; break; @@ -1045,14 +1170,22 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, journal_replay_free(c, i); } + if (!*start_seq) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + if (!last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); - return -1; + ret = -1; + goto err; } /* Drop blacklisted entries and entries older than last_seq: */ - list_for_each_entry_safe(i, t, list, list) { - if (i->ignore) + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; seq = le64_to_cpu(i->j.seq); @@ -1071,15 +1204,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, /* Check for missing entries: */ seq = last_seq; - list_for_each_entry(i, 
list, list) { - if (i->ignore) + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; BUG_ON(seq > le64_to_cpu(i->j.seq)); while (seq < le64_to_cpu(i->j.seq)) { u64 missing_start, missing_end; - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; while (seq < le64_to_cpu(i->j.seq) && bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -1094,15 +1229,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, !bch2_journal_seq_is_blacklisted(c, seq, false)) seq++; - if (i->list.prev != list) { - struct printbuf out = PBUF(buf1); - struct journal_replay *p = list_prev_entry(i, list); - - bch2_journal_ptrs_to_text(&out, c, p); - pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + if (prev) { + bch2_journal_ptrs_to_text(&buf1, c, prev); + prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); } else - sprintf(buf1, "(none)"); - bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + prt_printf(&buf1, "(none)"); + bch2_journal_ptrs_to_text(&buf2, c, i); missing_end = seq - 1; fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" @@ -1110,13 +1242,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, " next at %s", missing_start, missing_end, last_seq, *blacklist_seq - 1, - buf1, buf2); + buf1.buf, buf2.buf); + + printbuf_exit(&buf1); + printbuf_exit(&buf2); } + prev = i; seq++; } - list_for_each_entry(i, list, list) { + genradix_for_each(&c->journal_entries, radix_iter, _i) { struct jset_entry *entry; struct bkey_i *k, *_n; struct bch_replicas_padded replicas = { @@ -1124,14 +1260,28 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, .e.nr_required = 1, }; unsigned ptr; - char buf[80]; - if (i->ignore) + i = *_i; + if (!i || i->ignore) continue; - ret = jset_validate_entries(c, &i->j, READ); + for (ptr = 0; ptr < i->nr_ptrs; ptr++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + + if (!i->ptrs[ptr].csum_good) + printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n", + ca->name, i->ptrs[ptr].sector, + le64_to_cpu(i->j.seq), + i->csum_good ? 
" (had good copy on another device)" : ""); + } + + ret = jset_validate(c, + bch_dev_bkey_exists(c, i->ptrs[0].dev), + &i->j, + i->ptrs[0].sector, + READ); if (ret) - goto fsck_err; + goto err; for (ptr = 0; ptr < i->nr_ptrs; ptr++) replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; @@ -1143,15 +1293,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, * the devices - this is wrong: */ + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + if (!degraded && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, - "superblock not marked as containing replicas %s", - (bch2_replicas_entry_to_text(&PBUF(buf), - &replicas.e), buf)))) { + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", + buf.buf)) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) - return ret; + goto err; } for_each_jset_key(k, _n, entry, &i->j) @@ -1165,7 +1316,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (*start_seq != *blacklist_seq) bch_info(c, "dropped unflushed entries %llu-%llu", *blacklist_seq, *start_seq - 1); +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1292,49 +1445,6 @@ done: return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; } -static void journal_write_compact(struct jset *jset) -{ - struct jset_entry *i, *next, *prev = NULL; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. - * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each_safe(jset, i, next) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* Can we merge with previous entry? 
*/ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); - } - - prev = prev ? vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { /* we aren't holding j->lock: */ @@ -1360,7 +1470,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) { - return j->buf + j->reservations.unwritten_idx; + return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); } static void journal_write_done(struct closure *cl) @@ -1397,15 +1507,18 @@ static void journal_write_done(struct closure *cl) journal_seq_pin(j, seq)->devs = w->devs_written; if (!err) { - j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; + + bch2_do_discards(c); + closure_wake_up(&c->freelist_wait); } } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; + j->seq_ondisk = seq; + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: @@ -1413,7 +1526,8 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - journal_reclaim_kick(&c->journal); + if (j->watermark) + journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1421,7 +1535,7 @@ static void journal_write_done(struct closure *cl) v = 
atomic64_read(&j->reservations.counter); do { old.v = new.v = v; - BUG_ON(new.idx == new.unwritten_idx); + BUG_ON(journal_state_count(new, new.unwritten_idx)); new.unwritten_idx++; } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -1432,13 +1546,24 @@ static void journal_write_done(struct closure *cl) closure_wake_up(&w->wait); journal_wake(j); - if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(c->io_complete_wq, &j->write_work, 0); - spin_unlock(&j->lock); - - if (new.unwritten_idx != new.idx && - !journal_state_count(new, new.unwritten_idx)) + if (!journal_state_count(new, new.unwritten_idx) && + journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + + /* + * We don't close a journal entry to write it while there's + * previous entries still in flight - the current journal entry + * might want to be written now: + */ + + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) @@ -1483,12 +1608,10 @@ static void do_journal_write(struct closure *cl) sectors); bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; - bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ca->prev_journal_sector = bio->bi_iter.bi_sector; @@ -1500,7 +1623,7 @@ static void do_journal_write(struct closure *cl) bch2_bio_map(bio, w->data, sectors << 9); - trace_journal_write(bio); + trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); 
ca->journal.bucket_seq[ca->journal.cur_idx] = @@ -1520,7 +1643,7 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - char *journal_debug_buf = NULL; + struct printbuf journal_debug_buf = PRINTBUF; bool validate_before_checksum = false; unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; @@ -1533,11 +1656,11 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); spin_lock(&j->lock); - if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && - (w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { + if (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; @@ -1574,10 +1697,8 @@ void bch2_journal_write(struct closure *cl) le32_add_cpu(&jset->u64s, u64s); BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); - journal_write_compact(jset); - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); @@ -1594,18 +1715,21 @@ void bch2_journal_write(struct closure *cl) validate_before_checksum = true; if (validate_before_checksum && - jset_validate_for_write(c, jset)) + jset_validate(c, NULL, jset, 0, WRITE)) goto err; - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret)) + goto err; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate_for_write(c, jset)) + jset_validate(c, NULL, jset, 0, WRITE)) goto err; sectors = vstruct_sectors(jset, c->block_bits); @@ -1624,11 +1748,8 @@ retry_alloc: goto retry_alloc; } - if (ret) { - journal_debug_buf = kmalloc(4096, GFP_ATOMIC); - if (journal_debug_buf) - __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); - } + if (ret) + __bch2_journal_debug_to_text(&journal_debug_buf, j); /* * write is allocated, no longer need to account for it in @@ -1645,8 +1766,8 @@ retry_alloc: if (ret) { bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf); - kfree(journal_debug_buf); + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); return; @@ -1654,7 +1775,7 @@ retry_alloc: w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (test_bit(JOURNAL_NOCHANGES, &j->flags)) + if (c->opts.nochanges) goto no_io; for_each_rw_member(ca, c, i) @@ -1668,9 +1789,7 @@ retry_alloc: percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_FLUSH; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); diff --git 
a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index d8425fe..2f8bbf0 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -7,12 +7,16 @@ * during cache_registration */ struct journal_replay { - struct list_head list; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; + } ptrs[BCH_REPLICAS_MAX]; unsigned nr_ptrs; - /* checksum error, but we may want to try using it anyways: */ - bool bad; + bool csum_good; bool ignore; /* must be last: */ struct jset j; @@ -40,12 +44,15 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_entry_validate(struct bch_fs *, const char *, +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); -int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); +void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct journal_replay *); + +int bch2_journal_read(struct bch_fs *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 52a3935..e873ce2 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "errcode.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -34,10 +35,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) - ? 
((journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr) - : ja->nr; + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; /* * Don't use the last bucket unless writing the new last_seq @@ -61,25 +60,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) old.v, new.v)) != old.v); } -static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) -{ - unsigned sectors = 0; - - while (!sectors && *idx != j->reservations.idx) { - sectors = j->buf[*idx].sectors; - - *idx = (*idx + 1) & JOURNAL_BUF_MASK; - } - - return sectors; -} - static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { struct journal_device *ja = &ca->journal; - unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; + unsigned sectors, buckets, unwritten; + u64 seq; if (from == journal_space_total) return (struct journal_space) { @@ -94,7 +81,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, * We that we don't allocate the space for a journal entry * until we write it out - thus, account for it here: */ - while ((unwritten = get_unwritten_sectors(j, &idx))) { + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; + + if (!unwritten) + continue; + /* entry won't fit on this device, skip: */ if (unwritten > ca->mi.bucket_size) continue; @@ -202,7 +196,7 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = cur_entry_insufficient_devices; + ret = JOURNAL_ERR_insufficient_devices; goto out; } @@ -216,28 +210,29 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!clean_ondisk && - j->reservations.idx == - j->reservations.unwritten_idx) { - char *buf = kmalloc(4096, 
GFP_ATOMIC); - - bch_err(c, "journal stuck"); - if (buf) { - __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); - pr_err("\n%s", buf); - kfree(buf); - } + journal_cur_seq(j) == j->seq_ondisk) { + struct printbuf buf = PRINTBUF; + __bch2_journal_debug_to_text(&buf, j); + bch_err(c, "journal stuck\n%s", buf.buf); + printbuf_exit(&buf); + + /* + * Hack: bch2_fatal_error() calls bch2_journal_halt() which + * takes journal lock: + */ + spin_unlock(&j->lock); bch2_fatal_error(c); - ret = cur_entry_journal_stuck; + spin_lock(&j->lock); + + ret = JOURNAL_ERR_journal_stuck; } else if (!j->space[journal_space_discarded].next_entry) - ret = cur_entry_journal_full; - else if (!fifo_free(&j->pin)) - ret = cur_entry_journal_pin_full; + ret = JOURNAL_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && (clean - clean_ondisk <= total / 8) && - (clean_ondisk * 2 > clean )) + (clean_ondisk * 2 > clean)) set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); @@ -251,7 +246,7 @@ out: j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); - journal_check_may_get_unreserved(j); + journal_set_watermark(j); if (!ret) journal_wake(j); @@ -286,12 +281,13 @@ void bch2_journal_do_discards(struct journal *j) struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + if (!c->opts.nochanges && + ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, ja->buckets[ja->discard_idx]), - ca->mi.bucket_size, GFP_NOIO, 0); + ca->mi.bucket_size, GFP_NOIO); spin_lock(&j->lock); ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -367,15 +363,12 @@ static inline void __journal_pin_drop(struct journal *j, list_del_init(&pin->list); /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if + * Unpinning a journal entry may make journal_next_bucket() succeed if * writing a new last_seq will now make another bucket available: */ if (atomic_dec_and_test(&pin_list->count) && pin_list == &fifo_peek_front(&j->pin)) bch2_journal_reclaim_fast(j); - else if (fifo_used(&j->pin) == 1 && - atomic_read(&pin_list->count) == 1) - journal_wake(j); } void bch2_journal_pin_drop(struct journal *j, @@ -597,7 +590,7 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. 
*/ -static int __bch2_journal_reclaim(struct journal *j, bool direct) +static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; @@ -646,8 +639,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) min_nr = 1; - trace_journal_reclaim_start(c, - min_nr, + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); + + trace_and_count(c, journal_reclaim_start, c, + direct, kicked, + min_nr, min_key_cache, j->prereserved.reserved, j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), @@ -655,8 +651,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr, min_key_cache); @@ -664,11 +658,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) j->nr_direct_reclaim += nr_flushed; else j->nr_background_reclaim += nr_flushed; - trace_journal_reclaim_finish(c, nr_flushed); + trace_and_count(c, journal_reclaim_finish, c, nr_flushed); if (nr_flushed) wake_up(&j->reclaim_wait); - } while ((min_nr || min_key_cache) && !direct); + } while ((min_nr || min_key_cache) && nr_flushed && !direct); memalloc_noreclaim_restore(flags); @@ -677,7 +671,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) int bch2_journal_reclaim(struct journal *j) { - return __bch2_journal_reclaim(j, true); + return __bch2_journal_reclaim(j, true, true); } static int bch2_journal_reclaim_thread(void *arg) @@ -685,6 +679,7 @@ static int bch2_journal_reclaim_thread(void *arg) struct journal *j = arg; struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned long delay, now; + bool journal_empty; int 
ret = 0; set_freezable(); @@ -692,10 +687,12 @@ static int bch2_journal_reclaim_thread(void *arg) j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { + bool kicked = j->reclaim_kicked; + j->reclaim_kicked = false; mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false); + ret = __bch2_journal_reclaim(j, false, kicked); mutex_unlock(&j->reclaim_lock); now = jiffies; @@ -711,10 +708,17 @@ static int bch2_journal_reclaim_thread(void *arg) break; if (j->reclaim_kicked) break; - if (time_after_eq(jiffies, j->next_reclaim)) - break; - freezable_schedule_timeout(j->next_reclaim - jiffies); + spin_lock(&j->lock); + journal_empty = fifo_empty(&j->pin); + spin_unlock(&j->lock); + + if (journal_empty) + freezable_schedule(); + else if (time_after(j->next_reclaim, jiffies)) + freezable_schedule_timeout(j->next_reclaim - jiffies); + else + break; } __set_current_state(TASK_RUNNING); } @@ -738,15 +742,17 @@ int bch2_journal_reclaim_start(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct task_struct *p; + int ret; if (j->reclaim_thread) return 0; p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); - if (IS_ERR(p)) { - bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); - return PTR_ERR(p); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); + return ret; } get_task_struct(p); @@ -766,7 +772,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; + if (journal_flush_pins(j, seq_to_flush, 0, 0)) + *did_work = true; spin_lock(&j->lock); /* @@ -775,8 +782,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, */ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || - (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1); + 
!fifo_used(&j->pin); spin_unlock(&j->lock); mutex_unlock(&j->reclaim_lock); @@ -824,10 +830,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); - while (!ret && seq < j->pin.back) { + while (!ret) { struct bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); + if (seq >= j->pin.back) + break; bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, journal_seq_pin(j, seq)->devs); seq++; diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c new file mode 100644 index 0000000..c19db04 --- /dev/null +++ b/libbcachefs/journal_sb.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal_sb.h" +#include "darray.h" + +#include + +/* BCH_SB_FIELD_journal: */ + +static int u64_cmp(const void *_l, const void *_r) +{ + const u64 *l = _l; + const u64 *r = _r; + + return cmp_int(*l, *r); +} + +static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + u64 *b; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return 0; + + b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + if (!b[0]) { + prt_printf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0] < le16_to_cpu(m->first_bucket)) { + prt_printf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) { + prt_printf(err, "duplicate 
journal buckets %llu", b[i]); + goto err; + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + prt_printf(out, "Buckets: "); + for (i = 0; i < nr; i++) + prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, +}; + +struct u64_range { + u64 start; + u64 end; +}; + +static int u64_range_cmp(const void *_l, const void *_r) +{ + const struct u64_range *l = _l; + const struct u64_range *r = _r; + + return cmp_int(l->start, r->start); +} + +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + struct u64_range *b; + + nr = bch2_sb_field_journal_v2_nr_entries(journal); + if (!nr) + return 0; + + b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + b[i].start = le64_to_cpu(journal->d[i].start); + b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + } + + sort(b, nr, sizeof(*b), u64_range_cmp, NULL); + + if (!b[0].start) { + prt_printf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0].start < le16_to_cpu(m->first_bucket)) { + prt_printf(err, "journal bucket %llu before first bucket %u", + b[0].start, le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 
< nr; i++) { + if (b[i].end > b[i + 1].start) { + prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", + b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); + goto err; + } + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); + + prt_printf(out, "Buckets: "); + for (i = 0; i < nr; i++) + prt_printf(out, " %llu-%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { + .validate = bch2_sb_journal_v2_validate, + .to_text = bch2_sb_journal_v2_to_text, +}; + +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal_v2 *j; + unsigned i, dst = 0, nr = 1; + + if (c) + lockdep_assert_held(&c->sb_lock); + + if (!ja->nr) { + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); + return 0; + } + + for (i = 0; i + 1 < ja->nr; i++) + if (ja->buckets[i] + 1 != ja->buckets[i + 1]) + nr++; + + j = bch2_sb_resize_journal_v2(&ca->disk_sb, + (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); + if (!j) + return -BCH_ERR_ENOSPC_sb_journal; + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + + j->d[dst].start = le64_to_cpu(ja->buckets[0]); + j->d[dst].nr = le64_to_cpu(1); + + for (i = 1; i < ja->nr; i++) { + if (ja->buckets[i] == ja->buckets[i - 1] + 1) { + le64_add_cpu(&j->d[dst].nr, 1); + } else { + dst++; + j->d[dst].start = le64_to_cpu(ja->buckets[i]); + j->d[dst].nr = le64_to_cpu(1); + } + } + + BUG_ON(dst + 1 != nr); + + return 0; +} diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h new 
file mode 100644 index 0000000..a39192e --- /dev/null +++ b/libbcachefs/journal_sb.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "super-io.h" +#include "vstructs.h" + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? (__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) +{ + if (!j) + return 0; + + return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal; +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; + +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index 3cc63fc..5c555b3 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -201,7 +201,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, if (le64_to_cpu(e->start) >= le64_to_cpu(e->end)) { - pr_buf(err, "entry %u start >= end (%llu >= %llu)", + prt_printf(err, "entry %u start >= end (%llu >= %llu)", i, le64_to_cpu(e->start), le64_to_cpu(e->end)); return -EINVAL; } @@ -209,7 +209,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, if (i + 1 < nr && le64_to_cpu(e[0].end) > le64_to_cpu(e[1].start)) { - pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", + prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); return -EINVAL; } @@ -229,12 +229,13 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, for (i = bl->start; i < bl->start + nr; i++) { if (i != bl->start) - pr_buf(out, " "); + prt_printf(out, " "); - pr_buf(out, "%llu-%llu", + prt_printf(out, "%llu-%llu", le64_to_cpu(i->start), le64_to_cpu(i->end)); } + prt_newline(out); } const 
struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { @@ -271,7 +272,7 @@ retry: !test_bit(BCH_FS_STOPPING, &c->flags)) b = bch2_btree_iter_next_node(&iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index d6d7512..a6cdb88 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -25,6 +25,8 @@ struct journal_buf { struct closure_waitlist wait; u64 last_seq; /* copy of data->last_seq */ + long expires; + u64 flush_time; unsigned buf_size; /* size in bytes of @data */ unsigned sectors; /* maximum size for current entry */ @@ -139,19 +141,39 @@ enum journal_space_from { journal_space_nr, }; -/* - * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, - * either because something's waiting on the write to complete or because it's - * been dirty too long and the timer's expired. - */ - enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_NEED_WRITE, - JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NOCHANGES, +}; + +#define JOURNAL_WATERMARKS() \ + x(any) \ + x(copygc) \ + x(reserved) + +enum journal_watermark { +#define x(n) JOURNAL_WATERMARK_##n, + JOURNAL_WATERMARKS() +#undef x +}; + +#define JOURNAL_WATERMARK_MASK 3 + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x }; /* Embedded in struct bch_fs */ @@ -161,6 +183,7 @@ struct journal { unsigned long flags; union journal_res_state reservations; + enum journal_watermark watermark; /* Max size of current journal entry */ unsigned cur_entry_u64s; @@ -170,14 +193,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS 
if * insufficient devices: */ - enum { - cur_entry_ok, - cur_entry_blocked, - cur_entry_journal_full, - cur_entry_journal_pin_full, - cur_entry_journal_stuck, - cur_entry_insufficient_devices, - } cur_entry_error; + enum journal_errors cur_entry_error; union journal_preres_state prereserved; @@ -245,6 +261,10 @@ struct journal { spinlock_t err_lock; struct mutex reclaim_lock; + /* + * Used for waiting until journal reclaim has freed up space in the + * journal: + */ wait_queue_head_t reclaim_wait; struct task_struct *reclaim_thread; bool reclaim_kicked; @@ -264,7 +284,6 @@ struct journal { unsigned long last_flush_write; u64 res_get_blocked_start; - u64 need_write_time; u64 write_start_time; u64 nr_flush_writes; diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c index cda7783..5e85055 100644 --- a/libbcachefs/keylist.c +++ b/libbcachefs/keylist.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey.h" #include "keylist.h" int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c new file mode 100644 index 0000000..53e607d --- /dev/null +++ b/libbcachefs/lru.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" + +int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*lru)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*lru)); + return -EINVAL; + } + + return 0; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); +} + +int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, 
u64 time, + struct bkey_s_c orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 existing_idx; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!time) + return 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + POS(id, time), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_lru) { + bch2_bkey_val_to_text(&buf, trans->c, orig_k); + bch2_trans_inconsistent(trans, + "pointer to nonexistent lru %llu:%llu\n%s", + id, time, buf.buf); + ret = -EIO; + goto err; + } + + existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + if (existing_idx != idx) { + bch2_bkey_val_to_text(&buf, trans->c, orig_k); + bch2_trans_inconsistent(trans, + "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s", + id, time, existing_idx, idx, buf.buf); + ret = -EIO; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_lru *lru; + int ret = 0; + + if (!*time) + return 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, + POS(lru_id, *time), + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES, k, ret) + if (bkey_deleted(k.k)) + break; + + if (ret) + goto err; + + BUG_ON(iter.pos.inode != lru_id); + *time = iter.pos.offset; + + lru = bch2_trans_kmalloc(trans, sizeof(*lru)); + ret = PTR_ERR_OR_ZERO(lru); + if (ret) + goto err; + + bkey_lru_init(&lru->k_i); + lru->k.p = iter.pos; + lru->v.idx = cpu_to_le64(idx); + + ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, + u64 old_time, u64 *new_time, + struct bkey_s_c k) +{ + if (old_time == 
*new_time) + return 0; + + return bch2_lru_delete(trans, id, idx, old_time, k) ?: + bch2_lru_set(trans, id, idx, new_time); +} + +static int bch2_check_lru_key(struct btree_trans *trans, + struct btree_iter *lru_iter, + struct bkey_s_c lru_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct bpos alloc_pos; + int ret; + + alloc_pos = POS(lru_k.k->p.inode, + le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, + "lru key points to nonexistent device:bucket %llu:%llu", + alloc_pos.inode, alloc_pos.offset)) + return bch2_btree_delete_at(trans, lru_iter, 0); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_alloc_to_v4(k, &a); + + if (fsck_err_on(a.data_type != BCH_DATA_cached || + a.io_time[READ] != lru_k.k->p.offset, c, + "incorrect lru entry %s\n" + " for %s", + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.p = lru_iter->pos; + + ret = bch2_trans_update(trans, lru_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +int bch2_check_lrus(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_lru_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); + return ret; + +} diff --git 
a/libbcachefs/lru.h b/libbcachefs/lru.h new file mode 100644 index 0000000..3decb7b --- /dev/null +++ b/libbcachefs/lru.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H + +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_lru (struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ +} + +int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); +int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); +int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); + +int bch2_check_lrus(struct bch_fs *); + +#endif /* _BCACHEFS_LRU_H */ diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 6defc33..8b258d9 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -8,6 +8,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" +#include "errcode.h" #include "extents.h" #include "io.h" #include "journal.h" @@ -35,85 +36,76 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, - enum btree_id btree_id) +static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + int flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + if (!bch2_bkey_has_device(k, dev_idx)) + return 0; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); + if (ret) + return ret; + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or 
just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + + return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_buf sk; + enum btree_id id; int ret = 0; - bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((bch2_trans_begin(&trans), - (k = bch2_btree_iter_peek(&iter)).k) && - !(ret = bkey_err(k))) { - if (!bch2_bkey_has_device(k, dev_idx)) { - bch2_btree_iter_advance(&iter); + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) continue; - } - - bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), - dev_idx, flags, false); - if (ret) - break; - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&sk.k->k)) - sk.k->k.size = 0; - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, sk.k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(&trans, NULL, NULL, - 
BTREE_INSERT_NOFAIL); - - /* - * don't want to leave ret == -EINTR, since if we raced and - * something else overwrote the key we could spuriously return - * -EINTR below: - */ - if (ret == -EINTR) - ret = 0; + ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - BUG_ON(ret == -EINTR); return ret; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -{ - return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: - __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink); -} - static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; @@ -154,19 +146,20 @@ retry: } ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); - if (ret == -EINTR) { + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } if (ret) { - bch_err(c, "Error updating btree node key: %i", ret); + bch_err(c, "Error updating btree node key: %s", + bch2_err_str(ret)); break; } next: bch2_btree_iter_next_node(&iter); } - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter); @@ -175,16 +168,13 @@ next: goto err; } - /* flush relevant btree updates */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - + bch2_btree_interior_updates_flush(c); ret = 0; err: bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); - BUG_ON(ret == -EINTR); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 7ca7ce3..7486920 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -2,19 +2,20 @@ #include "bcachefs.h" #include 
"alloc_foreground.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" -#include "buckets.h" #include "disk_groups.h" #include "ec.h" +#include "errcode.h" +#include "error.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" #include "move.h" #include "replicas.h" -#include "subvolume.h" #include "super-io.h" #include "keylist.h" @@ -23,7 +24,19 @@ #include -#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 +static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_add(&stats->list, &c->data_progress_list); + mutex_unlock(&c->data_progress_lock); +} + +static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_del(&stats->list); + mutex_unlock(&c->data_progress_lock); +} struct moving_io { struct list_head list; @@ -35,415 +48,30 @@ struct moving_io { struct bch_read_bio rbio; - struct migrate_write write; + struct data_update write; /* Must be last since it is variable size */ struct bio_vec bi_inline_vecs[0]; }; -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ - struct closure cl; - - struct bch_move_stats *stats; - - struct list_head reads; - - /* in flight sectors: */ - atomic_t read_sectors; - atomic_t write_sectors; - - wait_queue_head_t wait; -}; - -static int insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, update_iter; - struct bkey_s_c k; - struct snapshots_seen s; - int ret; - - if (!btree_type_has_snapshots(id)) - return 0; - - snapshots_seen_init(&s); - - if (!bkey_cmp(old_pos, new_pos)) - return 0; - - if (!snapshot_t(c, old_pos.snapshot)->children[0]) - return 0; - - bch2_trans_iter_init(trans, &iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - 
BTREE_ITER_ALL_SNAPSHOTS); - while (1) { -next: - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (bkey_cmp(old_pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { - struct bkey_i *update; - size_t i; - - for (i = 0; i < s.nr; i++) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) - goto next; - - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; - - bkey_init(&update->k); - update->k.p = new_pos; - update->k.p.snapshot = k.k->p.snapshot; - - bch2_trans_iter_init(trans, &update_iter, id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &update_iter); - if (ret) - break; - - ret = snapshots_seen_add(c, &s, k.k->p.snapshot); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); - kfree(s.d); - - return ret; -} - -static int bch2_migrate_index_update(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter iter; - struct migrate_write *m = - container_of(op, struct migrate_write, op); - struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); - struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; - int ret = 0; - - bch2_bkey_buf_init(&_new); - bch2_bkey_buf_init(&_insert); - bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - bch2_trans_iter_init(&trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - while (1) { - struct bkey_s_c k; - struct bkey_i *insert; - struct bkey_i_extent *new; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bpos next_pos; - bool did_work = false; - bool 
should_check_enospc; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - - bch2_trans_begin(&trans); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - new = bkey_i_to_extent(bch2_keylist_front(keys)); - - if (bversion_cmp(k.k->version, new->k.version) || - !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) - goto nomatch; - - bkey_reassemble(_insert.k, k); - insert = _insert.k; - - bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); - new = bkey_i_to_extent(_new.k); - bch2_cut_front(iter.pos, &new->k_i); - - bch2_cut_front(iter.pos, insert); - bch2_cut_back(new->k.p, insert); - bch2_cut_back(insert->k.p, &new->k_i); - - if (m->data_cmd == DATA_REWRITE) { - struct bch_extent_ptr *new_ptr, *old_ptr = (void *) - bch2_bkey_has_device(bkey_i_to_s_c(insert), - m->data_opts.rewrite_dev); - if (!old_ptr) - goto nomatch; - - if (old_ptr->cached) - extent_for_each_ptr(extent_i_to_s(new), new_ptr) - new_ptr->cached = true; - - __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); - } - - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { - /* - * raced with another move op? extent already - * has a pointer to the device we just wrote - * data to - */ - continue; - } - - bch2_extent_ptr_decoded_append(insert, &p); - did_work = true; - } - - if (!did_work) - goto nomatch; - - bch2_bkey_narrow_crcs(insert, - (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, bkey_i_to_s(insert)); - bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); - - ret = bch2_sum_sector_overwrites(&trans, &iter, insert, - &should_check_enospc, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - goto err; - - if (disk_sectors_delta > (s64) op->res.sectors) { - ret = bch2_disk_reservation_add(c, &op->res, - disk_sectors_delta - op->res.sectors, - !should_check_enospc - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto out; - } - - next_pos = insert->k.p; - - ret = insert_snapshot_whiteouts(&trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_trans_update(&trans, &iter, insert, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(&trans, &op->res, - op_journal_seq(op), - BTREE_INSERT_NOFAIL| - m->data_opts.btree_insert_flags); - if (!ret) { - bch2_btree_iter_set_pos(&iter, next_pos); - atomic_long_inc(&c->extent_migrate_done); - if (ec_ob) - bch2_ob_add_backpointer(c, ec_ob, &insert->k); - } -err: - if (ret == -EINTR) - ret = 0; - if (ret) - break; -next: - while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { - bch2_keylist_pop_front(keys); - if (bch2_keylist_empty(keys)) - goto out; - } - continue; -nomatch: - if (m->ctxt) { - BUG_ON(k.k->p.offset <= iter.pos.offset); - atomic64_inc(&m->ctxt->stats->keys_raced); - atomic64_add(k.k->p.offset - iter.pos.offset, - &m->ctxt->stats->sectors_raced); - } - atomic_long_inc(&c->extent_migrate_raced); - trace_move_race(&new->k); - bch2_btree_iter_advance(&iter); - goto next; - } -out: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); - BUG_ON(ret == -EINTR); - return ret; -} - -void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) -{ - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - - m->ptr = rbio->pick.ptr; - m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; - m->op.devs_have = rbio->devs_have; - m->op.pos = rbio->data_pos; - m->op.version = rbio->version; - m->op.crc = rbio->pick.crc; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - - if (m->data_cmd == DATA_REWRITE) - bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); -} - -int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, - struct write_point_specifier wp, - struct bch_io_opts io_opts, - enum data_cmd data_cmd, - 
struct data_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - struct extent_ptr_decoded p; - int ret; - - m->btree_id = btree_id; - m->data_cmd = data_cmd; - m->data_opts = data_opts; - m->nr_ptrs_reserved = 0; - - bch2_write_op_init(&m->op, c, io_opts); - - if (!bch2_bkey_is_incompressible(k)) - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; - else - m->op.incompressible = true; - - m->op.target = data_opts.target, - m->op.write_point = wp; - - /* - * op->csum_type is normally initialized from the fs/file's current - * options - but if an extent is encrypted, we require that it stays - * encrypted: - */ - bkey_for_each_crc(k.k, ptrs, crc, entry) - if (bch2_csum_type_is_encryption(crc.csum_type)) { - m->op.nonce = crc.nonce + crc.offset; - m->op.csum_type = crc.csum_type; - break; - } - - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { - m->op.alloc_reserve = RESERVE_MOVINGGC; - m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; - } else { - /* XXX: this should probably be passed in */ - m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; - } - - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_FROM_INTERNAL; - - m->op.nr_replicas = data_opts.nr_replicas; - m->op.nr_replicas_required = data_opts.nr_replicas; - m->op.index_update_fn = bch2_migrate_index_update; - - switch (data_cmd) { - case DATA_ADD_REPLICAS: { - /* - * DATA_ADD_REPLICAS is used for moving data to a different - * device in the background, and due to compression the new copy - * might take up more space than the old copy: - */ -#if 0 - int nr = (int) io_opts.data_replicas - - bch2_bkey_nr_ptrs_allocated(k); -#endif - int nr = (int) io_opts.data_replicas; - - if (nr > 0) { - m->op.nr_replicas = m->nr_ptrs_reserved = nr; - - ret = 
bch2_disk_reservation_get(c, &m->op.res, - k.k->size, m->op.nr_replicas, 0); - if (ret) - return ret; - } - break; - } - case DATA_REWRITE: { - unsigned compressed_sectors = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == data_opts.rewrite_dev) { - if (p.ptr.cached) - m->op.flags |= BCH_WRITE_CACHED; - - if (!p.ptr.cached && - crc_is_compressed(p.crc)) - compressed_sectors += p.crc.compressed_size; - } - - if (compressed_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, - k.k->size * m->op.nr_replicas, - BCH_DISK_RESERVATION_NOFAIL); - if (ret) - return ret; - } - break; - } - case DATA_PROMOTE: - m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; - m->op.flags |= BCH_WRITE_CACHED; - break; - default: - BUG(); - } - - return 0; -} - static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; - struct bvec_iter_all iter; - struct bio_vec *bv; - - bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) - if (bv->bv_page) - __free_page(bv->bv_page); + struct bch_fs *c = ctxt->c; + bch2_data_update_exit(&io->write); wake_up(&ctxt->wait); - + percpu_ref_put(&c->writes); kfree(io); } static void move_write_done(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->write.ctxt; + + if (io->write.op.error) + ctxt->write_error = true; atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); closure_return_with_destructor(cl, move_free); @@ -458,10 +86,9 @@ static void move_write(struct closure *cl) return; } - bch2_migrate_read_done(&io->write, &io->rbio); - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - closure_call(&io->write.op.cl, bch2_write, NULL, cl); + + bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl); continue_at(cl, move_write_done, NULL); } @@ -481,9 +108,7 @@ static void 
move_read_endio(struct bio *bio) atomic_sub(io->read_sectors, &ctxt->read_sectors); io->read_completed = true; - if (next_pending_write(ctxt)) - wake_up(&ctxt->wait); - + wake_up(&ctxt->wait); closure_put(&ctxt->cl); } @@ -520,14 +145,103 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, atomic_read(&ctxt->write_sectors) != sectors_pending); } +void bch2_moving_ctxt_exit(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); + EBUG_ON(atomic_read(&ctxt->write_sectors)); + + if (ctxt->stats) { + progress_list_del(ctxt->c, ctxt->stats); + + trace_move_data(ctxt->c, + atomic64_read(&ctxt->stats->sectors_moved), + atomic64_read(&ctxt->stats->keys_moved)); + } +} + +void bch2_moving_ctxt_init(struct moving_context *ctxt, + struct bch_fs *c, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->c = c; + ctxt->rate = rate; + ctxt->stats = stats; + ctxt->wp = wp; + ctxt->wait_on_copygc = wait_on_copygc; + + closure_init_stack(&ctxt->cl); + INIT_LIST_HEAD(&ctxt->reads); + init_waitqueue_head(&ctxt->wait); + + if (stats) { + progress_list_add(c, stats); + stats->data_type = BCH_DATA_user; + } +} + +void bch_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + +static int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + while (data_opts.kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + 
data_opts.kill_ptrs ^= 1U << drop; + } + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + + return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + static int bch2_move_extent(struct btree_trans *trans, + struct btree_iter *iter, struct moving_context *ctxt, - struct write_point_specifier wp, struct bch_io_opts io_opts, enum btree_id btree_id, struct bkey_s_c k, - enum data_cmd data_cmd, - struct data_opts data_opts) + struct data_update_opts data_opts) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -537,6 +251,18 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + bch2_data_update_opts_normalize(k, &data_opts); + + if (!data_opts.rewrite_ptrs && + !data_opts.extra_replicas) { + if (data_opts.kill_ptrs) + return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return 0; + } + + if (!percpu_ref_tryget_live(&c->writes)) + return -EROFS; + /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -551,7 +277,7 @@ static int bch2_move_extent(struct btree_trans *trans, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); + bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); 
bio_set_prio(&io->write.op.wbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); @@ -561,7 +287,7 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.c = c; io->rbio.opts = io_opts; - bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); + bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); io->rbio.bio.bi_vcnt = pages; bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); io->rbio.bio.bi_iter.bi_size = sectors << 9; @@ -570,15 +296,18 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, btree_id, k); + ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, + data_opts, btree_id, k); if (ret) goto err_free_pages; + io->write.ctxt = ctxt; + atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - - trace_move_extent(k.k); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); + this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); + trace_move_extent_read(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); @@ -599,7 +328,8 @@ err_free_pages: err_free: kfree(io); err: - trace_move_alloc_fail(k.k); + percpu_ref_put(&c->writes); + trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } @@ -634,72 +364,108 @@ err: return ret; } -static int __bch2_move_data(struct bch_fs *c, - struct moving_context *ctxt, - struct bch_ratelimit *rate, - struct write_point_specifier wp, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats, - enum btree_id btree_id) +static int move_ratelimit(struct btree_trans *trans, + struct moving_context *ctxt) { - bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_fs *c = trans->c; + u64 delay; + + if (ctxt->wait_on_copygc) { + 
bch2_trans_unlock(trans); + wait_event_killable(c->copygc_running_wq, + !c->copygc_running || + kthread_should_stop()); + } + + do { + delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; + + if (delay) { + bch2_trans_unlock(trans); + set_current_state(TASK_INTERRUPTIBLE); + } + + if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 1; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->write_sectors) < + c->opts.move_bytes_in_flight >> 9); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->read_sectors) < + c->opts.move_bytes_in_flight >> 9); + + return 0; +} + +static int move_get_io_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c k, u64 *cur_inum) +{ + struct bch_inode_unpacked inode; + int ret; + + if (*cur_inum == k.k->p.inode) + return 0; + + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret) + bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode)); + + *cur_inum = k.k->p.inode; + return 0; +} + +static int __bch2_move_data(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id) +{ + struct bch_fs *c = ctxt->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct data_opts data_opts; - enum data_cmd data_cmd; - u64 delay, cur_inum = U64_MAX; + struct data_update_opts data_opts; + u64 cur_inum = U64_MAX; int ret = 0, ret2; bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_user; - 
stats->btree_id = btree_id; - stats->pos = start; + ctxt->stats->data_type = BCH_DATA_user; + ctxt->stats->btree_id = btree_id; + ctxt->stats->pos = start; bch2_trans_iter_init(&trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); - if (rate) - bch2_ratelimit_reset(rate); - - while (1) { - do { - delay = rate ? bch2_ratelimit_delay(rate) : 0; - - if (delay) { - bch2_trans_unlock(&trans); - set_current_state(TASK_INTERRUPTIBLE); - } - - if (kthread && (ret = kthread_should_stop())) { - __set_current_state(TASK_RUNNING); - goto out; - } - - if (delay) - schedule_timeout(delay); - - if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); - try_to_freeze(); - } - } while (delay); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); + if (ctxt->rate) + bch2_ratelimit_reset(ctxt->rate); + while (!move_ratelimit(&trans, ctxt)) { bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); @@ -707,7 +473,7 @@ static int __bch2_move_data(struct bch_fs *c, break; ret = bkey_err(k); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; @@ -715,53 +481,30 @@ static int __bch2_move_data(struct bch_fs *c, if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - stats->pos = iter.pos; + ctxt->stats->pos = iter.pos; if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (btree_id == BTREE_ID_extents && - cur_inum != k.k->p.inode) { - struct bch_inode_unpacked inode; - - io_opts = bch2_opts_to_inode_opts(c->opts); - - ret = lookup_inode(&trans, - SPOS(0, k.k->p.inode, k.k->p.snapshot), - &inode); - if (ret == -EINTR) - continue; - - if (!ret) - bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); - - cur_inum = k.k->p.inode; - } + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if 
(ret) + continue; - switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { - case DATA_SKIP: + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, k, &io_opts, &data_opts)) goto next; - case DATA_SCRUB: - BUG(); - case DATA_ADD_REPLICAS: - case DATA_REWRITE: - case DATA_PROMOTE: - break; - default: - BUG(); - } /* * The iterator gets unlocked by __bch2_read_extent - need to * save a copy of @k elsewhere: - */ + */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, - data_cmd, data_opts); + ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, + btree_id, k, data_opts); if (ret2) { - if (ret2 == -EINTR) + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; if (ret2 == -ENOMEM) { @@ -774,14 +517,13 @@ static int __bch2_move_data(struct bch_fs *c, goto next; } - if (rate) - bch2_ratelimit_increment(rate, k.k->size); + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); next: - atomic64_add(k.k->size, &stats->sectors_seen); + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: bch2_btree_iter_advance(&iter); } -out: bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -790,48 +532,20 @@ out: return ret; } -inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) -{ - memset(stats, 0, sizeof(*stats)); - - scnprintf(stats->name, sizeof(stats->name), - "%s", name); -} - -static inline void progress_list_add(struct bch_fs *c, - struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_add(&stats->list, &c->data_progress_list); - mutex_unlock(&c->data_progress_lock); -} - -static inline void progress_list_del(struct bch_fs *c, - struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_del(&stats->list); - mutex_unlock(&c->data_progress_lock); -} - int bch2_move_data(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, enum btree_id end_btree_id, 
struct bpos end_pos, struct bch_ratelimit *rate, + struct bch_move_stats *stats, struct write_point_specifier wp, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats) + bool wait_on_copygc, + move_pred_fn pred, void *arg) { - struct moving_context ctxt = { .stats = stats }; + struct moving_context ctxt; enum btree_id id; int ret; - progress_list_add(c, stats); - closure_init_stack(&ctxt.cl); - INIT_LIST_HEAD(&ctxt.reads); - init_waitqueue_head(&ctxt.wait); - - stats->data_type = BCH_DATA_user; + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); for (id = start_btree_id; id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); @@ -842,31 +556,205 @@ int bch2_move_data(struct bch_fs *c, id != BTREE_ID_reflink) continue; - ret = __bch2_move_data(c, &ctxt, rate, wp, + ret = __bch2_move_data(&ctxt, id == start_btree_id ? start_pos : POS_MIN, id == end_btree_id ? end_pos : POS_MAX, - pred, arg, stats, id); + pred, arg, id); if (ret) break; } + bch2_moving_ctxt_exit(&ctxt); - move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); - closure_sync(&ctxt.cl); + return ret; +} + +static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); +again: + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); - EBUG_ON(atomic_read(&ctxt.write_sectors)); + if (!ret && k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - trace_move_data(c, - atomic64_read(&stats->sectors_moved), - atomic64_read(&stats->keys_moved)); + if (a.v->gen == gen && + a.v->dirty_sectors) { + struct printbuf buf = PRINTBUF; + + if (a.v->data_type == BCH_DATA_btree) { + bch2_trans_unlock(trans); + if (bch2_btree_interior_updates_flush(c)) + goto again; + } + + prt_str(&buf, "failed to evacuate bucket "); + bch2_bkey_val_to_text(&buf, c, 
k); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_evacuate_bucket(struct moving_context *ctxt, + struct bpos bucket, int gen, + struct data_update_opts _data_opts) +{ + struct bch_fs *c = ctxt->c; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bch_backpointer bp; + struct data_update_opts data_opts; + u64 bp_offset = 0, cur_inum = U64_MAX; + int ret = 0; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + while (!(ret = move_ratelimit(&trans, ctxt))) { + bch2_trans_begin(&trans); + + ret = bch2_get_next_backpointer(&trans, bucket, gen, + &bp_offset, &bp, + BTREE_ITER_CACHED); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (bp_offset == U64_MAX) + break; + + if (!bp.level) { + const struct bch_extent_ptr *ptr; + struct bkey_s_c k; + unsigned i = 0; + + k = bch2_backpointer_get_key(&trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + continue; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) { + bch2_trans_iter_exit(&trans, &iter); + continue; + } + + data_opts = _data_opts; + data_opts.target = io_opts.background_target; + data_opts.rewrite_ptrs = 0; + + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == bucket.inode) + data_opts.rewrite_ptrs |= 1U << i; + i++; + } + + ret = bch2_move_extent(&trans, &iter, ctxt, io_opts, + bp.btree_id, k, data_opts); + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt, &trans); + continue; 
+ } + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + } else { + struct btree *b; + + b = bch2_backpointer_get_node(&trans, &iter, + bucket, bp_offset, bp); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!b) + continue; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, + c->opts.btree_node_size >> 9); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + } + + bp_offset++; + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { + bch2_trans_unlock(&trans); + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); + if (!ctxt->write_error) + lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); + } +err: + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_evacuate_bucket(struct bch_fs *c, + struct bpos bucket, int gen, + struct data_update_opts data_opts, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts); + bch2_moving_ctxt_exit(&ctxt); - progress_list_del(c, stats); return ret; } -typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_opts *); +typedef bool (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts 
*, + struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, @@ -880,8 +768,7 @@ static int bch2_move_btree(struct bch_fs *c, struct btree_iter iter; struct btree *b; enum btree_id id; - struct data_opts data_opts; - enum data_cmd cmd; + struct data_update_opts data_opts; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -910,27 +797,18 @@ retry: stats->pos = iter.pos; - switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { - case DATA_SKIP: + if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - case DATA_SCRUB: - BUG(); - case DATA_ADD_REPLICAS: - case DATA_REWRITE: - break; - default: - BUG(); - } ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; next: bch2_btree_iter_next_node(&iter); } - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter); @@ -942,30 +820,18 @@ next: bch2_trans_exit(&trans); if (ret) - bch_err(c, "error %i in bch2_move_btree", ret); + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); - /* flush relevant btree updates */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_btree_interior_updates_flush(c); progress_list_del(c, stats); return ret; } -#if 0 -static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - return DATA_SCRUB; -} -#endif - -static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool rereplicate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); unsigned replicas = bkey_is_btree_ptr(k.k) @@ 
-973,43 +839,50 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, : io_opts->data_replicas; if (!nr_good || nr_good >= replicas) - return DATA_SKIP; + return false; data_opts->target = 0; - data_opts->nr_replicas = 1; + data_opts->extra_replicas = replicas - nr_good; data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; + return true; } -static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool migrate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; struct bch_ioctl_data *op = arg; + unsigned i = 0; - if (!bch2_bkey_has_device(k, op->migrate.dev)) - return DATA_SKIP; - + data_opts->rewrite_ptrs = 0; data_opts->target = 0; - data_opts->nr_replicas = 1; + data_opts->extra_replicas = 0; data_opts->btree_insert_flags = 0; - data_opts->rewrite_dev = op->migrate.dev; - return DATA_REWRITE; + + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == op->migrate.dev) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + + return data_opts->rewrite_ptrs != 0; } -static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } -static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), 
io_opts, data_opts); } @@ -1038,21 +911,21 @@ static bool bformat_needs_redo(struct bkey_format *f) return false; } -static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (b->version_ondisk != c->sb.version || btree_node_need_rewrite(b) || bformat_needs_redo(&b->format)) { data_opts->target = 0; - data_opts->nr_replicas = 1; + data_opts->extra_replicas = 0; data_opts->btree_insert_flags = 0; - return DATA_REWRITE; + return true; } - return DATA_SKIP; + return false; } int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) @@ -1096,8 +969,11 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_data(c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, - NULL, writepoint_hashed((unsigned long) current), - rereplicate_pred, c, stats) ?: ret; + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_MIGRATE: @@ -1117,8 +993,11 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_data(c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, - NULL, writepoint_hashed((unsigned long) current), - migrate_pred, &op, stats) ?: ret; + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 2a789a1..c0fec69 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -4,53 +4,37 @@ #include "btree_iter.h" #include "buckets.h" -#include "io_types.h" +#include "data_update.h" #include "move_types.h" struct bch_read_bio; -struct moving_context; -enum data_cmd { - DATA_SKIP, - DATA_SCRUB, - DATA_ADD_REPLICAS, - 
DATA_REWRITE, - DATA_PROMOTE, -}; - -struct data_opts { - u16 target; - u8 rewrite_dev; - u8 nr_replicas; - int btree_insert_flags; -}; +struct moving_context { + struct bch_fs *c; + struct bch_ratelimit *rate; + struct bch_move_stats *stats; + struct write_point_specifier wp; + bool wait_on_copygc; + bool write_error; -struct migrate_write { - enum btree_id btree_id; - enum data_cmd data_cmd; - struct data_opts data_opts; + /* For waiting on outstanding reads and writes: */ + struct closure cl; + struct list_head reads; - unsigned nr_ptrs_reserved; + /* in flight sectors: */ + atomic_t read_sectors; + atomic_t write_sectors; - struct moving_context *ctxt; - - /* what we read: */ - struct bch_extent_ptr ptr; - u64 offset; - - struct bch_write_op op; + wait_queue_head_t wait; }; -void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); -int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, - struct write_point_specifier, - struct bch_io_opts, - enum data_cmd, struct data_opts, - enum btree_id, struct bkey_s_c); +typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, + struct bch_io_opts *, struct data_update_opts *); -typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, - struct bkey_s_c, - struct bch_io_opts *, struct data_opts *); +void bch2_moving_ctxt_exit(struct moving_context *); +void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); @@ -58,10 +42,20 @@ int bch2_move_data(struct bch_fs *, enum btree_id, struct bpos, enum btree_id, struct bpos, struct bch_ratelimit *, + struct bch_move_stats *, struct write_point_specifier, - move_pred_fn, void *, - struct bch_move_stats *); - + bool, + move_pred_fn, void *); + +int __bch2_evacuate_bucket(struct moving_context *, + struct bpos, int, + struct data_update_opts); +int 
bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, + struct data_update_opts, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index c82ecff..044eca8 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -13,6 +13,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "errcode.h" #include "error.h" #include "extents.h" #include "eytzinger.h" @@ -30,107 +31,6 @@ #include #include -/* - * We can't use the entire copygc reserve in one iteration of copygc: we may - * need the buckets we're freeing up to go back into the copygc reserve to make - * forward progress, but if the copygc reserve is full they'll be available for - * any allocation - and it's possible that in a given iteration, we free up most - * of the buckets we're going to free before we allocate most of the buckets - * we're going to allocate. 
- * - * If we only use half of the reserve per iteration, then in steady state we'll - * always have room in the reserve for the buckets we're going to need in the - * next iteration: - */ -#define COPYGC_BUCKETS_PER_ITER(ca) \ - ((ca)->free[RESERVE_MOVINGGC].size / 2) - -static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: - cmp_int(l->offset, r->offset); -} - -static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - copygc_heap *h = &c->copygc_heap; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct copygc_heap_entry search = { - .dev = p.ptr.dev, - .offset = p.ptr.offset, - }; - ssize_t i; - - if (p.ptr.cached) - continue; - - i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); -#if 0 - /* eytzinger search verify code: */ - ssize_t j = -1, k; - - for (k = 0; k < h->used; k++) - if (h->data[k].offset <= ptr->offset && - (j < 0 || h->data[k].offset > h->data[j].offset)) - j = k; - - BUG_ON(i != j); -#endif - if (i >= 0 && - p.ptr.dev == h->data[i].dev && - p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && - p.ptr.gen == h->data[i].gen) { - /* - * We need to use the journal reserve here, because - * - journal reclaim depends on btree key cache - * flushing to make forward progress, - * - which has to make forward progress when the - * journal is pre-reservation full, - * - and depends on allocation - meaning allocator and - * copygc - */ - - data_opts->target = io_opts->background_target; - data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - 
BTREE_INSERT_JOURNAL_RESERVED; - data_opts->rewrite_dev = p.ptr.dev; - - if (p.has_ec) - data_opts->nr_replicas += p.ec.redundancy; - - return DATA_REWRITE; - } - } - - return DATA_SKIP; -} - -static bool have_copygc_reserve(struct bch_dev *ca) -{ - bool ret; - - spin_lock(&ca->fs->freelist_lock); - ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_state != ALLOCATOR_running; - spin_unlock(&ca->fs->freelist_lock); - - return ret; -} - static inline int fragmentation_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -138,37 +38,46 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } -static int walk_buckets_to_copygc(struct bch_fs *c) +static int find_buckets_to_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_alloc_unpacked u; + struct bch_alloc_v4 a; int ret; bch2_trans_init(&trans, c, 0, 0); + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, + * and repeatedly replacing the maximum element until all + * buckets have been visited. 
+ */ + h->used = 0; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); struct copygc_heap_entry e; - u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &a); - if (u.data_type != BCH_DATA_user || - u.dirty_sectors >= ca->mi.bucket_size || + if ((a.data_type != BCH_DATA_btree && + a.data_type != BCH_DATA_user) || + a.dirty_sectors >= ca->mi.bucket_size || bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) continue; e = (struct copygc_heap_entry) { .dev = iter.pos.inode, - .gen = u.gen, - .replicas = 1 + u.stripe_redundancy, - .fragmentation = u.dirty_sectors * (1U << 15) - / ca->mi.bucket_size, - .sectors = u.dirty_sectors, - .offset = bucket_to_sector(ca, iter.pos.offset), + .gen = a.gen, + .replicas = 1 + a.stripe_redundancy, + .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), + ca->mi.bucket_size), + .sectors = a.dirty_sectors, + .bucket = iter.pos.offset, }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); @@ -179,77 +88,22 @@ static int walk_buckets_to_copygc(struct bch_fs *c) return ret; } -static int bucket_inorder_cmp(const void *_l, const void *_r) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); -} - -static int check_copygc_was_done(struct bch_fs *c, - u64 *sectors_not_moved, - u64 *buckets_not_moved) -{ - copygc_heap *h = &c->copygc_heap; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_alloc_unpacked u; - struct copygc_heap_entry *i; - int ret = 0; - - sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); - - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); - - for (i = h->data; i < h->data + h->used; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - - bch2_btree_iter_set_pos(&iter, POS(i->dev, 
sector_to_bucket(ca, i->offset))); - - ret = lockrestart_do(&trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (ret) - break; - - u = bch2_alloc_unpack(k); - - if (u.gen == i->gen && u.dirty_sectors) { - *sectors_not_moved += u.dirty_sectors; - *buckets_not_moved += 1; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; - struct copygc_heap_entry e, *i; + struct copygc_heap_entry e; struct bch_move_stats move_stats; - u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; - u64 sectors_reserved = 0; - u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; size_t heap_size = 0; - int ret; + struct moving_context ctxt; + struct data_update_opts data_opts = { + .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, + }; + int ret = 0; bch_move_stats_init(&move_stats, "copygc"); - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. 
- */ - h->used = 0; - for_each_rw_member(ca, c, dev_idx) heap_size += ca->mi.nbuckets >> 7; @@ -261,87 +115,58 @@ static int bch2_copygc(struct bch_fs *c) } } - for_each_rw_member(ca, c, dev_idx) { - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); - - spin_lock(&ca->fs->freelist_lock); - sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; - spin_unlock(&ca->fs->freelist_lock); - } - - ret = walk_buckets_to_copygc(c); + ret = find_buckets_to_copygc(c); if (ret) { bch2_fs_fatal_error(c, "error walking buckets to copygc!"); return ret; } if (!h->used) { - bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); + s64 wait = S64_MAX, dev_wait; + u64 dev_min_wait_fragmented = 0; + u64 dev_min_wait_allowed = 0; + int dev_min_wait = -1; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + ca->mi.bucket_size) >> 1); + s64 fragmented = usage.d[BCH_DATA_user].fragmented; + + dev_wait = max(0LL, allowed - fragmented); + + if (dev_min_wait < 0 || dev_wait < wait) { + dev_min_wait = dev_idx; + dev_min_wait_fragmented = fragmented; + dev_min_wait_allowed = allowed; + } + } + + bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! 
dev %u fragmented %llu allowed %llu", + dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed); return 0; } - /* - * Our btree node allocations also come out of RESERVE_MOVINGGC: - */ - sectors_reserved = (sectors_reserved * 3) / 4; - if (!sectors_reserved) { - bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); - return -1; - } + heap_resort(h, fragmentation_cmp, NULL); - for (i = h->data; i < h->data + h->used; i++) { - sectors_to_move += i->sectors; - sectors_to_write += i->sectors * i->replicas; - } + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, + writepoint_ptr(&c->copygc_write_point), + false); - while (sectors_to_write > sectors_reserved) { + /* not correct w.r.t. device removal */ + while (h->used && !ret) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_write -= e.sectors * e.replicas; + ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen, + data_opts); } - buckets_to_move = h->used; - - if (!buckets_to_move) { - bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", - sectors_reserved); - return 0; - } - - eytzinger0_sort(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, NULL); - - ret = bch2_move_data(c, - 0, POS_MIN, - BTREE_ID_NR, POS_MAX, - NULL, - writepoint_ptr(&c->copygc_write_point), - copygc_pred, NULL, - &move_stats); - if (ret) { - bch_err(c, "error %i from bch2_move_data() in copygc", ret); - return ret; - } + bch2_moving_ctxt_exit(&ctxt); - ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); - if (ret) { - bch_err(c, "error %i from check_copygc_was_done()", ret); - return ret; - } + if (ret < 0 && ret != -EROFS) + bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); - if (sectors_not_moved) - bch_warn_ratelimited(c, - "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move, - 
atomic64_read(&move_stats.sectors_moved), - atomic64_read(&move_stats.keys_raced), - atomic64_read(&move_stats.sectors_raced)); - - trace_copygc(c, - atomic64_read(&move_stats.sectors_moved), sectors_not_moved, - buckets_to_move, buckets_not_moved); - return 0; + trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + return ret; } /* @@ -367,8 +192,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * - ca->mi.bucket_size) >> 1); + fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + ca->mi.bucket_size) >> 1); fragmented = usage.d[BCH_DATA_user].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); @@ -382,10 +207,11 @@ static int bch2_copygc_thread(void *arg) struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; u64 last, wait; + int ret = 0; set_freezable(); - while (!kthread_should_stop()) { + while (!ret && !kthread_should_stop()) { cond_resched(); if (kthread_wait_freezable(c->copy_gc_enabled)) @@ -395,7 +221,7 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { - trace_copygc_wait(c, wait, last + wait); + trace_and_count(c, copygc_wait, c, wait, last + wait); c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -404,8 +230,11 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; - if (bch2_copygc(c)) - break; + c->copygc_running = true; + ret = bch2_copygc(c); + c->copygc_running = false; + + wake_up(&c->copygc_running_wq); } return 0; @@ -423,6 +252,7 @@ void bch2_copygc_stop(struct bch_fs *c) int bch2_copygc_start(struct bch_fs *c) { struct task_struct *t; + int ret; if (c->copygc_thread) return 0; @@ -434,9 +264,10 @@ int bch2_copygc_start(struct bch_fs *c) return -ENOMEM; t = 
kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - if (IS_ERR(t)) { - bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); - return PTR_ERR(t); + ret = PTR_ERR_OR_ZERO(t); + if (ret) { + bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); + return ret; } get_task_struct(t); @@ -449,4 +280,6 @@ int bch2_copygc_start(struct bch_fs *c) void bch2_fs_copygc_init(struct bch_fs *c) { + init_waitqueue_head(&c->copygc_running_wq); + c->copygc_running = false; } diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h index 9227382..e85c813 100644 --- a/libbcachefs/movinggc.h +++ b/libbcachefs/movinggc.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H +unsigned long bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_stop(struct bch_fs *); int bch2_copygc_start(struct bch_fs *); void bch2_fs_copygc_init(struct bch_fs *); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 71bf26e..407b221 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -9,7 +9,14 @@ #include "super-io.h" #include "util.h" -#define x(t, n) #t, +#include + +#define x(t, n) [n] = #t, + +const char * const bch2_metadata_versions[] = { + BCH_METADATA_VERSIONS() + NULL +}; const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() @@ -28,6 +35,7 @@ const char * const bch2_sb_compat[] = { const char * const bch2_btree_ids[] = { BCH_BTREE_IDS() + "interior btree node", NULL }; @@ -96,6 +104,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; +u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) +{ + BUG(); +} + +void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) +{ + BUG(); +} + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) 
\ @@ -209,62 +227,74 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) +int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { - if (msg) - pr_err("invalid %s%s: too small (min %llu)", - msg, opt->attr.name, opt->min); + if (err) + prt_printf(err, "%s: too small (min %llu)", + opt->attr.name, opt->min); return -ERANGE; } if (opt->max && v >= opt->max) { - if (msg) - pr_err("invalid %s%s: too big (max %llu)", - msg, opt->attr.name, opt->max); + if (err) + prt_printf(err, "%s: too big (max %llu)", + opt->attr.name, opt->max); return -ERANGE; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { - if (msg) - pr_err("invalid %s %s: not a multiple of 512", - msg, opt->attr.name); + if (err) + prt_printf(err, "%s: not a multiple of 512", + opt->attr.name); return -EINVAL; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { - if (msg) - pr_err("invalid %s%s: must be a power of two", - msg, opt->attr.name); + if (err) + prt_printf(err, "%s: must be a power of two", + opt->attr.name); return -EINVAL; } return 0; } -int bch2_opt_parse(struct bch_fs *c, const char *msg, +int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, - const char *val, u64 *res) + const char *val, u64 *res, + struct printbuf *err) { ssize_t ret; switch (opt->type) { case BCH_OPT_BOOL: ret = kstrtou64(val, 10, res); - if (ret < 0) + if (ret < 0 || (*res != 0 && *res != 1)) { + prt_printf(err, "%s: must be bool", + opt->attr.name); return ret; + } break; case BCH_OPT_UINT: ret = opt->flags & OPT_HUMAN_READABLE ? 
bch2_strtou64_h(val, res) : kstrtou64(val, 10, res); - if (ret < 0) + if (ret < 0) { + if (err) + prt_printf(err, "%s: must be a number", + opt->attr.name); return ret; + } break; case BCH_OPT_STR: ret = match_string(opt->choices, -1, val); - if (ret < 0) + if (ret < 0) { + if (err) + prt_printf(err, "%s: invalid selection", + opt->attr.name); return ret; + } *res = ret; break; @@ -273,44 +303,49 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg, return 0; ret = opt->parse(c, val, res); - if (ret < 0) + if (ret < 0) { + if (err) + prt_printf(err, "%s: parse error", + opt->attr.name); return ret; + } } - return bch2_opt_validate(opt, msg, *res); + return bch2_opt_validate(opt, *res, err); } -void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, +void bch2_opt_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_sb *sb, const struct bch_option *opt, u64 v, unsigned flags) { if (flags & OPT_SHOW_MOUNT_STYLE) { if (opt->type == BCH_OPT_BOOL) { - pr_buf(out, "%s%s", + prt_printf(out, "%s%s", v ? 
"" : "no", opt->attr.name); return; } - pr_buf(out, "%s=", opt->attr.name); + prt_printf(out, "%s=", opt->attr.name); } switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: if (opt->flags & OPT_HUMAN_READABLE) - bch2_hprint(out, v); + prt_human_readable_u64(out, v); else - pr_buf(out, "%lli", v); + prt_printf(out, "%lli", v); break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) - bch2_string_opt_to_text(out, opt->choices, v); + prt_string_option(out, opt->choices, v); else - pr_buf(out, opt->choices[v]); + prt_printf(out, "%s", opt->choices[v]); break; case BCH_OPT_FN: - opt->to_text(out, c, v); + opt->to_text(out, c, sb, v); break; default: BUG(); @@ -356,6 +391,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, char *copied_opts, *copied_opts_start; char *opt, *name, *val; int ret, id; + struct printbuf err = PRINTBUF; u64 v; if (!options) @@ -375,8 +411,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (id < 0) goto bad_opt; - ret = bch2_opt_parse(c, "mount option ", - &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); if (ret < 0) goto bad_val; } else { @@ -419,7 +454,7 @@ bad_opt: ret = -1; goto out; bad_val: - pr_err("Invalid value %s for mount option %s", val, name); + pr_err("Invalid mount option %s", err.buf); ret = -1; goto out; no_val: @@ -428,9 +463,26 @@ no_val: goto out; out: kfree(copied_opts_start); + printbuf_exit(&err); return ret; } +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +{ + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + v = opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + return v; +} + /* * Initial options from superblock - here we don't want any options undefined, * any options the superblock doesn't specify are set to 0: @@ -438,28 +490,14 @@ out: int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { unsigned 
id; - int ret; for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - u64 v; - if (opt->get_sb == NO_SB_OPT) + if (opt->get_sb == BCH2_NO_SB_OPT) continue; - v = opt->get_sb(sb); - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v <<= 9; - - ret = bch2_opt_validate(opt, "superblock option ", v); - if (ret) - return ret; - - bch2_opt_set_by_id(opts, id, v); + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); } return 0; @@ -467,7 +505,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_NO_SB_OPT) + if (opt->set_sb == SET_BCH2_NO_SB_OPT) return; if (opt->flags & OPT_SB_FIELD_SECTORS) @@ -481,7 +519,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_NO_SB_OPT) + if (opt->set_sb == SET_BCH2_NO_SB_OPT) return; mutex_lock(&c->sb_lock); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index affe923..5b8586e 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -8,6 +8,7 @@ #include #include "bcachefs_format.h" +extern const char * const bch2_metadata_versions[]; extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; @@ -42,7 +43,8 @@ static inline const char *bch2_d_type_str(unsigned d_type) */ /* dummy option, for options that aren't stored in the superblock */ -LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); +u64 BCH2_NO_SB_OPT(const struct bch_sb *); +void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); /* When can be set: */ enum opt_flags { @@ -163,22 +165,22 @@ enum opt_type { OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ - "(target)", "Device or disk group for 
metadata writes") \ + "(target)", "Device or label for metadata writes") \ x(foreground_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_FOREGROUND_TARGET, 0, \ - "(target)", "Device or disk group for foreground writes") \ + "(target)", "Device or label for foreground writes") \ x(background_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_BACKGROUND_TARGET, 0, \ - "(target)", "Device or disk group to move data to in the background")\ + "(target)", "Device or label to move data to in the background")\ x(promote_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0, \ - "(target)", "Device or disk group to promote data to on read")\ + "(target)", "Device or label to promote data to on read") \ x(erasure_code, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -202,7 +204,7 @@ enum opt_type { x(btree_node_mem_ptr_optimization, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -229,7 +231,7 @@ enum opt_type { x(inline_data, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ @@ -254,26 +256,26 @@ enum opt_type { x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ x(very_degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable discard/TRIM support") \ 
x(verbose, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U32_MAX), \ + OPT_UINT(1, U32_MAX), \ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ @@ -288,107 +290,112 @@ enum opt_type { OPT_UINT(0, U32_MAX), \ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(move_bytes_in_flight, u32, \ + OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1024, U32_MAX), \ + BCH2_NO_SB_OPT, 1U << 20, \ + NULL, "Amount of IO in flight to keep in flight by the move path")\ x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ x(norecovery, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ - x(rebuild_replicas, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false, \ - NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, 
false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ - x(journal_transaction_names, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + x(read_journal_only, u8, \ + 0, \ OPT_BOOL(), \ - BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ - NULL, "Log transaction function names in journal") \ + BCH2_NO_SB_OPT, false, \ + NULL, "Only read the journal, skip the rest of recovery")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ + x(direct_io, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Use O_DIRECT (userspace only)") \ x(sb, u64, \ OPT_MOUNT, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, BCH_SB_SECTOR, \ + BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ OPT_FS, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nostart, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ x(buckets_nouse, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(fs_size, u64, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, 0, \ + BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(bucket, u32, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, 0, \ + BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ OPT_DEVICE, \ OPT_UINT(0, 
BCH_REPLICAS_MAX), \ - NO_SB_OPT, 1, \ + BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") @@ -447,17 +454,9 @@ struct bch_option { enum opt_flags flags; u64 min, max; - union { - struct { - }; - struct { - const char * const *choices; - }; - struct { - int (*parse)(struct bch_fs *, const char *, u64 *); - void (*to_text)(struct printbuf *, struct bch_fs *, u64); - }; - }; + const char * const *choices; + int (*parse)(struct bch_fs *, const char *, u64 *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); const char *hint; const char *help; @@ -470,18 +469,20 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); -int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, - const char *, u64 *); +int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, + const char *, u64 *, struct printbuf *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) -void bch2_opt_to_text(struct printbuf *, struct bch_fs *, +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 6fb8224..db81727 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -1,44 +1,81 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" 
+#include "errcode.h" #include "inode.h" #include "quota.h" #include "subvolume.h" #include "super-io.h" -static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, +static const char * const bch2_quota_types[] = { + "user", + "group", + "project", +}; + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); if (vstruct_bytes(&q->field) < sizeof(*q)) { - pr_buf(err, "wrong size (got %llu should be %zu)", + prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); + return -EINVAL; } return 0; } +static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + unsigned qtyp, counter; + + for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { + prt_printf(out, "%s: flags %llx", + bch2_quota_types[qtyp], + le64_to_cpu(q->q[qtyp].flags)); + + for (counter = 0; counter < Q_COUNTERS; counter++) + prt_printf(out, " %s timelimit %u warnlimit %u", + bch2_quota_counters[counter], + le32_to_cpu(q->q[qtyp].c[counter].timelimit), + le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); + + prt_newline(out); + } +} + const struct bch_sb_field_ops bch_sb_field_ops_quota = { - .validate = bch2_sb_validate_quota, + .validate = bch2_sb_quota_validate, + .to_text = bch2_sb_quota_to_text, }; -const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (k.k->p.inode >= QTYP_NR) - return "invalid quota type"; + if (k.k->p.inode >= QTYP_NR) { + prt_printf(err, "invalid quota type (%llu >= %u)", + k.k->p.inode, QTYP_NR); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) - return "incorrect value size"; + if 
(bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_quota)); + return -EINVAL; + } - return NULL; + return 0; } -static const char * const bch2_quota_counters[] = { - "space", - "inodes", -}; - void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -46,7 +83,7 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, unsigned i; for (i = 0; i < Q_COUNTERS; i++) - pr_buf(out, "%s hardlimit %llu softlimit %llu", + prt_printf(out, "%s hardlimit %llu softlimit %llu", bch2_quota_counters[i], le64_to_cpu(dq.v->c[i].hardlimit), le64_to_cpu(dq.v->c[i].softlimit)); @@ -58,6 +95,113 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, #include #include +static void qc_info_to_text(struct printbuf *out, struct qc_info *i) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "i_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", i->i_fieldmask); + prt_newline(out); + + prt_str(out, "i_flags"); + prt_tab(out); + prt_printf(out, "%u", i->i_flags); + prt_newline(out); + + prt_str(out, "i_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_ino_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_timelimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_warnlimit); + prt_newline(out); + + prt_str(out, "i_ino_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_warnlimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_warnlimit); + prt_newline(out); +} + +static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) +{ + printbuf_tabstops_reset(out); + 
printbuf_tabstop_push(out, 20); + + prt_str(out, "d_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", q->d_fieldmask); + prt_newline(out); + + prt_str(out, "d_spc_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_hardlimit); + prt_newline(out); + + prt_str(out, "d_spc_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_softlimit); + prt_newline(out); + + prt_str(out, "d_ino_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_hardlimit); + prt_newline(out); + + prt_str(out, "d_ino_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_softlimit); + prt_newline(out); + + prt_str(out, "d_space"); + prt_tab(out); + prt_printf(out, "%llu", q->d_space); + prt_newline(out); + + prt_str(out, "d_ino_count"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_count); + prt_newline(out); + + prt_str(out, "d_ino_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_timer); + prt_newline(out); + + prt_str(out, "d_spc_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_timer); + prt_newline(out); + + prt_str(out, "d_ino_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_ino_warns); + prt_newline(out); + + prt_str(out, "d_spc_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_spc_warns); + prt_newline(out); +} + static inline unsigned __next_qtype(unsigned i, unsigned qtypes) { qtypes >>= i; @@ -188,34 +332,20 @@ static int bch2_quota_check_limit(struct bch_fs *c, if (qc->hardlimit && qc->hardlimit < n && !ignore_hardlimit(q)) { - if (mode == KEY_TYPE_QUOTA_PREALLOC) - return -EDQUOT; - prepare_warning(qc, qtype, counter, msgs, HARDWARN); + return -EDQUOT; } if (qc->softlimit && - qc->softlimit < n && - qc->timer && - ktime_get_real_seconds() >= qc->timer && - !ignore_hardlimit(q)) { - if (mode == KEY_TYPE_QUOTA_PREALLOC) - return -EDQUOT; - - prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); - } - - if (qc->softlimit && - qc->softlimit < n && - qc->timer == 0) { - if (mode == KEY_TYPE_QUOTA_PREALLOC) + 
qc->softlimit < n) { + if (qc->timer == 0) { + qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + } else if (ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); return -EDQUOT; - - prepare_warning(qc, qtype, counter, msgs, SOFTWARN); - - /* XXX is this the right one? */ - qc->timer = ktime_get_real_seconds() + - q->limits[counter].warnlimit; + } } return 0; @@ -325,7 +455,8 @@ err: return ret; } -static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, + struct qc_dqblk *qdq) { struct bkey_s_c_quota dq; struct bch_memquota_type *q; @@ -334,6 +465,9 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) BUG_ON(k.k->p.inode >= QTYP_NR); + if (!((1U << k.k->p.inode) & enabled_qtypes(c))) + return 0; + switch (k.k->type) { case KEY_TYPE_quota: dq = bkey_s_c_to_quota(k); @@ -351,36 +485,21 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); } + if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) + mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); + if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) + mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); + if (qdq && qdq->d_fieldmask & QC_INO_TIMER) + mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); + if (qdq && qdq->d_fieldmask & QC_INO_WARNS) + mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); + mutex_unlock(&q->lock); } return 0; } -static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->p.inode != type) - break; - - ret = __bch2_quota_set(c, k); - if (ret) - break; - } 
- bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - void bch2_fs_quota_exit(struct bch_fs *c) { unsigned i; @@ -397,6 +516,26 @@ void bch2_fs_quota_init(struct bch_fs *c) mutex_init(&c->quotas[i].lock); } +static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) +{ + struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); + + if (sb_quota) + return sb_quota; + + sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); + if (sb_quota) { + unsigned qtype, qc; + + for (qtype = 0; qtype < QTYP_NR; qtype++) + for (qc = 0; qc < Q_COUNTERS; qc++) + sb_quota->q[qtype].c[qc].timelimit = + cpu_to_le32(7 * 24 * 60 * 60); + } + + return sb_quota; +} + static void bch2_sb_quota_read(struct bch_fs *c) { struct bch_sb_field_quota *sb_quota; @@ -419,22 +558,14 @@ static void bch2_sb_quota_read(struct bch_fs *c) } static int bch2_fs_quota_read_inode(struct btree_trans *trans, - struct btree_iter *iter) + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct bch_subvolume subvolume; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek(iter); - ret = bkey_err(k); - if (ret) - return ret; - - if (!k.k) - return 1; - ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); if (ret) return ret; @@ -463,36 +594,35 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { - unsigned i, qtypes = enabled_qtypes(c); - struct bch_memquota_type *q; + struct bch_sb_field_quota *sb_quota; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret; mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + mutex_unlock(&c->sb_lock); + return -BCH_ERR_ENOSPC_sb_quota; + } + bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_init_type(c, i); - if (ret) - return ret; - } - bch2_trans_init(&trans, c, 0, 0); - 
bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - do { - ret = lockrestart_do(&trans, - bch2_fs_quota_read_inode(&trans, &iter)); - } while (!ret); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + POS_MIN, BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(&trans, &iter, k)); + if (ret) + bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; + return ret; } /* Enable/disable/delete quotas for an entire filesystem: */ @@ -500,6 +630,8 @@ int bch2_fs_quota_read(struct bch_fs *c) static int bch2_quota_enable(struct super_block *sb, unsigned uflags) { struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + int ret = 0; if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -519,6 +651,12 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) return -EINVAL; mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + if (uflags & FS_QUOTA_UDQ_ENFD) SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); @@ -529,9 +667,10 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return bch2_err_class(ret); } static int bch2_quota_disable(struct super_block *sb, unsigned uflags) @@ -643,6 +782,15 @@ static int bch2_quota_set_info(struct super_block *sb, int type, struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; struct bch_memquota_type *q; + int ret = 0; + + if (0) { + struct printbuf buf = PRINTBUF; + + qc_info_to_text(&buf, info); + pr_info("setting:\n%s", buf.buf); + 
printbuf_exit(&buf); + } if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -660,12 +808,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, q = &c->quotas[type]; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - sb_quota = bch2_sb_resize_quota(&c->disk_sb, - sizeof(*sb_quota) / sizeof(u64)); - if (!sb_quota) - return -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; } if (info->i_fieldmask & QC_SPC_TIMER) @@ -687,9 +833,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, bch2_sb_quota_read(c); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return bch2_err_class(ret); } /* Get/set individual quotas: */ @@ -794,6 +941,14 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct bkey_i_quota new_quota; int ret; + if (0) { + struct printbuf buf = PRINTBUF; + + qc_dqblk_to_text(&buf, qdq); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -802,7 +957,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, ret = bch2_trans_do(c, NULL, NULL, 0, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: - __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); return ret; } diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h index 51e4f97..8c67ae1 100644 --- a/libbcachefs/quota.h +++ b/libbcachefs/quota.h @@ -7,7 +7,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota (struct bkey_ops) { \ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index a573fed..17b289b 100644 --- 
a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "errcode.h" #include "extents.h" #include "io.h" #include "move.h" @@ -22,62 +23,70 @@ * returns -1 if it should not be moved, or * device of pointer that should be moved, if known, or INT_MAX if unknown */ -static int __bch2_rebalance_pred(struct bch_fs *c, - struct bkey_s_c k, - struct bch_io_opts *io_opts) +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + unsigned i; + + data_opts->rewrite_ptrs = 0; + data_opts->target = io_opts->background_target; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; if (io_opts->background_compression && - !bch2_bkey_is_incompressible(k)) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + !bch2_bkey_is_incompressible(k)) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached && p.crc.compression_type != bch2_compression_opt_to_type[io_opts->background_compression]) - return p.ptr.dev; + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + } - if (io_opts->background_target) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) - return p.ptr.dev; + if (io_opts->background_target) { + const struct bch_extent_ptr *ptr; + + i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target)) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + } - return -1; + return data_opts->rewrite_ptrs != 0; } void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - atomic64_t *counter; - int dev; + 
struct data_update_opts update_opts = { 0 }; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned i; - dev = __bch2_rebalance_pred(c, k, io_opts); - if (dev < 0) + if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) return; - counter = dev < INT_MAX - ? &bch_dev_bkey_exists(c, dev)->rebalance_work - : &c->rebalance.work_unknown_dev; - - if (atomic64_add_return(k.k->size, counter) == k.k->size) - rebalance_wakeup(c); -} - -static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { - data_opts->target = io_opts->background_target; - data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; - } else { - return DATA_SKIP; + i = 0; + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + if ((1U << i) && update_opts.rewrite_ptrs) + if (atomic64_add_return(k.k->size, + &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == + k.k->size) + rebalance_wakeup(c); + i++; } } @@ -245,9 +254,10 @@ static int bch2_rebalance_thread(void *arg) BTREE_ID_NR, POS_MAX, /* ratelimiting disabled for now */ NULL, /* &r->pd.rate, */ + &move_stats, writepoint_ptr(&c->rebalance_write_point), - rebalance_pred, NULL, - &move_stats); + true, + rebalance_pred, NULL); } return 0; @@ -257,35 +267,48 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); - char h1[21], h2[21]; - bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); - bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); - pr_buf(out, "fullest_dev (%i):\t%s/%s\n", - w.dev_most_full_idx, h1, h2); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); + prt_tab(out); + + prt_human_readable_u64(out, w.dev_most_full_work << 9); + prt_printf(out, "/"); + 
prt_human_readable_u64(out, w.dev_most_full_capacity << 9); + prt_newline(out); + + prt_printf(out, "total work:"); + prt_tab(out); - bch2_hprint(&PBUF(h1), w.total_work << 9); - bch2_hprint(&PBUF(h2), c->capacity << 9); - pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); + prt_human_readable_u64(out, w.total_work << 9); + prt_printf(out, "/"); + prt_human_readable_u64(out, c->capacity << 9); + prt_newline(out); - pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); + prt_printf(out, "rate:"); + prt_tab(out); + prt_printf(out, "%u", r->pd.rate.rate); + prt_newline(out); switch (r->state) { case REBALANCE_WAITING: - pr_buf(out, "waiting\n"); + prt_printf(out, "waiting"); break; case REBALANCE_THROTTLED: - bch2_hprint(&PBUF(h1), + prt_printf(out, "throttled for %lu sec or ", + (r->throttled_until_cputime - jiffies) / HZ); + prt_human_readable_u64(out, (r->throttled_until_iotime - atomic64_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(out, "throttled for %lu sec or %s io\n", - (r->throttled_until_cputime - jiffies) / HZ, - h1); + prt_printf(out, " io"); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n"); + prt_printf(out, "running"); break; } + prt_newline(out); } void bch2_rebalance_stop(struct bch_fs *c) @@ -310,6 +333,7 @@ void bch2_rebalance_stop(struct bch_fs *c) int bch2_rebalance_start(struct bch_fs *c) { struct task_struct *p; + int ret; if (c->rebalance.thread) return 0; @@ -318,9 +342,10 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - if (IS_ERR(p)) { - bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); - return PTR_ERR(p); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); + return ret; } get_task_struct(p); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 543db58..ea7810a 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: 
GPL-2.0 #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" @@ -10,12 +11,14 @@ #include "buckets.h" #include "dirent.h" #include "ec.h" +#include "errcode.h" #include "error.h" #include "fs-common.h" #include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "lru.h" #include "move.h" #include "quota.h" #include "recovery.h" @@ -71,40 +74,119 @@ static int journal_key_cmp(const struct journal_key *l, const struct journal_key return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -size_t bch2_journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { - size_t l = 0, r = journal_keys->nr, m; + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) l = m + 1; else r = m; } - BUG_ON(l < journal_keys->nr && - __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); BUG_ON(l && - __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); return l; } -static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + 
struct bpos pos) { - struct bkey_i *n = iter->keys->d[idx].k; - struct btree_and_journal_iter *biter = - container_of(iter, struct btree_and_journal_iter, journal); - - if (iter->idx > idx || - (iter->idx == idx && - biter->last && - bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) - iter->idx++; + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx < keys->nr && + (k = idx_to_key(keys, *idx), + k->btree_id == btree_id && + k->level == level && + bpos_cmp(k->k->k.p, end_pos) <= 0)) { + if (bpos_cmp(k->k->k.p, pos) >= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + 
struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } } int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, @@ -122,12 +204,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .journal_seq = U32_MAX, }; struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); BUG_ON(test_bit(BCH_FS_RW, &c->flags)); - if (idx < keys->nr && + if (idx < keys->size && journal_key_cmp(&n, &keys->d[idx]) == 0) { if (keys->d[idx].allocated) kfree(keys->d[idx].k); @@ -135,29 +216,40 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, return 0; } + if (idx > keys->gap) + idx -= keys->size - keys->nr; + if (keys->nr == keys->size) { struct journal_keys new_keys = { .nr = keys->nr, - .size = keys->size * 2, - .journal_seq_base = keys->journal_seq_base, + .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); + new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); if (!new_keys.d) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); return -ENOMEM; } + /* Since @keys was full, there was no gap: */ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); kvfree(keys->d); *keys = new_keys; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; } - array_insert_item(keys->d, keys->nr, idx, n); + journal_iters_move_gap(c, keys->gap, idx); - list_for_each_entry(iter, &c->journal_iters, list) - journal_iter_fix(c, iter, idx); + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; + + journal_iters_fix(c); return 0; } @@ -201,34 
+293,37 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, btree, level, pos); - if (idx < keys->nr && + if (idx < keys->size && keys->d[idx].btree_id == btree && keys->d[idx].level == level && !bpos_cmp(keys->d[idx].k->k.p, pos)) keys->d[idx].overwritten = true; } -static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = iter->keys->d + iter->idx; - while (k < iter->keys->d + iter->keys->nr && + while (k < iter->keys->d + iter->keys->size && k->btree_id == iter->btree_id && k->level == iter->level) { if (!k->overwritten) - return k->k; + return bkey_i_to_s_c(k->k); - iter->idx++; + bch2_journal_iter_advance(iter); k = iter->keys->d + iter->idx; } - return NULL; -} - -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->nr) - iter->idx++; + return bkey_s_c_null; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -260,71 +355,49 @@ static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) { - switch (iter->last) { - case none: - break; - case btree: - bch2_journal_iter_advance_btree(iter); - break; - case journal: - bch2_journal_iter_advance(&iter->journal); - break; - } - - iter->last = none; + if (!bpos_cmp(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); } struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c ret; - - while (1) { - struct bkey_s_c btree_k = - 
bch2_journal_iter_peek_btree(iter); - struct bkey_s_c journal_k = - bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; - if (btree_k.k && journal_k.k) { - int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p); + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_cmp(btree_k.k->p, iter->pos) < 0) + bch2_journal_iter_advance_btree(iter); - if (!cmp) - bch2_journal_iter_advance_btree(iter); + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_cmp(journal_k.k->p, iter->pos) < 0) + bch2_journal_iter_advance(&iter->journal); - iter->last = cmp < 0 ? btree : journal; - } else if (btree_k.k) { - iter->last = btree; - } else if (journal_k.k) { - iter->last = journal; - } else { - iter->last = none; - return bkey_s_c_null; - } + ret = journal_k.k && + (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0) + ? journal_k + : btree_k; - ret = iter->last == journal ? journal_k : btree_k; + if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) + ret = bkey_s_c_null; - if (iter->b && - bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) { - iter->journal.idx = iter->journal.keys->nr; - iter->last = none; - return bkey_s_c_null; + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; } - - if (!bkey_deleted(ret.k)) - break; - - bch2_btree_and_journal_iter_advance(iter); + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; } return ret; } -struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) -{ - bch2_btree_and_journal_iter_advance(iter); - - return bch2_btree_and_journal_iter_peek(iter); -} - void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) { bch2_journal_iter_exit(&iter->journal); @@ -342,6 +415,8 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter iter->node_iter = 
node_iter; bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; } /* @@ -361,16 +436,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i /* sort and dedup all keys in the journal: */ -void bch2_journal_entries_free(struct list_head *list) +void bch2_journal_entries_free(struct bch_fs *c) { - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); } /* @@ -390,66 +465,68 @@ void bch2_journal_keys_free(struct journal_keys *keys) { struct journal_key *i; + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + for (i = keys->d; i < keys->d + keys->nr; i++) if (i->allocated) kfree(i->k); kvfree(keys->d); keys->d = NULL; - keys->nr = 0; + keys->nr = keys->gap = keys->size = 0; } -static struct journal_keys journal_keys_sort(struct list_head *journal_entries) +static int journal_keys_sort(struct bch_fs *c) { - struct journal_replay *i; + struct genradix_iter iter; + struct journal_replay *i, **_i; struct jset_entry *entry; struct bkey_i *k, *_n; - struct journal_keys keys = { NULL }; + struct journal_keys *keys = &c->journal_keys; struct journal_key *src, *dst; size_t nr_keys = 0; - if (list_empty(journal_entries)) - return keys; + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; - list_for_each_entry(i, journal_entries, list) { - if (i->ignore) + if (!i || i->ignore) continue; - if (!keys.journal_seq_base) - keys.journal_seq_base = le64_to_cpu(i->j.seq); - for_each_jset_key(k, _n, entry, 
&i->j) nr_keys++; } - keys.size = roundup_pow_of_two(nr_keys); + if (!nr_keys) + return 0; - keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); - if (!keys.d) - goto err; + keys->size = roundup_pow_of_two(nr_keys); - list_for_each_entry(i, journal_entries, list) { - if (i->ignore) - continue; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) + return -ENOMEM; - BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; for_each_jset_key(k, _n, entry, &i->j) - keys.d[keys.nr++] = (struct journal_key) { + keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, - .journal_seq = le64_to_cpu(i->j.seq) - - keys.journal_seq_base, + .journal_seq = le64_to_cpu(i->j.seq), .journal_offset = k->_data - i->j._data, }; } - sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - src = dst = keys.d; - while (src < keys.d + keys.nr) { - while (src + 1 < keys.d + keys.nr && + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && src[0].btree_id == src[1].btree_id && src[0].level == src[1].level && !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) @@ -458,9 +535,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) *dst++ = *src++; } - keys.nr = dst - keys.d; -err: - return keys; + keys->nr = dst - keys->d; + keys->gap = keys->nr; + return 0; } /* journal replay: */ @@ -468,7 +545,8 @@ err: static void replay_now_at(struct journal *j, u64 seq) { BUG_ON(seq < j->replay_journal_seq); - BUG_ON(seq > j->replay_journal_seq_end); + + seq = min(seq, j->replay_journal_seq_end); while (j->replay_journal_seq < seq) bch2_journal_pin_put(j, j->replay_journal_seq++); @@ -519,6 +597,9 @@ static int bch2_journal_replay(struct bch_fs *c) 
size_t i; int ret; + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); if (!keys_sorted) return -ENOMEM; @@ -530,22 +611,19 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); - if (keys->nr) - replay_now_at(j, keys->journal_seq_base); - for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; cond_resched(); - if (!k->allocated) - replay_now_at(j, keys->journal_seq_base + k->journal_seq); + replay_now_at(j, k->journal_seq); ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), + (!k->allocated + ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved + : 0), bch2_journal_replay_key(&trans, k)); if (ret) { bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", @@ -560,6 +638,9 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); ret = bch2_journal_error(j); + + if (keys->nr && !ret) + bch2_journal_log_msg(&c->journal, "journal replay finished"); err: kvfree(keys_sorted); return ret; @@ -630,7 +711,6 @@ static int journal_replay_entry_early(struct bch_fs *c, unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); - ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); @@ -670,10 +750,8 @@ static int journal_replay_entry_early(struct bch_fs *c, } static int journal_replay_early(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct list_head *journal) + struct bch_sb_field_clean *clean) { - struct journal_replay *i; struct jset_entry *entry; int ret; @@ -686,8 +764,13 @@ static int 
journal_replay_early(struct bch_fs *c, return ret; } } else { - list_for_each_entry(i, journal, list) { - if (i->ignore) + struct genradix_iter iter; + struct journal_replay *i, **_i; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; vstruct_for_each(&i->j, entry) { @@ -742,6 +825,8 @@ static int verify_superblock_clean(struct bch_fs *c, { unsigned i; struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret = 0; if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -754,7 +839,6 @@ static int verify_superblock_clean(struct bch_fs *c, } for (i = 0; i < BTREE_ID_NR; i++) { - char buf1[200], buf2[200]; struct bkey_i *k1, *k2; unsigned l1 = 0, l2 = 0; @@ -764,6 +848,19 @@ static int verify_superblock_clean(struct bch_fs *c, if (!k1 && !k2) continue; + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + prt_printf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + prt_printf(&buf2, "(none)"); + mustfix_fsck_err_on(!k1 || !k2 || IS_ERR(k1) || IS_ERR(k2) || @@ -773,10 +870,12 @@ static int verify_superblock_clean(struct bch_fs *c, "superblock btree root %u doesn't match journal after clean shutdown\n" "sb: l=%u %s\n" "journal: l=%u %s\n", i, - l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), - l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); + l1, buf1.buf, + l2, buf2.buf); } fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -803,7 +902,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) return ERR_PTR(-ENOMEM); } - ret = bch2_sb_clean_validate(c, clean, READ); + ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { mutex_unlock(&c->sb_lock); return ERR_PTR(ret); @@ -817,6 +916,19 @@ fsck_err: return ERR_PTR(ret); } +static bool 
btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + return true; + default: + return false; + } +} + static int read_btree_roots(struct bch_fs *c) { unsigned i; @@ -828,14 +940,14 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (i == BTREE_ID_alloc && + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } if (r->error) { - __fsck_err(c, i == BTREE_ID_alloc + __fsck_err(c, btree_id_is_alloc(i) ? FSCK_CAN_IGNORE : 0, "invalid btree root %s", bch2_btree_ids[i]); @@ -845,7 +957,8 @@ static int read_btree_roots(struct bch_fs *c) ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { - __fsck_err(c, i == BTREE_ID_alloc + __fsck_err(c, + btree_id_is_alloc(i) ? FSCK_CAN_IGNORE : 0, "error reading btree root %s", bch2_btree_ids[i]); @@ -881,7 +994,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) if (ret) return ret; - bkey_subvolume_init(&root_volume.k_i); root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; root_volume.v.flags = 0; @@ -974,28 +1086,24 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.fix_errors = FSCK_OPT_YES; } - if (!c->replicas.entries || - c->opts.rebuild_replicas) { - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { - bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_version_backpointers) { + bch_info(c, "version prior to backpointers, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { - bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); - c->opts.version_upgrade = 
true; - c->opts.fsck = true; - } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { - bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); - c->opts.version_upgrade = true; + } else if (c->sb.version < bcachefs_metadata_version_inode_v3) { + bch_info(c, "version prior to inode_v3, upgrade required"); + c->opts.version_upgrade = true; } } + if (c->opts.fsck && c->opts.norecovery) { + bch_err(c, "cannot select both norecovery and fsck"); + ret = -EINVAL; + goto err; + } + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -1003,17 +1111,17 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct journal_replay *i; + struct genradix_iter iter; + struct journal_replay **i; bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &c->journal_entries, - &blacklist_seq, &journal_seq); + ret = bch2_journal_read(c, &blacklist_seq, &journal_seq); if (ret) goto err; - list_for_each_entry_reverse(i, &c->journal_entries, list) - if (!i->ignore) { - last_journal_entry = &i->j; + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i && !(*i)->ignore) { + last_journal_entry = &(*i)->j; break; } @@ -1031,11 +1139,9 @@ int bch2_fs_recovery(struct bch_fs *c) goto use_clean; } - c->journal_keys = journal_keys_sort(&c->journal_entries); - if (!c->journal_keys.d) { - ret = -ENOMEM; + ret = journal_keys_sort(c); + if (ret) goto err; - } if (c->sb.clean && last_journal_entry) { ret = verify_superblock_clean(c, &clean, @@ -1047,7 +1153,7 @@ int bch2_fs_recovery(struct bch_fs *c) use_clean: if (!clean) { bch_err(c, "no superblock clean section found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + ret = -BCH_ERR_fsck_repair_impossible; goto err; } @@ -1061,7 +1167,7 @@ use_clean: zero_out_btree_mem_ptr(&c->journal_keys); - ret = journal_replay_early(c, clean, &c->journal_entries); + ret = journal_replay_early(c, clean); if (ret) goto err; 
@@ -1084,11 +1190,24 @@ use_clean: } } - ret = bch2_fs_journal_start(&c->journal, journal_seq, - &c->journal_entries); + /* + * note: cmd_list_journal needs the blacklist table fully up to date so + * it can asterisk ignored journal entries: + */ + if (c->opts.read_journal_only) + goto out; + + ret = bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type && !c->sb.clean) + atomic64_add(1 << 16, &c->key_version); + ret = read_btree_roots(c); if (ret) goto err; @@ -1097,7 +1216,7 @@ use_clean: err = "error reading allocation information"; down_read(&c->gc_lock); - ret = bch2_alloc_read(c, false, false); + ret = bch2_alloc_read(c); up_read(&c->gc_lock); if (ret) @@ -1111,51 +1230,98 @@ use_clean: goto err; bch_verbose(c, "stripes_read done"); - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - - /* - * If we're not running fsck, this ensures bch2_fsck_err() calls are - * instead interpreted as bch2_inconsistent_err() calls: - */ - if (!c->opts.fsck) - set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_stripes_heap_start(c); - if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || - test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + if (c->opts.fsck) { bool metadata_only = c->opts.norecovery; bch_info(c, "checking allocations"); - err = "error in mark and sweep"; + err = "error checking allocations"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; bch_verbose(c, "done checking allocations"); - } - bch2_stripes_heap_start(c); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; + ret = bch2_check_alloc_info(c); + if (ret) + 
goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type && !c->sb.clean) - atomic64_add(1 << 16, &c->key_version); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); - if (c->opts.norecovery) - goto out; + bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) + goto err; + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); + + bch_info(c, "checking lrus"); + err = "error checking lrus"; + ret = bch2_check_lrus(c); + if (ret) + goto err; + bch_verbose(c, "done checking lrus"); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + + bch_info(c, "checking backpointers to alloc keys"); + err = "error checking backpointers to alloc keys"; + ret = bch2_check_btree_backpointers(c); + if (ret) + goto err; + bch_verbose(c, "done checking backpointers to alloc keys"); + + bch_info(c, "checking backpointers to extents"); + err = "error checking backpointers to extents"; + ret = bch2_check_backpointers_to_extents(c); + if (ret) + goto err; + bch_verbose(c, "done checking backpointers to extents"); + + bch_info(c, "checking extents to backpointers"); + err = "error checking extents to backpointers"; + ret = bch2_check_extents_to_backpointers(c); + if (ret) + goto err; + bch_verbose(c, "done checking extents to backpointers"); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); + + bch_info(c, "checking alloc to lru refs"); + err = "error checking alloc to lru refs"; + ret = bch2_check_alloc_to_lru_refs(c); + if (ret) + goto err; + bch_verbose(c, "done checking alloc to lru refs"); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + } else { + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + 
set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + if (c->opts.norecovery) + goto out; + + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) + goto err; + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); + } - bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; - ret = bch2_journal_replay(c); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); if (ret) goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); @@ -1259,13 +1425,19 @@ out: if (!c->opts.keep_journal) { bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(&c->journal_entries); + bch2_journal_entries_free(c); } kfree(clean); + + if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { + bch2_fs_read_write_early(c); + bch2_delete_dead_snapshots_async(c); + } + if (ret) - bch_err(c, "Error in recovery: %s (%i)", err, ret); + bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); else - bch_verbose(c, "ret %i", ret); + bch_verbose(c, "ret %s", bch2_err_str(ret)); return ret; err: fsck_err: @@ -1280,7 +1452,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct qstr lostfound = QSTR("lost+found"); const char *err = "cannot allocate memory"; struct bch_dev *ca; - LIST_HEAD(journal); unsigned i; int ret; @@ -1290,6 +1461,9 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + if (c->sb.version < bcachefs_metadata_version_inode_v3) + c->opts.version_upgrade = true; + if (c->opts.version_upgrade) { 
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); @@ -1297,13 +1471,16 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); + for_each_online_member(ca, c, i) + bch2_dev_usage_init(ca); + err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); @@ -1317,7 +1494,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_fs_journal_start(&c->journal, 1, &journal); + bch2_fs_journal_start(&c->journal, 1); bch2_journal_set_replay_done(&c->journal); err = "error going read-write"; @@ -1329,6 +1506,7 @@ int bch2_fs_initialize(struct bch_fs *c) * Write out the superblock and journal buckets, now that we can do * btree updates */ + bch_verbose(c, "marking superblocks"); err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); @@ -1340,6 +1518,12 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } + bch_verbose(c, "initializing freespace"); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) @@ -1356,7 +1540,7 @@ int bch2_fs_initialize(struct bch_fs *c) S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - bch2_inode_pack(c, &packed_inode, &root_inode); + bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; err = "error creating root directory"; diff 
--git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index 21bdad9..8c0348e 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -#define for_each_journal_key(keys, i) \ - for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) - struct journal_iter { struct list_head list; enum btree_id btree_id; @@ -23,16 +20,14 @@ struct btree_and_journal_iter { struct bkey unpacked; struct journal_iter journal; - - enum last_key_returned { - none, - btree, - journal, - } last; + struct bpos pos; + bool at_end; }; -size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, - unsigned, struct bpos); +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); @@ -45,7 +40,6 @@ void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, @@ -56,7 +50,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_free(struct journal_keys *); -void bch2_journal_entries_free(struct list_head *); +void bch2_journal_entries_free(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index c8d6d73..d5c14bb 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -25,18 
+25,25 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - if (bkey_val_bytes(p.k) != sizeof(*p.v)) - return "incorrect value size"; + if (bkey_val_bytes(p.k) != sizeof(*p.v)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(p.k), sizeof(*p.v)); + return -EINVAL; + } if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && - le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) - return "idx < front_pad"; + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { + prt_printf(err, "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, @@ -44,7 +51,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - pr_buf(out, "idx %llu front_pad %u back_pad %u", + prt_printf(out, "idx %llu front_pad %u back_pad %u", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); @@ -70,14 +77,18 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ -const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - if (bkey_val_bytes(r.k) < sizeof(*r.v)) - return "incorrect value size"; + if (bkey_val_bytes(r.k) < sizeof(*r.v)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(r.k), sizeof(*r.v)); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return 
bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -85,7 +96,7 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); bch2_bkey_ptrs_to_text(out, c, k); } @@ -98,14 +109,37 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); } +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + return 0; + } + } + + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); +} + /* indirect inline data */ -const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, - struct bkey_s_c k) +int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) - return "incorrect value size"; - return NULL; + if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); + return -EINVAL; + } + + return 0; } void bch2_indirect_inline_data_to_text(struct printbuf *out, @@ -114,11 +148,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); unsigned datalen = bkey_inline_data_bytes(k.k); - pr_buf(out, "refcount %llu datalen %u: %*phN", + prt_printf(out, "refcount %llu 
datalen %u: %*phN", le64_to_cpu(d.v->refcount), datalen, min(datalen, 32U), d.v->data); } +int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_indirect_inline_data *r = + bkey_i_to_indirect_inline_data(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + } + } + + return 0; +} + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *orig) @@ -229,7 +282,7 @@ s64 bch2_remap_range(struct bch_fs *c, u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return -EROFS; bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -246,7 +299,8 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); - while ((ret == 0 || ret == -EINTR) && + while ((ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) && bkey_cmp(dst_iter.pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; @@ -356,7 +410,7 @@ s64 bch2_remap_range(struct bch_fs *c, } bch2_trans_iter_exit(&trans, &inode_iter); - } while (ret2 == -EINTR); + } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&new_src, c); diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index 3745873..f9848dc 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -2,7 +2,8 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool 
bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -10,27 +11,39 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ - .key_merge = bch2_reflink_p_merge, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ } -const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ } -const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, - struct bkey_s_c); +int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, + unsigned); #define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ } static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 96994b7..fcf73d7 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -36,20 +36,36 @@ 
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } +void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + prt_printf(out, "%s", bch2_data_types[e->data_type]); + else + prt_printf(out, "(invalid data type %u)", e->data_type); + + prt_printf(out, ": %u [", e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + prt_printf(out, i ? " %u" : "%u", e->devs[i]); + prt_printf(out, "]"); +} + void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry *e) { unsigned i; if (e->data_type < BCH_DATA_NR) - pr_buf(out, "%s", bch2_data_types[e->data_type]); + prt_printf(out, "%s", bch2_data_types[e->data_type]); else - pr_buf(out, "(invalid data type %u)", e->data_type); + prt_printf(out, "(invalid data type %u)", e->data_type); - pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); + prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); for (i = 0; i < e->nr_devs; i++) - pr_buf(out, i ? " %u" : "%u", e->devs[i]); - pr_buf(out, "]"); + prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); + prt_printf(out, "]"); } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -60,7 +76,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, for_each_cpu_replicas_entry(r, e) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); first = false; bch2_replicas_entry_to_text(out, e); @@ -462,7 +478,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { - ret = -ENOSPC; + ret = -ENOMEM; goto err; } @@ -471,10 +487,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) } } - if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { - ret = -ENOSPC; + ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (ret) goto err; - } ret = replicas_table_update(c, &c->replicas_gc); err: @@ -577,10 +592,9 @@ retry: bch2_cpu_replicas_sort(&new); - if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { - ret = -ENOSPC; + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + if (ret) goto err; - } ret = replicas_table_update(c, &new); err: @@ -735,7 +749,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); @@ -780,7 +794,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_resize_replicas(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_get_replicas(c->disk_sb.sb); @@ -818,27 +832,27 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, cpu_replicas_entry(cpu_r, i); if (e->data_type >= BCH_DATA_NR) { - pr_buf(err, "invalid data type in entry "); + prt_printf(err, 
"invalid data type in entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } if (!e->nr_devs) { - pr_buf(err, "no devices in entry "); + prt_printf(err, "no devices in entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } if (e->nr_required > 1 && e->nr_required >= e->nr_devs) { - pr_buf(err, "bad nr_required in entry "); + prt_printf(err, "bad nr_required in entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } for (j = 0; j < e->nr_devs; j++) if (!bch2_dev_exists(sb, mi, e->devs[j])) { - pr_buf(err, "invalid device %u in entry ", e->devs[j]); + prt_printf(err, "invalid device %u in entry ", e->devs[j]); bch2_replicas_entry_to_text(err, e); return -EINVAL; } @@ -850,7 +864,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); if (!memcmp(e, n, cpu_r->entry_size)) { - pr_buf(err, "duplicate replicas entry "); + prt_printf(err, "duplicate replicas entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } @@ -860,7 +874,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, return 0; } -static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f, +static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); @@ -885,19 +899,20 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, for_each_replicas_entry(r, e) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); first = false; bch2_replicas_entry_to_text(out, e); } + prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_validate_replicas, + .validate = bch2_sb_replicas_validate, .to_text = bch2_sb_replicas_to_text, }; -static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, +static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { 
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); @@ -912,8 +927,27 @@ static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field * return ret; } +static void bch2_sb_replicas_v0_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_entry_v0 *e; + bool first = true; + + for_each_replicas_entry(sb_r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_v0_to_text(out, e); + } + prt_newline(out); +} + const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { - .validate = bch2_sb_validate_replicas_v0, + .validate = bch2_sb_replicas_v0_validate, + .to_text = bch2_sb_replicas_v0_to_text, }; /* Query replicas: */ @@ -954,11 +988,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (dflags & ~flags) { if (print) { - char buf[100]; + struct printbuf buf = PRINTBUF; - bch2_replicas_entry_to_text(&PBUF(buf), e); + bch2_replicas_entry_to_text(&buf, e); bch_err(c, "insufficient devices online (%u) for replicas entry %s", - nr_online, buf); + nr_online, buf.buf); + printbuf_exit(&buf); } ret = false; break; @@ -970,19 +1005,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return ret; } -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { - struct bch_replicas_entry *e; - unsigned i, ret = 0; + struct bch_sb_field_replicas *replicas; + struct bch_sb_field_replicas_v0 *replicas_v0; + unsigned i, data_has = 0; + + replicas = bch2_sb_get_replicas(sb); + replicas_v0 = bch2_sb_get_replicas_v0(sb); + + if (replicas) { + struct bch_replicas_entry *r; + + for_each_replicas_entry(replicas, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } else if (replicas_v0) { + struct bch_replicas_entry_v0 *r; + + 
for_each_replicas_entry_v0(replicas_v0, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } - percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) - for (i = 0; i < e->nr_devs; i++) - if (e->devs[i] == ca->dev_idx) - ret |= 1 << e->data_type; + return data_has; +} - percpu_up_read(&c->mark_lock); +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned ret; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&c->sb_lock); return ret; } diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index d237d7c..cc34b38 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_REPLICAS_H #define _BCACHEFS_REPLICAS_H +#include "bkey.h" #include "eytzinger.h" #include "replicas_types.h" @@ -64,6 +65,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, unsigned, bool); +unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); diff --git a/libbcachefs/siphash.c b/libbcachefs/siphash.c index c062edb..dc1a27c 100644 --- a/libbcachefs/siphash.c +++ b/libbcachefs/siphash.c @@ -160,7 +160,7 @@ u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); memset(ctx, 0, sizeof(*ctx)); - return (r); + return r; } u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 57d6367..6178ae6 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -144,7 +144,9 @@ struct bch_hash_desc { static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) { return k.k->type == desc.key_type && - (!desc.is_visible || 
desc.is_visible(inum, k)); + (!desc.is_visible || + !inum.inum || + desc.is_visible(inum, k)); } static __always_inline int @@ -163,12 +165,10 @@ bch2_hash_lookup(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inum.inum) - break; - if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) return 0; @@ -199,18 +199,15 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inum.inum) - break; - + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) if (!is_visible_key(desc, inum, k)) return 0; - } bch2_trans_iter_exit(trans, iter); - return ret ?: -ENOSPC; + return ret ?: -BCH_ERR_ENOSPC_str_hash_create; } static __always_inline @@ -244,30 +241,25 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, - struct bkey_i *insert, int flags) +int bch2_hash_set_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + int flags, + int update_flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - - for_each_btree_key_norestart(trans, iter, desc.btree_id, - SPOS(inum.inum, + 
for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), + POS(insert->k.p.inode, U64_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter.pos.inode != inum.inum) - break; - if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -285,7 +277,7 @@ int bch2_hash_set(struct btree_trans *trans, } if (!ret) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_str_hash_create; out: bch2_trans_iter_exit(trans, &slot); bch2_trans_iter_exit(trans, &iter); @@ -310,6 +302,26 @@ not_found: goto out; } +static __always_inline +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, + struct bkey_i *insert, int flags) +{ + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + insert->k.p.inode = inum.inum; + + return bch2_hash_set_snapshot(trans, desc, info, inum, + snapshot, insert, flags, 0); +} + static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 6960332..8c98bac 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -3,21 +3,19 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "errcode.h" #include "error.h" #include "fs.h" #include "subvolume.h" /* Snapshot tree: */ -static void bch2_delete_dead_snapshots_work(struct work_struct *); -static void bch2_delete_dead_snapshots(struct bch_fs *); - void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", BCH_SNAPSHOT_SUBVOL(s.v), 
BCH_SNAPSHOT_DELETED(s.v), le32_to_cpu(s.v->parent), @@ -26,39 +24,55 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->subvol)); } -const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || - bkey_cmp(k.k->p, POS(0, 1)) < 0) - return "bad pos"; + bkey_cmp(k.k->p, POS(0, 1)) < 0) { + prt_printf(err, "bad pos"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) - return "bad val size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { + prt_printf(err, "bad val size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); + return -EINVAL; + } s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - if (id && id <= k.k->p.offset) - return "bad parent node"; + if (id && id <= k.k->p.offset) { + prt_printf(err, "bad parent node (%u <= %llu)", + id, k.k->p.offset); + return -EINVAL; + } - if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) - return "children not normalized"; + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { + prt_printf(err, "children not normalized"); + return -EINVAL; + } if (s.v->children[0] && - s.v->children[0] == s.v->children[1]) - return "duplicate child nodes"; + s.v->children[0] == s.v->children[1]) { + prt_printf(err, "duplicate child nodes"); + return -EINVAL; + } for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - if (id >= k.k->p.offset) - return "bad child node"; + if (id >= k.k->p.offset) { + prt_printf(err, "bad child node (%u >= %llu)", + id, k.k->p.offset); + return -EINVAL; + } } - return NULL; + return 0; } int bch2_mark_snapshot(struct btree_trans *trans, @@ -118,7 +132,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id) if (!id) return 0; - ret = lockrestart_do(trans, 
snapshot_lookup(trans, id, &v)); + ret = snapshot_lookup(trans, id, &v); if (ret == -ENOENT) bch_err(trans->c, "snapshot node %u not found", id); if (ret) @@ -127,156 +141,206 @@ static int snapshot_live(struct btree_trans *trans, u32 id) return !BCH_SNAPSHOT_DELETED(&v); } -static int bch2_snapshots_set_equiv(struct btree_trans *trans) +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; + unsigned i, nr_live = 0, live_idx = 0; struct bkey_s_c_snapshot snap; - unsigned i; - int ret; + u32 id = k.k->p.offset, child[2]; - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - u32 id = k.k->p.offset, child[2]; - unsigned nr_live = 0, live_idx; + if (k.k->type != KEY_TYPE_snapshot) + return 0; - if (k.k->type != KEY_TYPE_snapshot) - continue; + snap = bkey_s_c_to_snapshot(k); - snap = bkey_s_c_to_snapshot(k); - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); - for (i = 0; i < 2; i++) { - ret = snapshot_live(trans, child[i]); - if (ret < 0) - break; - - if (ret) - live_idx = i; - nr_live += ret; - } + for (i = 0; i < 2; i++) { + int ret = snapshot_live(trans, child[i]); + if (ret < 0) + return ret; - snapshot_t(c, id)->equiv = nr_live == 1 - ? snapshot_t(c, child[live_idx])->equiv - : id; + if (ret) + live_idx = i; + nr_live += ret; } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - bch_err(c, "error walking snapshots: %i", ret); - return ret; + snapshot_t(c, id)->equiv = nr_live == 1 + ? 
snapshot_t(c, child[live_idx])->equiv + : id; + return 0; } /* fsck: */ -static int bch2_snapshot_check(struct btree_trans *trans, - struct bkey_s_c_snapshot s) +static int check_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s; struct bch_subvolume subvol; struct bch_snapshot v; + struct printbuf buf = PRINTBUF; + bool should_have_subvol; u32 i, id; - int ret; - - id = le32_to_cpu(s.v->subvol); - ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); - if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", - s.k->p.offset, id); - if (ret) - return ret; + int ret = 0; - if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { - bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - s.k->p.offset); - return -EINVAL; - } + if (k.k->type != KEY_TYPE_snapshot) + return 0; + s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); if (id) { - ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + ret = snapshot_lookup(trans, id, &v); if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", - s.k->p.offset, id); + bch_err(c, "snapshot with nonexistent parent:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); if (ret) - return ret; + goto err; if (le32_to_cpu(v.children[0]) != s.k->p.offset && le32_to_cpu(v.children[1]) != s.k->p.offset) { - bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", + bch_err(c, "snapshot parent %u missing pointer to child %llu", id, s.k->p.offset); - return -EINVAL; + ret = -EINVAL; + goto err; } } for (i = 0; i < 2 && s.v->children[i]; i++) { id = le32_to_cpu(s.v->children[i]); - ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + ret = snapshot_lookup(trans, id, &v); if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent child %u", + bch_err(c, 
"snapshot node %llu has nonexistent child %u", s.k->p.offset, id); if (ret) - return ret; + goto err; if (le32_to_cpu(v.parent) != s.k->p.offset) { - bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", id, le32_to_cpu(v.parent), s.k->p.offset); - return -EINVAL; + ret = -EINVAL; + goto err; } } - return 0; + should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && + !BCH_SNAPSHOT_DELETED(s.v); + + if (should_have_subvol) { + id = le32_to_cpu(s.v->subvol); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + if (ret == -ENOENT) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + if (ret) + goto err; + + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { + if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); + + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + bkey_reassemble(&u->k_i, s.s_c); + u->v.subvol = 0; + ret = bch2_trans_update(trans, iter, &u->k_i, 0); + if (ret) + goto err; + } + } + + if (BCH_SNAPSHOT_DELETED(s.v)) + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: +fsck_err: + printbuf_exit(&buf); + return ret; } -int bch2_fs_snapshots_check(struct bch_fs *c) +int bch2_fs_check_snapshots(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bch_snapshot s; - unsigned id; int ret; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_snapshots, POS_MIN, + 
BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot(&trans, &iter, k)); + + if (ret) + bch_err(c, "error %i checking snapshots", ret); + + bch2_trans_exit(&trans); + return ret; +} - ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume subvol; + struct bch_snapshot snapshot; + unsigned snapid; + int ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + ret = snapshot_lookup(trans, snapid, &snapshot); + + if (ret == -ENOENT) + bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, snapid); + if (ret) + return ret; + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(trans->c, "error deleting subvolume %llu: %s", + iter->pos.offset, bch2_err_str(ret)); if (ret) - break; + return ret; } - bch2_trans_iter_exit(&trans, &iter); - if (ret) { - bch_err(c, "error %i checking snapshots", ret); - goto err; - } + return 0; +} + +int bch2_fs_check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter, k)); - for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; -again_2: - id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); - ret = snapshot_lookup(&trans, id, &s); - - if (ret == -EINTR) { - k = bch2_btree_iter_peek(&iter); - goto again_2; - } else if (ret == -ENOENT) - bch_err(c, "subvolume %llu 
points to nonexistent snapshot %u", - k.k->p.offset, id); - else if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); -err: bch2_trans_exit(&trans); + return ret; } @@ -290,49 +354,19 @@ int bch2_fs_snapshots_start(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - bool have_deleted = false; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) - break; - - if (k.k->type != KEY_TYPE_snapshot) { - bch_err(c, "found wrong key type %u in snapshot node table", - k.k->type); - continue; - } - - if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) - have_deleted = true; - - ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, k)); - if (ret) - goto err; - - ret = bch2_snapshots_set_equiv(&trans); - if (ret) - goto err; -err: bch2_trans_exit(&trans); - if (!ret && have_deleted) { - bch_info(c, "restarting deletion of dead snapshots"); - if (c->opts.fsck) { - bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); - } else { - bch2_delete_dead_snapshots(c); - } - } - + if (ret) + bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); return ret; } @@ -369,8 +403,10 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) goto err; bkey_reassemble(&s->k_i, k); - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); if (ret) goto err; @@ -481,7 +517,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, goto err; if (!k.k || !k.k->p.offset) { - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_snapshot_create; goto err; } @@ -534,6 +570,7 @@ int 
bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.children[0] = cpu_to_le32(new_snapids[0]); n->v.children[1] = cpu_to_le32(new_snapids[1]); + n->v.subvol = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); ret = bch2_trans_update(trans, &iter, &n->k_i, 0); if (ret) @@ -544,141 +581,100 @@ err: return ret; } -static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) { - BUG_ON(snapshot_list_has_id(s, id)); - - if (s->nr == s->size) { - size_t new_size = max(8U, s->size * 2); - void *n = krealloc(s->d, - new_size * sizeof(s->d[0]), - GFP_KERNEL); - if (!n) { - pr_err("error allocating snapshot ID list"); - return -ENOMEM; - } + struct bch_fs *c = trans->c; + u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; - s->d = n; - s->size = new_size; - }; + if (bkey_cmp(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; - s->d[s->nr++] = id; - return 0; + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); + } } -static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, - struct snapshot_id_list *deleted, - enum btree_id btree_id) +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct snapshot_id_list equiv_seen = { 0 }; - struct bpos last_pos = POS_MIN; - int ret = 0; + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; - /* - * XXX: We should also delete whiteouts that no longer overwrite - * anything - */ + if (k.k->type != KEY_TYPE_snapshot) + return 0; - bch2_trans_iter_init(trans, &iter, 
btree_id, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(&iter)).k) && - !(ret = bkey_err(k))) { - u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; - - if (bkey_cmp(k.k->p, last_pos)) - equiv_seen.nr = 0; - last_pos = k.k->p; - - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(&equiv_seen, equiv)) { - if (btree_id == BTREE_ID_inodes && - bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) - continue; - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(trans, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); - if (ret) - break; - } else { - ret = snapshot_id_add(&equiv_seen, equiv); - if (ret) - break; - } + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); - kfree(equiv_seen.d); + ret = snapshot_live(trans, children[0]) ?: + snapshot_live(trans, children[1]); + if (ret < 0) + return ret; - return ret; + if (!ret) + return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return 0; } -static void bch2_delete_dead_snapshots_work(struct work_struct *work) +int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_snapshot snap; - struct snapshot_id_list deleted = { 0 }; - u32 i, id, children[2]; + snapshot_id_list deleted = { 0 }; + u32 i, id; int ret = 0; + if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + return 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "error 
deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); + return ret; + } + } + bch2_trans_init(&trans, c, 0, 0); /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - continue; - - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); - - ret = snapshot_live(&trans, children[0]) ?: - snapshot_live(&trans, children[1]); - if (ret < 0) - break; - if (ret) - continue; - - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); - if (ret) { - bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(&trans, &iter, k)); if (ret) { - bch_err(c, "error walking snapshots: %i", ret); + bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); goto err; } - ret = bch2_snapshots_set_equiv(&trans); - if (ret) + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(&trans, k)); + if (ret) { + bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); goto err; + } for_each_btree_key(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { @@ -687,7 +683,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_id_add(&deleted, k.k->p.offset); + ret = snapshot_list_add(c, &deleted, k.k->p.offset); if (ret) break; } @@ -695,39 +691,59 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) 
bch2_trans_iter_exit(&trans, &iter); if (ret) { - bch_err(c, "error walking snapshots: %i", ret); + bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); goto err; } for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + if (!btree_type_has_snapshots(id)) continue; - ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); + ret = for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); + + darray_exit(&equiv_seen); + if (ret) { - bch_err(c, "error deleting snapshot keys: %i", ret); + bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); goto err; } } for (i = 0; i < deleted.nr; i++) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, deleted.d[i])); + ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.data[i])); if (ret) { - bch_err(c, "error deleting snapshot %u: %i", - deleted.d[i], ret); + bch_err(c, "error deleting snapshot %u: %s", + deleted.data[i], bch2_err_str(ret)); goto err; } } + + clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); err: - kfree(deleted.d); + darray_exit(&deleted); bch2_trans_exit(&trans); + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + bch2_delete_dead_snapshots(c); percpu_ref_put(&c->writes); } -static void bch2_delete_dead_snapshots(struct bch_fs *c) +void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (unlikely(!percpu_ref_tryget(&c->writes))) + if (!percpu_ref_tryget_live(&c->writes)) return; if (!queue_work(system_long_wq, &c->snapshot_delete_work)) @@ -737,24 +753,35 @@ static void bch2_delete_dead_snapshots(struct bch_fs *c) static int bch2_delete_dead_snapshots_hook(struct btree_trans 
*trans, struct btree_trans_commit_hook *h) { - bch2_delete_dead_snapshots(trans->c); + struct bch_fs *c = trans->c; + + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return 0; + + bch2_delete_dead_snapshots_async(c); return 0; } /* Subvolumes: */ -const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) - return "invalid pos"; - - if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) - return "invalid pos"; + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || + bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { + prt_printf(err, "invalid pos"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) - return "bad val size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, @@ -762,7 +789,7 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - pr_buf(out, "root %llu snapshot id %u", + prt_printf(out, "root %llu snapshot id %u", le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); } @@ -824,7 +851,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) struct bkey_s_c k; struct bkey_s_c_subvolume subvol; struct btree_trans_commit_hook *h; - struct bkey_i *delete; u32 snapid; int ret = 0; @@ -846,19 +872,14 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - ret = PTR_ERR_OR_ZERO(delete); + ret = bch2_btree_delete_at(trans, &iter, 0); if (ret) goto err; - 
bkey_init(&delete->k); - delete->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, delete, 0); + ret = bch2_snapshot_node_set_deleted(trans, snapid); if (ret) goto err; - ret = bch2_snapshot_node_set_deleted(trans, snapid); - h = bch2_trans_kmalloc(trans, sizeof(*h)); ret = PTR_ERR_OR_ZERO(h); if (ret) @@ -875,14 +896,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - struct snapshot_id_list s; + snapshot_id_list s; u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); s = c->snapshots_unlinked; - memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); if (!s.nr) @@ -890,16 +911,16 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) bch2_evict_subvolume_inodes(c, &s); - for (id = s.d; id < s.d + s.nr; id++) { + for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_subvolume_delete(&trans, *id)); if (ret) { - bch_err(c, "error %i deleting subvolume %u", ret, *id); + bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); break; } } - kfree(s.d); + darray_exit(&s); } percpu_ref_put(&c->writes); @@ -919,13 +940,13 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, mutex_lock(&c->snapshots_unlinked_lock); if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) - ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); + ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); mutex_unlock(&c->snapshots_unlinked_lock); if (ret) return ret; - if (unlikely(!percpu_ref_tryget(&c->writes))) + if (unlikely(!percpu_ref_tryget_live(&c->writes))) return -EROFS; if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) @@ -1010,7 +1031,7 @@ int 
bch2_subvolume_create(struct btree_trans *trans, u64 inode, } if (!ret) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_subvolume_create; goto err; found_slot: snapshot_subvols[0] = dst_iter.pos.offset; diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index 4abe53d..02a6366 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -2,10 +2,12 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "darray.h" #include "subvolume_types.h" void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); #define bch2_bkey_ops_snapshot (struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -25,6 +27,16 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) return snapshot_t(c, id)->parent; } +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + +static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ + return id == snapshot_t(c, id)->equiv; +} + static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) { struct snapshot_t *s = snapshot_t(c, id); @@ -56,59 +68,45 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances return id == ancestor; } -struct snapshots_seen { - struct bpos pos; - size_t nr; - size_t size; - u32 *d; -}; - -static inline void snapshots_seen_exit(struct snapshots_seen *s) +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - kfree(s->d); - s->d = NULL; -} + u32 *i; -static inline void snapshots_seen_init(struct snapshots_seen *s) -{ - memset(s, 0, sizeof(*s)); + darray_for_each(*s, i) + if (*i == id) + return true; + return false; } -static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +static inline bool snapshot_list_has_ancestor(struct 
bch_fs *c, snapshot_id_list *s, u32 id) { - if (s->nr == s->size) { - size_t new_size = max(s->size, (size_t) 128) * 2; - u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "error reallocating snapshots_seen table (new size %zu)", - new_size); - return -ENOMEM; - } - - s->size = new_size; - s->d = d; - } + u32 *i; - s->d[s->nr++] = id; - return 0; + darray_for_each(*s, i) + if (bch2_snapshot_is_ancestor(c, id, *i)) + return true; + return false; } -static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) { - unsigned i; + int ret; - for (i = 0; i < s->nr; i++) - if (id == s->d[i]) - return true; - return false; + BUG_ON(snapshot_list_has_id(s, id)); + ret = darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; } -int bch2_fs_snapshots_check(struct bch_fs *); +int bch2_fs_check_snapshots(struct bch_fs *); +int bch2_fs_check_subvols(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); -const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume (struct bkey_ops) { \ @@ -126,6 +124,9 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_snapshot_node_create(struct btree_trans *, u32, u32 *, u32 *, unsigned); +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); + int bch2_subvolume_delete(struct btree_trans *, u32); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index 
9410b95..f7562b5 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -2,10 +2,8 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -struct snapshot_id_list { - u32 nr; - u32 size; - u32 *d; -}; +#include "darray.h" + +typedef DARRAY(u32) snapshot_id_list; #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 49dafda..60c1f03 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -10,16 +10,21 @@ #include "io.h" #include "journal.h" #include "journal_io.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" #include "super-io.h" #include "super.h" #include "vstructs.h" +#include "counters.h" #include +#include #include +#include + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -95,8 +100,7 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { - if (sb->bio) - bio_put(sb->bio); + kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) blkdev_put(sb->bdev, sb->mode); @@ -123,11 +127,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { - char buf[BDEVNAME_SIZE]; - - pr_err("%s: superblock too big: want %zu but have %llu", - bdevname(sb->bdev, buf), new_bytes, max_bytes); - return -ENOSPC; + pr_err("%pg: superblock too big: want %zu but have %llu", + sb->bdev, new_bytes, max_bytes); + return -BCH_ERR_ENOSPC_sb; } } @@ -138,13 +140,15 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) return -ENOMEM; if (sb->have_bio) { - bio = bio_kmalloc(GFP_KERNEL, - DIV_ROUND_UP(new_buffer_size, PAGE_SIZE)); + unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) return -ENOMEM; - if (sb->bio) - bio_put(sb->bio); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 
0); + + kfree(sb->bio); sb->bio = bio; } @@ -208,23 +212,23 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out unsigned i; if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { - pr_buf(out, "Not a bcachefs superblock layout"); + prt_printf(out, "Not a bcachefs superblock layout"); return -EINVAL; } if (layout->layout_type != 0) { - pr_buf(out, "Invalid superblock layout type %u", + prt_printf(out, "Invalid superblock layout type %u", layout->layout_type); return -EINVAL; } if (!layout->nr_superblocks) { - pr_buf(out, "Invalid superblock layout: no superblocks"); + prt_printf(out, "Invalid superblock layout: no superblocks"); return -EINVAL; } if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { - pr_buf(out, "Invalid superblock layout: too many superblocks"); + prt_printf(out, "Invalid superblock layout: too many superblocks"); return -EINVAL; } @@ -236,7 +240,7 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out offset = le64_to_cpu(layout->sb_offset[i]); if (offset < prev_offset + max_sectors) { - pr_buf(out, "Invalid superblock layout: superblocks overlap\n" + prt_printf(out, "Invalid superblock layout: superblocks overlap\n" " (sb %u ends at %llu next starts at %llu", i - 1, prev_offset + max_sectors, offset); return -EINVAL; @@ -247,82 +251,111 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, + int rw) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *mi; + enum bch_opt_id opt_id; u32 version, version_min; u16 block_size; int ret; version = le16_to_cpu(sb->version); - version_min = version >= bcachefs_metadata_version_new_versioning + version_min = version >= bcachefs_metadata_version_bkey_renumber ? 
le16_to_cpu(sb->version_min) : version; if (version >= bcachefs_metadata_version_max) { - pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } if (version_min < bcachefs_metadata_version_min) { - pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } if (version_min > version) { - pr_buf(out, "Bad minimum version %u, greater than version field %u", + prt_printf(out, "Bad minimum version %u, greater than version field %u", version_min, version); return -EINVAL; } if (sb->features[1] || (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - pr_buf(out, "Filesystem has incompatible features"); + prt_printf(out, "Filesystem has incompatible features"); return -EINVAL; } block_size = le16_to_cpu(sb->block_size); if (block_size > PAGE_SECTORS) { - pr_buf(out, "Block size too big (got %u, max %u)", + prt_printf(out, "Block size too big (got %u, max %u)", block_size, PAGE_SECTORS); return -EINVAL; } if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { - pr_buf(out, "Bad user UUID (got zeroes)"); + prt_printf(out, "Bad user UUID (got zeroes)"); return -EINVAL; } if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { - pr_buf(out, "Bad intenal UUID (got zeroes)"); + prt_printf(out, "Bad intenal UUID (got zeroes)"); return -EINVAL; } if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { - pr_buf(out, "Bad number of member devices %u (max %u)", + prt_printf(out, "Bad number of member devices %u (max %u)", sb->nr_devices, BCH_SB_MEMBERS_MAX); return -EINVAL; } if (sb->dev_idx >= sb->nr_devices) { - pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", + prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", sb->dev_idx, 
sb->nr_devices); return -EINVAL; } if (!sb->time_precision || le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { - pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", + prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", le32_to_cpu(sb->time_precision), NSEC_PER_SEC); return -EINVAL; } + if (rw == READ) { + /* + * Been seeing a bug where these are getting inexplicably + * zeroed, so we'r now validating them, but we have to be + * careful not to preven people's filesystems from mounting: + */ + if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + } + + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, opt_id); + + prt_printf(out, "Invalid option "); + ret = bch2_opt_validate(opt, v, out); + if (ret) + return ret; + + printbuf_reset(out); + } + } + /* validate layout */ ret = validate_sb_layout(&sb->layout, out); if (ret) @@ -330,13 +363,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) vstruct_for_each(sb, f) { if (!f->u64s) { - pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", + prt_printf(out, "Invalid superblock: optional with size 0 (type %u)", le32_to_cpu(f->type)); return -EINVAL; } if (vstruct_next(f) > vstruct_last(sb)) { - pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", le32_to_cpu(f->type)); return -EINVAL; } @@ -345,7 +378,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) /* members must be validated first: */ mi = bch2_sb_get_members(sb); if (!mi) { - pr_buf(out, "Invalid superblock: member info area missing"); + prt_printf(out, "Invalid superblock: member info area 
missing"); return -EINVAL; } @@ -424,7 +457,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->compat, src->compat, sizeof(dst->compat)); for (i = 0; i < BCH_SB_FIELD_NR; i++) { - if (i == BCH_SB_FIELD_journal) + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; src_f = bch2_sb_field_get(src, i); @@ -455,9 +488,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) - set_bit(BCH_FS_INITIALIZED, &c->flags); - ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) return ret; @@ -498,36 +528,34 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf size_t bytes; int ret; reread: - bio_reset(sb->bio); - bio_set_dev(sb->bio, sb->bdev); + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); sb->bio->bi_iter.bi_sector = offset; - bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); ret = submit_bio_wait(sb->bio); if (ret) { - pr_buf(err, "IO error: %i", ret); + prt_printf(err, "IO error: %i", ret); return ret; } if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { - pr_buf(err, "Not a bcachefs superblock"); + prt_printf(err, "Not a bcachefs superblock"); return -EINVAL; } version = le16_to_cpu(sb->sb->version); - version_min = version >= bcachefs_metadata_version_new_versioning + version_min = version >= bcachefs_metadata_version_bkey_renumber ? 
le16_to_cpu(sb->sb->version_min) : version; if (version >= bcachefs_metadata_version_max) { - pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } if (version_min < bcachefs_metadata_version_min) { - pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } @@ -535,7 +563,7 @@ reread: bytes = vstruct_bytes(sb->sb); if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { - pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", bytes, 512UL << sb->sb->layout.sb_max_size_bits); return -EINVAL; } @@ -547,7 +575,7 @@ reread: } if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { - pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -EINVAL; } @@ -556,7 +584,7 @@ reread: null_nonce(), sb->sb); if (bch2_crc_cmp(csum, sb->sb->csum)) { - pr_buf(err, "bad checksum"); + prt_printf(err, "bad checksum"); return -EINVAL; } @@ -570,16 +598,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts, { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; - char *_err; - struct printbuf err; + struct printbuf err = PRINTBUF; __le64 *i; int ret; - _err = kmalloc(4096, GFP_KERNEL); - if (!_err) - return -ENOMEM; - err = _PBUF(_err, 4096); - pr_verbose_init(*opts, ""); memset(sb, 0, sizeof(*sb)); @@ -610,12 +632,12 @@ int bch2_read_super(const char *path, struct bch_opts *opts, ret = bch2_sb_realloc(sb, 0); if (ret) { - pr_buf(&err, "error allocating memory for superblock"); + prt_printf(&err, "error allocating memory for superblock"); 
goto err; } if (bch2_fs_init_fault("read_super")) { - pr_buf(&err, "dynamic fault"); + prt_printf(&err, "dynamic fault"); ret = -EFAULT; goto err; } @@ -628,17 +650,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts, goto err; printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", - path, _err); - err = _PBUF(_err, 4096); + path, err.buf); + printbuf_reset(&err); /* * Error reading primary superblock - read location of backup * superblocks: */ - bio_reset(sb->bio); - bio_set_dev(sb->bio, sb->bdev); + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); /* * use sb buffer to read layout, since sb buffer is page aligned but * layout won't be: @@ -647,7 +667,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, ret = submit_bio_wait(sb->bio); if (ret) { - pr_buf(&err, "IO error: %i", ret); + prt_printf(&err, "IO error: %i", ret); goto err; } @@ -673,7 +693,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, got_super: if (le16_to_cpu(sb->sb->block_size) << 9 < bdev_logical_block_size(sb->bdev)) { - pr_buf(&err, "block size (%u) smaller than device block size (%u)", + prt_printf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); ret = -EINVAL; @@ -683,19 +703,19 @@ got_super: ret = 0; sb->have_layout = true; - ret = bch2_sb_validate(sb, &err); + ret = bch2_sb_validate(sb, &err, READ); if (ret) { printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", - path, _err); + path, err.buf); goto err_no_print; } out: pr_verbose_init(*opts, "ret %i", ret); - kfree(_err); + printbuf_exit(&err); return ret; err: printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", - path, _err); + path, err.buf); err_no_print: bch2_free_super(sb); goto out; @@ -722,12 +742,10 @@ static void read_back_super(struct bch_fs 
*c, struct bch_dev *ca) struct bch_sb *sb = ca->disk_sb.sb; struct bio *bio = ca->disk_sb.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], @@ -748,12 +766,10 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), null_nonce(), sb); - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); bch2_bio_map(bio, sb, roundup((size_t) vstruct_bytes(sb), bdev_logical_block_size(ca->disk_sb.bdev))); @@ -769,12 +785,15 @@ int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; struct bch_dev *ca; + struct printbuf err = PRINTBUF; unsigned i, sb = 0, nr_wrote; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; int ret = 0; + trace_and_count(c, write_super, c, _RET_IP_); + if (c->opts.very_degraded) degraded_flags |= BCH_FORCE_IF_LOST; @@ -792,22 +811,17 @@ int bch2_write_super(struct bch_fs *c) SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + bch2_sb_counters_from_cpu(c); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); for_each_online_member(ca, c, i) { - struct printbuf buf = { NULL, NULL }; + printbuf_reset(&err); - ret = bch2_sb_validate(&ca->disk_sb, &buf); + ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); if (ret) { - char *_buf = kmalloc(4096, GFP_NOFS); - if (_buf) { - buf = 
_PBUF(_buf, 4096); - bch2_sb_validate(&ca->disk_sb, &buf); - } - - bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf); - kfree(_buf); + bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); percpu_ref_put(&ca->io_ref); goto out; } @@ -816,6 +830,13 @@ int bch2_write_super(struct bch_fs *c) if (c->opts.nochanges) goto out; + /* + * Defer writing the superblock until filesystem initialization is + * complete - don't write out a partly initialized superblock: + */ + if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) + goto out; + for_each_online_member(ca, c, i) { __set_bit(ca->dev_idx, sb_written.d); ca->sb_write_error = 0; @@ -898,6 +919,7 @@ int bch2_write_super(struct bch_fs *c) out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + printbuf_exit(&err); return ret; } @@ -912,75 +934,9 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - u64 l = *((const u64 *) _l), r = *((const u64 *) _r); - - return l < r ? -1 : l > r ? 
1 : 0; -} - -static int bch2_sb_validate_journal(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; - unsigned nr; - unsigned i; - u64 *b; - - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return 0; - - b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); - if (!b) - return -ENOMEM; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - if (!b[0]) { - pr_buf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0] < le16_to_cpu(m->first_bucket)) { - pr_buf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m->first_bucket)); - goto err; - } - - if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { - pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m->nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - pr_buf(err, "duplicate journal buckets %llu", b[i]); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_validate_journal, -}; - /* BCH_SB_FIELD_members: */ -static int bch2_sb_validate_members(struct bch_sb *sb, +static int bch2_sb_members_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { @@ -989,7 +945,7 @@ static int bch2_sb_validate_members(struct bch_sb *sb, if ((void *) (mi->members + sb->nr_devices) > vstruct_end(&mi->field)) { - pr_buf(err, "too many devices for section size"); + prt_printf(err, "too many devices for section size"); return -EINVAL; } @@ -1000,28 +956,28 @@ static int bch2_sb_validate_members(struct bch_sb *sb, continue; if (le64_to_cpu(m->nbuckets) > LONG_MAX) { - pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", + prt_printf(err, "device %u: too many buckets 
(got %llu, max %lu)", i, le64_to_cpu(m->nbuckets), LONG_MAX); return -EINVAL; } if (le64_to_cpu(m->nbuckets) - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { - pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); return -EINVAL; } if (le16_to_cpu(m->bucket_size) < le16_to_cpu(sb->block_size)) { - pr_buf(err, "device %u: bucket size %u smaller than block size %u", + prt_printf(err, "device %u: bucket size %u smaller than block size %u", i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); return -EINVAL; } if (le16_to_cpu(m->bucket_size) < BCH_SB_BTREE_NODE_SIZE(sb)) { - pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); return -EINVAL; } @@ -1030,39 +986,165 @@ static int bch2_sb_validate_members(struct bch_sb *sb, return 0; } +static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; + + if (!bch2_member_exists(m)) + continue; + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m->uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, 
bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); + } +} + static const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_validate_members, + .validate = bch2_sb_members_validate, + .to_text = bch2_sb_members_to_text, }; /* BCH_SB_FIELD_crypt: */ -static int bch2_sb_validate_crypt(struct bch_sb *sb, +static int bch2_sb_crypt_validate(struct bch_sb 
*sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - pr_buf(err, "wrong size (got %llu should be %zu)", + prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&crypt->field), sizeof(*crypt)); return -EINVAL; } if (BCH_CRYPT_KDF_TYPE(crypt)) { - pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); return -EINVAL; } return 0; } +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_newline(out); + prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + prt_newline(out); + prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + prt_newline(out); + prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + prt_newline(out); +} + static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_validate_crypt, + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, }; /* BCH_SB_FIELD_clean: */ -int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { struct jset_entry *entry; int ret; @@ -1070,7 +1152,7 @@ int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, i for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = bch2_journal_entry_validate(c, "superblock", entry, + ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), write); @@ -1185,7 +1267,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_dev_usage; 
u->dev = cpu_to_le32(dev); u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); - u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); for (i = 0; i < BCH_DATA_NR; i++) { u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); @@ -1234,7 +1315,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); @@ -1251,7 +1332,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) * this should be in the write path, and we should be validating every * superblock section: */ - ret = bch2_sb_clean_validate(c, sb_clean, WRITE); + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); if (ret) { bch_err(c, "error writing marking filesystem clean: validate error"); goto out; @@ -1262,14 +1343,14 @@ out: mutex_unlock(&c->sb_lock); } -static int bch2_sb_validate_clean(struct bch_sb *sb, +static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - pr_buf(err, "wrong size (got %llu should be %zu)", + prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&clean->field), sizeof(*clean)); return -EINVAL; } @@ -1277,8 +1358,32 @@ static int bch2_sb_validate_clean(struct bch_sb *sb, return 0; } +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) 
{ + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + prt_newline(out); + } +} + static const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_validate_clean, + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, }; static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { @@ -1289,24 +1394,25 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { }; static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *orig_err) + struct printbuf *err) { unsigned type = le32_to_cpu(f->type); - struct printbuf err = *orig_err; + struct printbuf field_err = PRINTBUF; int ret; if (type >= BCH_SB_FIELD_NR) return 0; - pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]); - - ret = bch2_sb_field_ops[type]->validate(sb, f, &err); + ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); if (ret) { - pr_buf(&err, "\n"); - bch2_sb_field_to_text(&err, sb, f); - *orig_err = err; + prt_printf(err, "Invalid superblock section %s: %s", + bch2_sb_fields[type], + field_err.buf); + prt_newline(err); + bch2_sb_field_to_text(err, sb, f); } + printbuf_exit(&field_err); return ret; } @@ -1317,13 +1423,179 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ? 
bch2_sb_field_ops[type] : NULL; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + if (ops) - pr_buf(out, "%s", bch2_sb_fields[type]); + prt_printf(out, "%s", bch2_sb_fields[type]); else - pr_buf(out, "(unknown field %u)", type); + prt_printf(out, "(unknown field %u)", type); - pr_buf(out, " (size %llu):", vstruct_bytes(f)); + prt_printf(out, " (size %zu):", vstruct_bytes(f)); + prt_newline(out); - if (ops && ops->to_text) + if (ops && ops->to_text) { + printbuf_indent_add(out, 2); bch2_sb_field_ops[type]->to_text(out, sb, f); + printbuf_indent_sub(out, 2); + } +} + +void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) +{ + unsigned i; + + prt_printf(out, "Type: %u", l->layout_type); + prt_newline(out); + + prt_str(out, "Superblock max size: "); + prt_units_u64(out, 512 << l->sb_max_size_bits); + prt_newline(out); + + prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); + prt_newline(out); + + prt_str(out, "Offsets: "); + for (i = 0; i < l->nr_superblocks; i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); + } + prt_newline(out); +} + +void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bool print_layout, unsigned fields) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field *f; + u64 fields_have = 0; + unsigned nr_devices = 0; + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 44); + + mi = bch2_sb_get_members(sb); + if (mi) { + struct bch_member *m; + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) + nr_devices += bch2_member_exists(m); + } + + prt_printf(out, "External UUID:"); + prt_tab(out); + pr_uuid(out, sb->user_uuid.b); + prt_newline(out); + + prt_printf(out, "Internal UUID:"); + prt_tab(out); + pr_uuid(out, sb->uuid.b); + prt_newline(out); + + prt_str(out, "Device index:"); + prt_tab(out); + prt_printf(out, "%u", sb->dev_idx); + prt_newline(out); + + prt_str(out, "Label:"); + prt_tab(out); + prt_printf(out, "%.*s", (int) 
sizeof(sb->label), sb->label); + prt_newline(out); + + prt_str(out, "Version:"); + prt_tab(out); + prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); + prt_newline(out); + + prt_printf(out, "Oldest version on disk:"); + prt_tab(out); + prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + prt_newline(out); + + prt_printf(out, "Created:"); + prt_tab(out); + if (sb->time_base_lo) + pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + prt_printf(out, "(not set)"); + prt_newline(out); + + prt_printf(out, "Sequence number:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(sb->seq)); + prt_newline(out); + + prt_printf(out, "Superblock size:"); + prt_tab(out); + prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_newline(out); + + prt_printf(out, "Clean:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); + prt_newline(out); + + prt_printf(out, "Devices:"); + prt_tab(out); + prt_printf(out, "%u", nr_devices); + prt_newline(out); + + prt_printf(out, "Sections:"); + vstruct_for_each(sb, f) + fields_have |= 1 << le32_to_cpu(f->type); + prt_tab(out); + prt_bitflags(out, bch2_sb_fields, fields_have); + prt_newline(out); + + prt_printf(out, "Features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); + prt_newline(out); + + prt_printf(out, "Compat features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "Options:"); + prt_newline(out); + printbuf_indent_add(out, 2); + { + enum bch_opt_id id; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, id); + + prt_printf(out, "%s:", opt->attr.name); + prt_tab(out); + bch2_opt_to_text(out, NULL, sb, opt, v, + OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); + prt_newline(out); + } + } + } + + 
printbuf_indent_sub(out, 2); + + if (print_layout) { + prt_newline(out); + prt_printf(out, "layout:"); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_sb_layout_to_text(out, &sb->layout); + printbuf_indent_sub(out, 2); + } + + vstruct_for_each(sb, f) + if (fields & (1 << le32_to_cpu(f->type))) { + prt_newline(out); + bch2_sb_field_to_text(out, sb, f); + } } diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 3b425be..14a25f6 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_journal: */ - -static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? (__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - /* BCH_SB_FIELD_members: */ static inline bool bch2_member_exists(struct bch_member *m) @@ -112,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .durability = BCH_MEMBER_DURABILITY(mi) ? 
BCH_MEMBER_DURABILITY(mi) - 1 : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), }; } @@ -121,12 +113,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); -int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); +void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index b36e621..5be4c40 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -24,6 +24,7 @@ #include "debug.h" #include "disk_groups.h" #include "ec.h" +#include "errcode.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -44,15 +45,16 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "counters.h" #include #include #include #include -#include #include #include #include +#include #include #include #include @@ -63,14 +65,26 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); #define KTYPE(type) \ -struct kobj_type type ## _ktype = { \ +static const struct attribute_group type ## _group = { \ + .attrs = type ## _files \ +}; \ + \ +static const struct attribute_group *type ## _groups[] = { \ + &type ## _group, \ + NULL \ +}; \ + \ +static const struct kobj_type type ## _ktype = { \ .release = type ## _release, \ .sysfs_ops = &type ## _sysfs_ops, \ - .default_attrs = type ## _files \ + .default_groups = type ## _groups \ } static void bch2_fs_release(struct kobject *); static void bch2_dev_release(struct kobject *); +static 
void bch2_fs_counters_release(struct kobject *k) +{ +} static void bch2_fs_internal_release(struct kobject *k) { @@ -84,11 +98,12 @@ static void bch2_fs_time_stats_release(struct kobject *k) { } -static KTYPE(bch2_fs); -static KTYPE(bch2_fs_internal); -static KTYPE(bch2_fs_opts_dir); -static KTYPE(bch2_fs_time_stats); -static KTYPE(bch2_dev); +KTYPE(bch2_fs); +KTYPE(bch2_fs_counters); +KTYPE(bch2_fs_internal); +KTYPE(bch2_fs_opts_dir); +KTYPE(bch2_fs_time_stats); +KTYPE(bch2_dev); static struct kset *bcachefs_kset; static LIST_HEAD(bch_fs_list); @@ -188,71 +203,33 @@ static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; unsigned i, clean_passes = 0; + u64 seq = 0; bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); - /* - * Flush journal before stopping allocators, because flushing journal - * blacklist entries involves allocating new btree nodes: - */ - bch2_journal_flush_all_pins(&c->journal); - - /* - * If the allocator threads didn't all start up, the btree updates to - * write out alloc info aren't going to work: - */ - if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); - bch2_journal_flush_all_pins(&c->journal); - set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - do { clean_passes++; - if (bch2_journal_flush_all_pins(&c->journal)) - clean_passes = 0; - - /* - * In flight interior btree updates will generate more journal - * updates and btree updates (alloc btree): - */ - if (bch2_btree_interior_updates_nr_pending(c)) { - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + if (bch2_btree_interior_updates_flush(c) || + bch2_journal_flush_all_pins(&c->journal) || + bch2_btree_flush_all_writes(c) || + seq != atomic64_read(&c->journal.seq)) { + seq = atomic64_read(&c->journal.seq); clean_passes = 0; } - flush_work(&c->btree_interior_update_work); - - if (bch2_journal_flush_all_pins(&c->journal)) - 
clean_passes = 0; } while (clean_passes < 2); - bch_verbose(c, "flushing journal and stopping allocators complete"); - - set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -nowrote_alloc: - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - flush_work(&c->btree_interior_update_work); - - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); - clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); + bch_verbose(c, "flushing journal and stopping allocators complete"); + if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); bch2_fs_journal_stop(&c->journal); - /* - * the journal kicks off btree writes via reclaim - wait for in flight - * writes after stopping journal: - */ - bch2_btree_flush_all_writes(c); - /* * After stopping journal: */ @@ -280,10 +257,6 @@ void bch2_fs_read_only(struct bch_fs *c) /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch2_dev_allocator_stop()). 
*/ percpu_ref_kill(&c->writes); @@ -315,7 +288,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && + test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && !c->opts.norecovery) { bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -354,26 +327,12 @@ static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; - ret = bch2_gc_thread_start(c); - if (ret) { - bch_err(c, "error starting gc thread"); - return ret; - } - - ret = bch2_copygc_start(c); - if (ret) { - bch_err(c, "error starting copygc thread"); - return ret; - } - ret = bch2_rebalance_start(c); if (ret) { bch_err(c, "error starting rebalance thread"); return ret; } - schedule_work(&c->ec_stripe_delete_work); - return 0; } @@ -406,25 +365,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - bch_err(c, "error starting allocator threads"); - percpu_ref_put(&ca->io_ref); - goto err; - } + ret = bch2_gc_thread_start(c); + if (ret) { + bch_err(c, "error starting gc thread"); + return ret; } - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; + } - for_each_rw_member(ca, c, i) - bch2_wake_allocator(ca); + schedule_work(&c->ec_stripe_delete_work); + + bch2_do_discards(c); + bch2_do_invalidates(c); if (!early) { ret = bch2_fs_read_write_late(c); @@ -463,6 +425,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); 
bch2_fs_fsio_exit(c); @@ -480,7 +443,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(&c->journal_entries); + bch2_journal_entries_free(c); percpu_free_rwsem(&c->mark_lock); if (c->btree_paths_bufs) @@ -500,8 +463,8 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); - if (c->io_complete_wq ) - destroy_workqueue(c->io_complete_wq ); + if (c->io_complete_wq) + destroy_workqueue(c->io_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -547,6 +510,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); kobject_put(&c->internal); @@ -615,6 +579,7 @@ static int bch2_fs_online(struct bch_fs *c) kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { bch_err(c, "error creating sysfs objects"); @@ -643,6 +608,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; + struct printbuf name = PRINTBUF; unsigned i, iter_size; int ret = 0; @@ -663,6 +629,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) kobject_init(&c->internal, &bch2_fs_internal_ktype); kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); c->minor = -1; c->disk_sb.fs_sb = true; @@ -685,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_allocator_foreground_init(c); 
bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); + bch2_fs_ec_init_early(c); INIT_LIST_HEAD(&c->list); @@ -698,7 +666,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->journal_seq_blacklist_gc_work, bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_entries); INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_errors); @@ -719,8 +686,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); - sema_init(&c->io_in_flight, 64); - c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -745,7 +710,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - uuid_unparse_lower(c->sb.user_uuid.b, c->name); + pr_uuid(&name, c->sb.user_uuid.b); + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + + ret = name.allocation_failure ? -ENOMEM : 0; + if (ret) + goto err; /* Compat: */ if (sb->version <= bcachefs_metadata_version_inode_v2 && @@ -812,7 +783,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } - ret = bch2_io_clock_init(&c->io_clock[READ]) ?: + ret = bch2_fs_counters_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: @@ -820,7 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_buckets_waiting_for_journal_init(c); + bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: bch2_fs_encryption_init(c) ?: @@ -830,9 +802,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - if (c->opts.nochanges) - set_bit(JOURNAL_NOCHANGES, &c->journal.flags); - mi = 
bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && @@ -868,14 +837,11 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - char buf[512]; - struct printbuf p = PBUF(buf); + struct printbuf p = PRINTBUF; bool first = true; - strcpy(buf, "(null)"); - if (c->opts.read_only) { - pr_buf(&p, "ro"); + prt_printf(&p, "ro"); first = false; } @@ -890,12 +856,16 @@ static void print_mount_opts(struct bch_fs *c) continue; if (!first) - pr_buf(&p, ","); + prt_printf(&p, ","); first = false; - bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); + bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } - bch_info(c, "mounted with opts: %s", buf); + if (!p.pos) + prt_printf(&p, "(null)"); + + bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); + printbuf_exit(&p); } int bch2_fs_start(struct bch_fs *c) @@ -925,6 +895,12 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { + mutex_lock(&c->btree_transaction_stats[i].lock); + bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); + mutex_unlock(&c->btree_transaction_stats[i].lock); + } + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? 
bch2_fs_recovery(c) : bch2_fs_initialize(c); @@ -943,20 +919,6 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); - /* - * Allocator threads don't start filling copygc reserve until after we - * set BCH_FS_STARTED - wake them now: - * - * XXX ugly hack: - * Need to set ca->allocator_state here instead of relying on the - * allocator threads to do it to avoid racing with the copygc threads - * checking it and thinking they have no alloc reserve: - */ - for_each_online_member(ca, c, i) { - ca->allocator_state = ALLOCATOR_running; - bch2_wake_allocator(ca); - } - if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -973,31 +935,10 @@ out: up_write(&c->state_lock); return ret; err: - switch (ret) { - case BCH_FSCK_ERRORS_NOT_FIXED: - bch_err(c, "filesystem contains errors: please report this to the developers"); - pr_cont("mount with -o fix_errors to repair\n"); - break; - case BCH_FSCK_REPAIR_UNIMPLEMENTED: - bch_err(c, "filesystem contains errors: please report this to the developers"); - pr_cont("repair unimplemented: inform the developers so that it can be added\n"); - break; - case BCH_FSCK_REPAIR_IMPOSSIBLE: - bch_err(c, "filesystem contains errors, but repair impossible"); - break; - case BCH_FSCK_UNKNOWN_VERSION: - bch_err(c, "unknown metadata version"); - break; - case -ENOMEM: - bch_err(c, "cannot allocate memory"); - break; - case -EIO: - bch_err(c, "IO error"); - break; - } + bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); - if (ret >= 0) - ret = -EIO; + if (ret < -BCH_ERR_START) + ret = -EINVAL; goto out; } @@ -1048,8 +989,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - bch2_dev_allocator_stop(ca); - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1164,8 +1103,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; - if (opt_defined(c->opts, 
discard)) - ca->mi.discard = opt_get(c->opts, discard); + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || @@ -1216,12 +1155,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_rw && - bch2_dev_allocator_start(ca)) { - bch2_dev_free(ca); - goto err; - } - bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1297,8 +1230,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) - bdevname(ca->disk_sb.bdev, c->name); - bdevname(ca->disk_sb.bdev, ca->name); + snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); + snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); rebalance_wakeup(c); return 0; @@ -1398,23 +1331,14 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { - /* - * Device going read only means the copygc reserve get smaller, so we - * don't want that happening while copygc is in progress: - */ - bch2_copygc_stop(c); - /* * The allocator thread itself allocates btree nodes, so stop it first: */ - bch2_dev_allocator_stop(ca); bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); - - bch2_copygc_start(c); } -static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1422,8 +1346,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - - return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1450,7 +1372,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); if (new_state 
== BCH_MEMBER_STATE_rw) - ret = __bch2_dev_read_write(c, ca); + __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1473,30 +1395,28 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { - struct btree_trans trans; - size_t i; + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); int ret; - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < ca->mi.nbuckets; i++) { - ret = lockrestart_do(&trans, - bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, POS(ca->dev_idx, i))); - if (ret) - break; - } - bch2_trans_exit(&trans); - - if (ret) { - bch_err(c, "error %i removing dev alloc info", ret); - return ret; - } + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) + bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); - return bch2_btree_delete_range(c, BTREE_ID_alloc, - POS(ca->dev_idx, 0), - POS(ca->dev_idx + 1, 0), - 0, NULL); + return ret; } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1522,32 +1442,23 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { - bch_err(ca, "Remove failed: error %i dropping data", ret); + bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); goto err; } - ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + ret = 
bch2_dev_remove_alloc(c, ca); if (ret) { - bch_err(ca, "Remove failed: error %i flushing journal", ret); + bch_err(ca, "Remove failed, error deleting alloc info"); goto err; } - ret = bch2_dev_remove_alloc(c, ca); + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { - bch_err(ca, "Remove failed, error deleting alloc info"); + bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); goto err; } - /* - * must flush all existing journal entries, they might have - * (overwritten) keys that point to the device we're removing: - */ - bch2_journal_flush_all_pins(&c->journal); - /* - * hack to ensure bch2_replicas_gc2() clears out entries to this device - */ - bch2_journal_meta(&c->journal); - ret = bch2_journal_error(&c->journal); + ret = bch2_journal_flush(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); goto err; @@ -1555,17 +1466,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_replicas_gc2(c); if (ret) { - bch_err(ca, "Remove failed: error %i from replicas gc", ret); + bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); goto err; } data = bch2_dev_has_data(c, ca); if (data) { - char data_has_str[100]; + struct printbuf data_has = PRINTBUF; - bch2_flags_to_text(&PBUF(data_has_str), - bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + prt_bitflags(&data_has, bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); + printbuf_exit(&data_has); ret = -EBUSY; goto err; } @@ -1614,24 +1525,26 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; - char *_errbuf; - struct printbuf errbuf; + struct printbuf errbuf = PRINTBUF; + struct printbuf label = PRINTBUF; int ret; - _errbuf = kmalloc(4096, GFP_KERNEL); - if (!_errbuf) - return -ENOMEM; - - errbuf = _PBUF(_errbuf, 4096); - ret 
= bch2_read_super(path, &opts, &sb); if (ret) { - bch_err(c, "device add error: error reading super: %i", ret); + bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); goto err; } dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + if (BCH_MEMBER_GROUP(&dev_mi)) { + bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + if (label.allocation_failure) { + ret = -ENOMEM; + goto err; + } + } + err = bch2_dev_may_add(sb.sb, c); if (err) { bch_err(c, "device add error: %s", err); @@ -1646,6 +1559,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; } + bch2_dev_usage_init(ca); + ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) { bch2_dev_free(ca); @@ -1673,7 +1588,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { bch_err(c, "device add error: new device superblock too small"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; } @@ -1686,7 +1601,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto have_slot; no_slot: bch_err(c, "device add error: already have maximum number of devices"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; have_slot: @@ -1697,7 +1612,7 @@ have_slot: mi = bch2_sb_resize_members(&c->disk_sb, u64s); if (!mi) { bch_err(c, "device add error: no room in superblock for member info"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; } @@ -1710,6 +1625,14 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + if (BCH_MEMBER_GROUP(&dev_mi)) { + ret = __bch2_dev_group_set(c, ca, label.buf); + if (ret) { + bch_err(c, "device add error: error setting label"); + goto err_unlock; + } + } + bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1717,19 +1640,20 @@ have_slot: ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "device add error: error marking new superblock: %i", ret); + bch_err(c, "device add error: 
error marking new superblock: %s", bch2_err_str(ret)); + goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); goto err_late; } ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) { - bch_err(c, "device add error: error going RW on new device: %i", ret); - goto err_late; - } - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; @@ -1741,7 +1665,8 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - kfree(_errbuf); + printbuf_exit(&label); + printbuf_exit(&errbuf); return ret; err_late: up_write(&c->state_lock); @@ -1784,16 +1709,13 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", - path, ret); + bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", + path, bch2_err_str(ret)); goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1857,14 +1779,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = bch2_dev_buckets_resize(c, ca, nbuckets); if (ret) { - bch_err(ca, "Resize error: %i", ret); + bch_err(ca, "Resize error: %s", bch2_err_str(ret)); goto err; } ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { + if (ret) goto err; - } mutex_lock(&c->sb_lock); mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; @@ -1906,8 +1827,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; - char *_errbuf = NULL; - struct printbuf errbuf; + 
struct printbuf errbuf = PRINTBUF; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -1920,14 +1840,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } - _errbuf = kmalloc(4096, GFP_KERNEL); - if (!_errbuf) { - ret = -ENOMEM; - goto err; - } - - errbuf = _PBUF(_errbuf, 4096); - sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); if (!sb) { ret = -ENOMEM; @@ -1952,9 +1864,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, while (i < nr_devices) { if (i != best_sb && !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { - char buf[BDEVNAME_SIZE]; - pr_info("%s has been removed, skipping", - bdevname(sb[i].bdev, buf)); + pr_info("%pg has been removed, skipping", sb[i].bdev); bch2_free_super(&sb[i]); array_remove_item(sb, nr_devices, i); continue; @@ -1993,7 +1903,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } out: kfree(sb); - kfree(_errbuf); + printbuf_exit(&errbuf); module_put(THIS_MODULE); pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 3f24ca5..8501ada 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } +static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, + u32 *offset) +{ + return div_u64_rem(s, ca->mi.bucket_size, offset); +} + static inline bool bch2_dev_is_online(struct bch_dev *ca) { return !percpu_ref_is_zero(&ca->io_ref); @@ -83,7 +89,7 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { BUG_ON(bch2_dev_list_has_dev(*devs, dev)); - BUG_ON(devs->nr >= BCH_REPLICAS_MAX); + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); devs->devs[devs->nr++] = dev; } diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index d8b159a..89419fc 100644 --- a/libbcachefs/super_types.h +++ 
b/libbcachefs/super_types.h @@ -32,6 +32,7 @@ struct bch_member_cpu { u8 discard; u8 data_allowed; u8 durability; + u8 freespace_initialized; u8 valid; }; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index b727845..0f45aef 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -34,24 +34,57 @@ #include "tests.h" #include +#include #include #include #include "util.h" #define SYSFS_OPS(type) \ -struct sysfs_ops type ## _sysfs_ops = { \ +const struct sysfs_ops type ## _sysfs_ops = { \ .show = type ## _show, \ .store = type ## _store \ } #define SHOW(fn) \ +static ssize_t fn ## _to_text(struct printbuf *, \ + struct kobject *, struct attribute *); \ + \ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ char *buf) \ +{ \ + struct printbuf out = PRINTBUF; \ + ssize_t ret = fn ## _to_text(&out, kobj, attr); \ + \ + if (out.pos && out.buf[out.pos - 1] != '\n') \ + prt_newline(&out); \ + \ + if (!ret && out.allocation_failure) \ + ret = -ENOMEM; \ + \ + if (!ret) { \ + ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ + memcpy(buf, out.buf, ret); \ + } \ + printbuf_exit(&out); \ + return bch2_err_class(ret); \ +} \ + \ +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ + struct attribute *attr) #define STORE(fn) \ +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ + const char *, size_t); \ + \ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ const char *buf, size_t size) \ +{ \ + return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ +} \ + \ +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) #define __sysfs_attribute(_name, _mode) \ static struct attribute sysfs_##_name = \ @@ -64,22 +97,19 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ #define sysfs_printf(file, fmt, ...) 
\ do { \ if (attr == &sysfs_ ## file) \ - return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ + prt_printf(out, fmt "\n", __VA_ARGS__); \ } while (0) #define sysfs_print(file, var) \ do { \ if (attr == &sysfs_ ## file) \ - return snprint(buf, PAGE_SIZE, var); \ + snprint(out, var); \ } while (0) #define sysfs_hprint(file, val) \ do { \ - if (attr == &sysfs_ ## file) { \ - bch2_hprint(&out, val); \ - pr_buf(&out, "\n"); \ - return out.pos - buf; \ - } \ + if (attr == &sysfs_ ## file) \ + prt_human_readable_s64(out, val); \ } while (0) #define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) @@ -133,7 +163,10 @@ do { \ } while (0) write_attribute(trigger_gc); +write_attribute(trigger_discards); +write_attribute(trigger_invalidates); write_attribute(prune_cache); +write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); rw_attribute(gc_gens_pos); @@ -142,7 +175,7 @@ read_attribute(minor); read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); -read_attribute(durability); +rw_attribute(durability); read_attribute(iodone); read_attribute(io_latency_read); @@ -153,16 +186,12 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(journal_pins); read_attribute(btree_updates); -read_attribute(dirty_btree_nodes); read_attribute(btree_cache); read_attribute(btree_key_cache); -read_attribute(btree_transactions); read_attribute(stripes_heap); read_attribute(open_buckets); @@ -170,11 +199,10 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -write_attribute(wake_allocator); -read_attribute(read_realloc_races); -read_attribute(extent_migrate_done); -read_attribute(extent_migrate_raced); +#define x(t, n, ...) 
read_attribute(t); +BCH_PERSISTENT_COUNTERS() +#undef x rw_attribute(discard); rw_attribute(label); @@ -237,12 +265,12 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->data_progress_lock); list_for_each_entry(stats, &c->data_progress_list, list) { - pr_buf(out, "%s: data type %s btree_id %s position: ", + prt_printf(out, "%s: data type %s btree_id %s position: ", stats->name, bch2_data_types[stats->data_type], bch2_btree_ids[stats->btree_id]); bch2_bpos_to_text(out, stats->pos); - pr_buf(out, "%s", "\n"); + prt_printf(out, "%s", "\n"); } mutex_unlock(&c->data_progress_lock); @@ -270,7 +298,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); for (id = 0; id < BTREE_ID_NR; id++) { - if (!((1U << id) & BTREE_ID_HAS_PTRS)) + if (!btree_type_has_ptrs(id)) continue; for_each_btree_key(&trans, iter, id, POS_MIN, @@ -315,40 +343,54 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - pr_buf(out, "uncompressed:\n"); - pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); - pr_buf(out, " size: "); - bch2_hprint(out, uncompressed_sectors << 9); - pr_buf(out, "\n"); - - pr_buf(out, "compressed:\n"); - pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); - pr_buf(out, " compressed size: "); - bch2_hprint(out, compressed_sectors_compressed << 9); - pr_buf(out, "\n"); - pr_buf(out, " uncompressed size: "); - bch2_hprint(out, compressed_sectors_uncompressed << 9); - pr_buf(out, "\n"); - - pr_buf(out, "incompressible:\n"); - pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); - pr_buf(out, " size: "); - bch2_hprint(out, incompressible_sectors << 9); - pr_buf(out, "\n"); + prt_printf(out, "uncompressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, uncompressed_sectors << 9); + prt_printf(out, "\n"); + + 
prt_printf(out, "compressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); + prt_printf(out, " compressed size: "); + prt_human_readable_u64(out, compressed_sectors_compressed << 9); + prt_printf(out, "\n"); + prt_printf(out, " uncompressed size: "); + prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); + prt_printf(out, "\n"); + + prt_printf(out, "incompressible:\n"); + prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, incompressible_sectors << 9); + prt_printf(out, "\n"); return 0; } static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); bch2_bpos_to_text(out, c->gc_gens_pos); - pr_buf(out, "\n"); + prt_printf(out, "\n"); +} + +static void bch2_btree_wakeup_all(struct bch_fs *c) +{ + struct btree_trans *trans; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); + + if (b) + six_lock_wakeup_all(&b->lock); + + } + mutex_unlock(&c->btree_trans_lock); } SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); @@ -356,19 +398,10 @@ SHOW(bch2_fs) sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); - sysfs_print(read_realloc_races, - atomic_long_read(&c->read_realloc_races)); - sysfs_print(extent_migrate_done, - atomic_long_read(&c->extent_migrate_done)); - sysfs_print(extent_migrate_raced, - atomic_long_read(&c->extent_migrate_raced)); - sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) { - bch2_gc_gens_pos_to_text(&out, c); - return out.pos - 
buf; - } + if (attr == &sysfs_gc_gens_pos) + bch2_gc_gens_pos_to_text(out, c); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -378,83 +411,45 @@ SHOW(bch2_fs) max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); - if (attr == &sysfs_rebalance_work) { - bch2_rebalance_work_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_rebalance_work) + bch2_rebalance_work_to_text(out, c); sysfs_print(promote_whole_extents, c->promote_whole_extents); /* Debugging: */ - if (attr == &sysfs_journal_debug) { - bch2_journal_debug_to_text(&out, &c->journal); - return out.pos - buf; - } - - if (attr == &sysfs_journal_pins) { - bch2_journal_pins_to_text(&out, &c->journal); - return out.pos - buf; - } + if (attr == &sysfs_journal_debug) + bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) { - bch2_btree_updates_to_text(&out, c); - return out.pos - buf; - } - - if (attr == &sysfs_dirty_btree_nodes) { - bch2_dirty_btree_nodes_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_updates) + bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_cache) { - bch2_btree_cache_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_cache) + bch2_btree_cache_to_text(out, &c->btree_cache); - if (attr == &sysfs_btree_key_cache) { - bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); - return out.pos - buf; - } + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - if (attr == &sysfs_btree_transactions) { - bch2_btree_trans_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_stripes_heap) + bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_stripes_heap) { - bch2_stripes_heap_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c); - if (attr == &sysfs_open_buckets) { - bch2_open_buckets_to_text(&out, c); - return out.pos - buf; - } + if (attr == 
&sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); - if (attr == &sysfs_compression_stats) { - bch2_compression_stats_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_new_stripes) + bch2_new_stripes_to_text(out, c); - if (attr == &sysfs_new_stripes) { - bch2_new_stripes_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_io_timers_read) + bch2_io_timers_to_text(out, &c->io_clock[READ]); - if (attr == &sysfs_io_timers_read) { - bch2_io_timers_to_text(&out, &c->io_clock[READ]); - return out.pos - buf; - } - if (attr == &sysfs_io_timers_write) { - bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); - return out.pos - buf; - } + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - if (attr == &sysfs_data_jobs) { - data_progress_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_data_jobs) + data_progress_to_text(out, c); return 0; } @@ -510,6 +505,9 @@ STORE(bch2_fs) c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } + if (attr == &sysfs_btree_wakeup) + bch2_btree_wakeup_all(c); + if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -523,6 +521,12 @@ STORE(bch2_fs) #endif } + if (attr == &sysfs_trigger_discards) + bch2_do_discards(c); + + if (attr == &sysfs_trigger_invalidates) + bch2_do_invalidates(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -562,12 +566,54 @@ struct attribute *bch2_fs_files[] = { NULL }; +/* counters dir */ + +SHOW(bch2_fs_counters) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); + u64 counter = 0; + u64 counter_since_mount = 0; + + printbuf_tabstop_push(out, 32); + + #define x(t, ...) 
\ + if (attr == &sysfs_##t) { \ + counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ + counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + prt_printf(out, "since mount:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter_since_mount << 9); \ + prt_newline(out); \ + \ + prt_printf(out, "since filesystem creation:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter << 9); \ + prt_newline(out); \ + } + BCH_PERSISTENT_COUNTERS() + #undef x + return 0; +} + +STORE(bch2_fs_counters) { + return 0; +} + +SYSFS_OPS(bch2_fs_counters); + +struct attribute *bch2_fs_counters_files[] = { +#define x(t, ...) \ + &sysfs_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; /* internal dir - just a wrapper */ SHOW(bch2_fs_internal) { struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - return bch2_fs_show(&c->kobj, attr, buf); + return bch2_fs_to_text(out, &c->kobj, attr); } STORE(bch2_fs_internal) @@ -579,12 +625,9 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_debug, - &sysfs_journal_pins, &sysfs_btree_updates, - &sysfs_dirty_btree_nodes, &sysfs_btree_cache, &sysfs_btree_key_cache, - &sysfs_btree_transactions, &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, @@ -592,11 +635,10 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_write, &sysfs_trigger_gc, + &sysfs_trigger_discards, + &sysfs_trigger_invalidates, &sysfs_prune_cache, - - &sysfs_read_realloc_races, - &sysfs_extent_migrate_done, - &sysfs_extent_migrate_raced, + &sysfs_btree_wakeup, &sysfs_gc_gens_pos, @@ -617,16 +659,15 @@ struct attribute *bch2_fs_internal_files[] = { SHOW(bch2_fs_opts_dir) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - bch2_opt_to_text(&out, c, opt, v, 
OPT_SHOW_FULL_LIST); - pr_buf(&out, "\n"); + bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); + prt_char(out, '\n'); - return out.pos - buf; + return 0; } STORE(bch2_fs_opts_dir) @@ -637,19 +678,28 @@ STORE(bch2_fs_opts_dir) char *tmp; u64 v; + /* + * We don't need to take c->writes for correctness, but it eliminates an + * unsightly error message in the dmesg log when we're RO: + */ + if (unlikely(!percpu_ref_tryget_live(&c->writes))) + return -EROFS; + tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) - return -ENOMEM; + if (!tmp) { + ret = -ENOMEM; + goto err; + } - ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); kfree(tmp); if (ret < 0) - return ret; + goto err; ret = bch2_opt_check_may_set(c, id, v); if (ret < 0) - return ret; + goto err; bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); @@ -660,7 +710,10 @@ STORE(bch2_fs_opts_dir) rebalance_wakeup(c); } - return size; + ret = size; +err: + percpu_ref_put(&c->writes); + return ret; } SYSFS_OPS(bch2_fs_opts_dir); @@ -690,13 +743,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - struct printbuf out = _PBUF(buf, PAGE_SIZE); #define x(name) \ - if (attr == &sysfs_time_stat_##name) { \ - bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ - return out.pos - buf; \ - } + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); BCH_TIME_STATS() #undef x @@ -717,24 +767,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -{ - enum alloc_reserve i; - - spin_lock(&ca->fs->freelist_lock); - - pr_buf(out, "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - 
ca->free[i].size); - - spin_unlock(&ca->fs->freelist_lock); -} - static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; @@ -746,23 +778,19 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; - pr_buf(out, - "\t\t buckets\t sectors fragmented\n" - "capacity%16llu\n", + prt_printf(out, + "\t\t\t buckets\t sectors fragmented\n" + "capacity\t%16llu\n", ca->mi.nbuckets - ca->mi.first_bucket); - for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(out, "%-8s%16llu%16llu%16llu\n", + for (i = 0; i < BCH_DATA_NR; i++) + prt_printf(out, "%-16s%16llu%16llu%16llu\n", bch2_data_types[i], stats.d[i].buckets, stats.d[i].sectors, stats.d[i].fragmented); - pr_buf(out, - "ec\t%16llu\n" - "available%15llu\n" + prt_printf(out, + "ec\t\t%16llu\n" "\n" - "free_inc\t\t%zu/%zu\n" - "free[RESERVE_MOVINGGC]\t%zu/%zu\n" - "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" "open buckets allocated\t%u\n" "open buckets this dev\t%u\n" @@ -770,13 +798,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n" - "thread state:\t\t%s\n", + "buckets_to_invalidate\t%llu\n" + "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats), - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, c->freelist_wait.list.first ? "waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, @@ -784,8 +808,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? 
"waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr, - bch2_allocator_states[ca->allocator_state]); + should_invalidate_buckets(ca, stats), + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -799,10 +823,10 @@ static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) int rw, i; for (rw = 0; rw < 2; rw++) { - pr_buf(out, "%s:\n", bch2_rw[rw]); + prt_printf(out, "%s:\n", bch2_rw[rw]); for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(out, "%-12s:%12llu\n", + prt_printf(out, "%-12s:%12llu\n", bch2_data_types[i], percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } @@ -812,7 +836,6 @@ SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -825,58 +848,42 @@ SHOW(bch2_dev) if (attr == &sysfs_label) { if (ca->mi.group) { mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(&out, &c->disk_sb, + bch2_disk_path_to_text(out, c->disk_sb.sb, ca->mi.group - 1); mutex_unlock(&c->sb_lock); } - pr_buf(&out, "\n"); - return out.pos - buf; + prt_char(out, '\n'); } if (attr == &sysfs_has_data) { - bch2_flags_to_text(&out, bch2_data_types, - bch2_dev_has_data(c, ca)); - pr_buf(&out, "\n"); - return out.pos - buf; + prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_char(out, '\n'); } if (attr == &sysfs_state_rw) { - bch2_string_opt_to_text(&out, bch2_member_states, - ca->mi.state); - pr_buf(&out, "\n"); - return out.pos - buf; + prt_string_option(out, bch2_member_states, ca->mi.state); + prt_char(out, '\n'); } - if (attr == &sysfs_iodone) { - dev_iodone_to_text(&out, ca); - return out.pos - buf; - } + if (attr == &sysfs_iodone) + dev_iodone_to_text(out, ca); sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - if (attr == &sysfs_io_latency_stats_read) { - 
bch2_time_stats_to_text(&out, &ca->io_latency[READ]); - return out.pos - buf; - } - if (attr == &sysfs_io_latency_stats_write) { - bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); - return out.pos - buf; - } + if (attr == &sysfs_io_latency_stats_read) + bch2_time_stats_to_text(out, &ca->io_latency[READ]); + + if (attr == &sysfs_io_latency_stats_write) + bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) { - reserve_stats_to_text(&out, ca); - return out.pos - buf; - } - if (attr == &sysfs_alloc_debug) { - dev_alloc_debug_to_text(&out, ca); - return out.pos - buf; - } + if (attr == &sysfs_alloc_debug) + dev_alloc_debug_to_text(out, ca); return 0; } @@ -900,6 +907,19 @@ STORE(bch2_dev) mutex_unlock(&c->sb_lock); } + if (attr == &sysfs_durability) { + u64 v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DURABILITY(mi)) { + SET_BCH_MEMBER_DURABILITY(mi, v + 1); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + if (attr == &sysfs_label) { char *tmp; int ret; @@ -914,9 +934,6 @@ STORE(bch2_dev) return ret; } - if (attr == &sysfs_wake_allocator) - bch2_wake_allocator(ca); - return size; } SYSFS_OPS(bch2_dev); @@ -942,11 +959,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - &sysfs_reserve_stats, - /* debug: */ &sysfs_alloc_debug, - &sysfs_wake_allocator, NULL }; diff --git a/libbcachefs/sysfs.h b/libbcachefs/sysfs.h index 525fd05..222cd50 100644 --- a/libbcachefs/sysfs.h +++ b/libbcachefs/sysfs.h @@ -10,28 +10,32 @@ struct attribute; struct sysfs_ops; extern struct attribute *bch2_fs_files[]; +extern struct attribute *bch2_fs_counters_files[]; extern struct attribute *bch2_fs_internal_files[]; extern struct attribute *bch2_fs_opts_dir_files[]; extern struct attribute 
*bch2_fs_time_stats_files[]; extern struct attribute *bch2_dev_files[]; -extern struct sysfs_ops bch2_fs_sysfs_ops; -extern struct sysfs_ops bch2_fs_internal_sysfs_ops; -extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -extern struct sysfs_ops bch2_dev_sysfs_ops; +extern const struct sysfs_ops bch2_fs_sysfs_ops; +extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; +extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; +extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +extern const struct sysfs_ops bch2_dev_sysfs_ops; int bch2_opts_create_sysfs_files(struct kobject *); #else static struct attribute *bch2_fs_files[] = {}; +static struct attribute *bch2_fs_counters_files[] = {}; static struct attribute *bch2_fs_internal_files[] = {}; static struct attribute *bch2_fs_opts_dir_files[] = {}; static struct attribute *bch2_fs_time_stats_files[] = {}; static struct attribute *bch2_dev_files[] = {}; static const struct sysfs_ops bch2_fs_sysfs_ops; +static const struct sysfs_ops bch2_fs_counters_sysfs_ops; static const struct sysfs_ops bch2_fs_internal_sysfs_ops; static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index de84ce8..d058861 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -15,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - POS_MIN, SPOS_MAX, - BTREE_ITER_ALL_SNAPSHOTS, + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS_MIN, SPOS_MAX, - BTREE_ITER_ALL_SNAPSHOTS, - NULL); + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); BUG_ON(ret); } @@ -43,29 +42,29 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, 
BTREE_ITER_INTENT); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "update error in test_delete: %i", ret); + bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); goto err; } pr_info("deleting once"); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error (first) in test_delete: %i", ret); + bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); goto err; } pr_info("deleting twice"); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error (second) in test_delete: %i", ret); + bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); goto err; } err: @@ -89,22 +88,22 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "update error in test_delete_written: %i", ret); + bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); goto err; } bch2_trans_unlock(&trans); bch2_journal_flush_all_pins(&c->journal); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error in test_delete_written: %i", ret); + bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); goto err; } err: @@ -137,7 +136,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) 
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate: %i", ret); + bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); goto err; } } @@ -146,20 +145,30 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ret) { - if (k.k->p.inode) - break; - + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } BUG_ON(i != nr); pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) - BUG_ON(k.k->p.offset != --i); + ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != --i); + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + goto err; + } BUG_ON(i); err: @@ -193,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_extents: %i", ret); + bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); goto err; } } @@ -202,19 +211,31 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } BUG_ON(i != nr); pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { - BUG_ON(k.k->p.offset != i); - i = 
bkey_start_offset(k.k); + ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + goto err; } BUG_ON(i); @@ -248,7 +269,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_slots: %i", ret); + bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); goto err; } } @@ -257,15 +278,16 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ret) { - if (k.k->p.inode) - break; - + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i += 2; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr * 2); @@ -273,17 +295,23 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; + BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); i++; - if (i == nr * 2) - break; + 0; + })); + if (ret < 0) { + bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); + ret = 0; err: bch2_trans_exit(&trans); return ret; @@ -314,7 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in 
test_iterate_slots_extents: %i", ret); + bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); goto err; } } @@ -323,13 +351,17 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr); @@ -337,19 +369,23 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; BUG_ON(bkey_deleted(k.k) != !(i % 16)); BUG_ON(bkey_start_offset(k.k) != i); BUG_ON(k.k->size != 8); i = k.k->p.offset; - - if (i == nr) - break; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); + ret = 0; err: bch2_trans_exit(&trans); return 0; @@ -369,10 +405,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -390,10 +426,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, 
bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -420,7 +456,7 @@ static int insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) - bch_err(c, "insert error in insert_test_extent: %i", ret); + bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); return ret; } @@ -483,7 +519,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bch2_trans_init(&trans, c, 0, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -519,7 +555,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) ret = test_snapshot_filter(c, snapids[0], snapids[1]); if (ret) { - bch_err(c, "err %i from test_snapshot_filter", ret); + bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); return ret; } @@ -553,10 +589,10 @@ static int rand_insert(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); k.k.p.snapshot = U32_MAX; - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); if (ret) { - bch_err(c, "error in rand_insert: %i", ret); + bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); break; } } @@ -582,7 +618,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) k[j].k.p.snapshot = U32_MAX; } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: @@ -592,7 +628,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) 
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); if (ret) { - bch_err(c, "error in rand_insert_multi: %i", ret); + bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); break; } } @@ -616,10 +652,10 @@ static int rand_lookup(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) { - bch_err(c, "error in rand_lookup: %i", ret); + bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); break; } } @@ -641,8 +677,8 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); - if (ret && ret != -EINTR) - bch_err(trans->c, "lookup error in rand_mixed: %i", ret); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); if (ret) return ret; @@ -669,10 +705,10 @@ static int rand_mixed(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { rand = test_rand(); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { - bch_err(c, "update error in rand_mixed: %i", ret); + bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); break; } } @@ -690,7 +726,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) goto err; @@ -715,10 +751,10 @@ static int rand_delete(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { struct bpos pos = SPOS(0, test_rand(), U32_MAX); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, 
__do_delete(&trans, pos)); if (ret) { - bch_err(c, "error in rand_delete: %i", ret); + bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); break; } } @@ -734,28 +770,23 @@ static int seq_insert(struct bch_fs *c, u64 nr) struct bkey_s_c k; struct bkey_i_cookie insert; int ret = 0; - u64 i = 0; bkey_cookie_init(&insert.k_i); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - insert.k.p = iter.pos; - - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &insert.k_i, 0)); - if (ret) { - bch_err(c, "error in seq_insert: %i", ret); - break; - } - - if (++i == nr) - break; - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + NULL, NULL, 0, + ({ + if (iter.pos.offset >= nr) + break; + insert.k.p = iter.pos; + bch2_trans_update(&trans, &iter, &insert.k_i, 0); + })); + if (ret) + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -770,10 +801,11 @@ static int seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ret) - ; - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, + 0); + if (ret) + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -788,22 +820,18 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_INTENT, k, ret) { - struct bkey_i_cookie u; - - bkey_reassemble(&u.k_i, k); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, 
k, + NULL, NULL, 0, + ({ + struct bkey_i_cookie u; - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &u.k_i, 0)); - if (ret) { - bch_err(c, "error in seq_overwrite: %i", ret); - break; - } - } - bch2_trans_iter_exit(&trans, &iter); + bkey_reassemble(&u.k_i, k); + bch2_trans_update(&trans, &iter, &u.k_i, 0); + })); + if (ret) + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -814,11 +842,10 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS_MIN, SPOS_MAX, - BTREE_ITER_ALL_SNAPSHOTS, - NULL); + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); if (ret) - bch_err(c, "error in seq_delete: %i", ret); + bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); return ret; } @@ -855,7 +882,7 @@ static int btree_perf_test_thread(void *data) ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); if (ret) { - bch_err(j->c, "%ps: error %i", j->fn, ret); + bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); j->ret = ret; } @@ -871,7 +898,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, u64 nr, unsigned nr_threads) { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; - char name_buf[20], nr_buf[20], per_sec_buf[20]; + char name_buf[20]; + struct printbuf nr_buf = PRINTBUF; + struct printbuf per_sec_buf = PRINTBUF; unsigned i; u64 time; @@ -932,13 +961,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, time = j.finish - j.start; scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - bch2_hprint(&PBUF(nr_buf), nr); - bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); + prt_human_readable_u64(&nr_buf, nr); + prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", - name_buf, nr_buf, nr_threads, + name_buf, 
nr_buf.buf, nr_threads, div_u64(time, NSEC_PER_SEC), div_u64(time * nr_threads, nr), - per_sec_buf); + per_sec_buf.buf); + printbuf_exit(&per_sec_buf); + printbuf_exit(&nr_buf); return j.ret; } diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c index 59e8dfa..7057398 100644 --- a/libbcachefs/trace.c +++ b/libbcachefs/trace.c @@ -2,11 +2,13 @@ #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" -#include "btree_types.h" +#include "btree_iter.h" +#include "btree_locking.h" #include "keylist.h" +#include "opts.h" #include -#include "keylist.h" +#include #define CREATE_TRACE_POINTS #include diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 0bbea33..62fa662 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -21,22 +22,26 @@ #include #include #include +#include #include "eytzinger.h" #include "util.h" static const char si_units[] = "?kMGTPEZY"; -static int __bch2_strtoh(const char *cp, u64 *res, - u64 t_max, bool t_signed) +/* string_get_size units: */ +static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" +}; +static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" +}; + +static int parse_u64(const char *cp, u64 *res) { - bool positive = *cp != '-'; - unsigned u; + const char *start = cp; u64 v = 0; - if (*cp == '+' || *cp == '-') - cp++; - if (!isdigit(*cp)) return -EINVAL; @@ -50,22 +55,122 @@ static int __bch2_strtoh(const char *cp, u64 *res, cp++; } while (isdigit(*cp)); + *res = v; + return cp - start; +} + +static int bch2_pow(u64 n, u64 p, u64 *res) +{ + *res = 1; + + while (p--) { + if (*res > div_u64(U64_MAX, n)) + return -ERANGE; + *res *= n; + } + return 0; +} + +static int parse_unit_suffix(const char *cp, u64 *res) +{ + const char *start = cp; + u64 base = 1024; + unsigned u; + int ret; + + if (*cp == ' ') + cp++; + for (u = 1; u < strlen(si_units); u++) if (*cp == 
si_units[u]) { cp++; goto got_unit; } - u = 0; + + for (u = 0; u < ARRAY_SIZE(units_2); u++) + if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { + cp += strlen(units_2[u]); + goto got_unit; + } + + for (u = 0; u < ARRAY_SIZE(units_10); u++) + if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { + cp += strlen(units_10[u]); + base = 1000; + goto got_unit; + } + + *res = 1; + return 0; got_unit: - if (*cp == '\n') + ret = bch2_pow(base, u, res); + if (ret) + return ret; + + return cp - start; +} + +#define parse_or_ret(cp, _f) \ +do { \ + int ret = _f; \ + if (ret < 0) \ + return ret; \ + cp += ret; \ +} while (0) + +static int __bch2_strtou64_h(const char *cp, u64 *res) +{ + const char *start = cp; + u64 v = 0, b, f_n = 0, f_d = 1; + int ret; + + parse_or_ret(cp, parse_u64(cp, &v)); + + if (*cp == '.') { cp++; - if (*cp) - return -EINVAL; + ret = parse_u64(cp, &f_n); + if (ret < 0) + return ret; + cp += ret; + + ret = bch2_pow(10, ret, &f_d); + if (ret) + return ret; + } + + parse_or_ret(cp, parse_unit_suffix(cp, &b)); + + if (v > div_u64(U64_MAX, b)) + return -ERANGE; + v *= b; + + if (f_n > div_u64(U64_MAX, b)) + return -ERANGE; - if (fls64(v) + u * 10 > 64) + f_n = div_u64(f_n * b, f_d); + if (v + f_n < v) return -ERANGE; + v += f_n; - v <<= u * 10; + *res = v; + return cp - start; +} + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); + + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; if (positive) { if (v > t_max) @@ -86,7 +191,7 @@ got_unit: #define STRTO_H(name, type) \ int bch2_ ## name ## _h(const char *cp, type *res) \ { \ - u64 v; \ + u64 v = 0; \ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ANYSINT_MAX(type) != ((type) ~0ULL)); \ *res = v; \ @@ -99,58 +204,6 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) -void bch2_hprint(struct 
printbuf *buf, s64 v) -{ - int u, t = 0; - - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0U << 10); - v >>= 10; - } - - pr_buf(buf, "%lli", v); - - /* - * 103 is magic: t is in the range [-1023, 1023] and we want - * to turn it into [-9, 9] - */ - if (u && t && v < 100 && v > -100) - pr_buf(buf, ".%i", t / 103); - if (u) - pr_buf(buf, "%c", si_units[u]); -} - -void bch2_string_opt_to_text(struct printbuf *out, - const char * const list[], - size_t selected) -{ - size_t i; - - for (i = 0; list[i]; i++) - pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); -} - -void bch2_flags_to_text(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - if (out->pos != out->end) - *out->pos = '\0'; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs(flags)) < nr) { - if (!first) - pr_buf(out, ","); - first = false; - pr_buf(out, "%s", list[bit]); - flags ^= 1 << bit; - } -} - u64 bch2_read_flag_list(char *opt, const char * const list[]) { u64 ret = 0; @@ -217,45 +270,98 @@ static void bch2_quantiles_update(struct quantiles *q, u64 v) } } -/* time stats: */ +void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +{ + while (nr_bits) + prt_char(out, '0' + ((v >> --nr_bits) & 1)); +} -static void bch2_time_stats_update_one(struct time_stats *stats, - u64 start, u64 end) +void bch2_print_string_as_lines(const char *prefix, const char *lines) { - u64 duration, freq; + const char *p; - duration = time_after64(end, start) - ? end - start : 0; - freq = time_after64(end, stats->last_event) - ? end - stats->last_event : 0; + if (!lines) { + printk("%s (null)\n", prefix); + return; + } - stats->count++; + console_lock(); + while (1) { + p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + prefix = KERN_CONT; + } + console_unlock(); +} - stats->average_duration = stats->average_duration - ? 
ewma_add(stats->average_duration, duration, 6) - : duration; +int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) +{ + unsigned long entries[32]; + unsigned i, nr_entries; + int ret; + + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + return ret; + + nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); + for (i = 0; i < nr_entries; i++) { + prt_printf(out, "[<0>] %pB", (void *)entries[i]); + prt_newline(out); + } + + up_read(&task->signal->exec_update_lock); + return 0; +} - stats->average_frequency = stats->average_frequency - ? ewma_add(stats->average_frequency, freq, 6) - : freq; +/* time stats: */ - stats->max_duration = max(stats->max_duration, duration); +static void bch2_time_stats_update_one(struct time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; - stats->last_event = end; + if (time_after64(end, start)) { + duration = end - start; + stats->duration_stats = mean_and_variance_update(stats->duration_stats, + duration); + stats->duration_stats_weighted = mean_and_variance_weighted_update( + stats->duration_stats_weighted, + duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + bch2_quantiles_update(&stats->quantiles, duration); + } - bch2_quantiles_update(&stats->quantiles, duration); + if (time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); + stats->freq_stats_weighted = mean_and_variance_weighted_update( + stats->freq_stats_weighted, + freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + stats->last_event = end; + } } void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) { unsigned long flags; + WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, + "time_stats: min_duration = %llu, min_freq = %llu", + stats->min_duration, 
stats->min_freq); + if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); bch2_time_stats_update_one(stats, start, end); - if (stats->average_frequency < 32 && - stats->count > 1024) + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, GFP_ATOMIC); @@ -290,12 +396,15 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) static const struct time_unit { const char *name; - u32 nsecs; + u64 nsecs; } time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "sec", NSEC_PER_SEC }, + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", NSEC_PER_SEC * 60}, + { "h", NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, }; static const struct time_unit *pick_time_units(u64 ns) @@ -315,41 +424,126 @@ static void pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); - pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); + prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); + prt_tab_rjust(out); + prt_printf(out, "%s", u->name); +} + +#define TABSTOP_SIZE 12 + +static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) +{ + prt_str(out, name); + prt_tab(out); + pr_time_units(out, ns); + prt_newline(out); } void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) { const struct time_unit *u; - u64 freq = READ_ONCE(stats->average_frequency); - u64 q, last_q = 0; + s64 f_mean = 0, d_mean = 0; + u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } - pr_buf(out, 
"count:\t\t%llu\n", - stats->count); - pr_buf(out, "rate:\t\t%llu/sec\n", - freq ? div64_u64(NSEC_PER_SEC, freq) : 0); - - pr_buf(out, "frequency:\t"); - pr_time_units(out, freq); - - pr_buf(out, "\navg duration:\t"); - pr_time_units(out, stats->average_duration); - - pr_buf(out, "\nmax duration:\t"); - pr_time_units(out, stats->max_duration); + printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); + prt_printf(out, "count:"); + prt_tab(out); + prt_printf(out, "%llu ", + stats->duration_stats.n); + printbuf_tabstop_pop(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + printbuf_tabstop_push(out, 0); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + + prt_tab(out); + prt_printf(out, "since mount"); + prt_tab_rjust(out); + prt_tab(out); + prt_printf(out, "recent"); + prt_tab_rjust(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE); + printbuf_tabstop_push(out, 2); + printbuf_tabstop_push(out, TABSTOP_SIZE); + + prt_printf(out, "duration of events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_duration); + pr_name_and_units(out, "max:", stats->max_duration); + + prt_printf(out, "mean:"); + prt_tab(out); + pr_time_units(out, d_mean); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + pr_time_units(out, d_stddev); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + prt_printf(out, "time between events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_freq); + pr_name_and_units(out, "max:", stats->max_freq); + + prt_printf(out, 
"mean:"); + prt_tab(out); + pr_time_units(out, f_mean); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + pr_time_units(out, f_stddev); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + printbuf_tabstops_reset(out); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - pr_buf(out, "\nquantiles (%s):\t", u->name); + prt_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - pr_buf(out, "%llu%s", - div_u64(q, u->nsecs), - is_last ? "\n" : " "); + prt_printf(out, "%llu ", + div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); last_q = q; } } @@ -362,6 +556,10 @@ void bch2_time_stats_exit(struct time_stats *stats) void bch2_time_stats_init(struct time_stats *stats) { memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.w = 8; + stats->freq_stats_weighted.w = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); } @@ -467,36 +665,45 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd) pd->backpressure = 1; } -size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) +void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) { - /* 2^64 - 1 is 20 digits, plus null byte */ - char rate[21]; - char actual[21]; - char target[21]; - char proportional[21]; - char derivative[21]; - char change[21]; - s64 next_io; - - bch2_hprint(&PBUF(rate), pd->rate.rate); - bch2_hprint(&PBUF(actual), pd->last_actual); - bch2_hprint(&PBUF(target), pd->last_target); - bch2_hprint(&PBUF(proportional), pd->last_proportional); - bch2_hprint(&PBUF(derivative), pd->last_derivative); - 
bch2_hprint(&PBUF(change), pd->last_change); - - next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); - - return sprintf(buf, - "rate:\t\t%s/sec\n" - "target:\t\t%s\n" - "actual:\t\t%s\n" - "proportional:\t%s\n" - "derivative:\t%s\n" - "change:\t\t%s/sec\n" - "next io:\t%llims\n", - rate, target, actual, proportional, - derivative, change, next_io); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + prt_printf(out, "rate:"); + prt_tab(out); + prt_human_readable_s64(out, pd->rate.rate); + prt_newline(out); + + prt_printf(out, "target:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_target); + prt_newline(out); + + prt_printf(out, "actual:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_actual); + prt_newline(out); + + prt_printf(out, "proportional:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_proportional); + prt_newline(out); + + prt_printf(out, "derivative:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_derivative); + prt_newline(out); + + prt_printf(out, "change:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_change); + prt_newline(out); + + prt_printf(out, "next io:"); + prt_tab(out); + prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); + prt_newline(out); } /* misc: */ @@ -579,21 +786,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -void bch_scnmemcpy(struct printbuf *out, - const char *src, size_t len) -{ - size_t n = printbuf_remaining(out); - - if (n) { - n = min(n - 1, len); - memcpy(out->pos, src, n); - out->pos += n; - *out->pos = '\0'; - } -} - -#include "eytzinger.h" - static int alignment_ok(const void *base, size_t align) { return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || diff --git a/libbcachefs/util.h b/libbcachefs/util.h index e55407d..846e602 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -11,12 +11,14 @@ #include #include #include +#include #include #include #include 
#include #include #include +#include struct closure; @@ -210,9 +212,11 @@ do { \ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used, set_backpointer); \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - heap_sift_down(h, _i, cmp, set_backpointer); \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ + } \ } while (0) #define heap_pop(h, d, cmp, set_backpointer) \ @@ -235,54 +239,44 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -struct printbuf { - char *pos; - char *end; - unsigned indent; -}; -static inline size_t printbuf_remaining(struct printbuf *buf) +#ifdef __KERNEL__ +static inline void pr_time(struct printbuf *out, u64 time) { - return buf->end - buf->pos; + prt_printf(out, "%llu", time); } - -#define _PBUF(_buf, _len) \ - ((struct printbuf) { \ - .pos = _buf, \ - .end = _buf + _len, \ - }) - -#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) - -#define pr_buf(_out, ...) 
\ -do { \ - (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ - __VA_ARGS__); \ -} while (0) - -static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces) -{ - buf->indent += spaces; - while (spaces--) - pr_buf(buf, " "); +#else +#include +static inline void pr_time(struct printbuf *out, u64 _time) +{ + char time_str[64]; + time_t time = _time; + struct tm *tm = localtime(&time); + size_t err = strftime(time_str, sizeof(time_str), "%c", tm); + if (!err) + prt_printf(out, "(formatting error)"); + else + prt_printf(out, "%s", time_str); } +#endif -static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) { - buf->indent -= spaces; + sprintf(out, "%pUb", uuid); } +#else +#include +#endif -static inline void printbuf_newline(struct printbuf *buf) +static inline void pr_uuid(struct printbuf *out, u8 *uuid) { - unsigned i; + char uuid_str[40]; - pr_buf(buf, "\n"); - for (i = 0; i < buf->indent; i++) - pr_buf(buf, " "); + uuid_unparse_lower(uuid, uuid_str); + prt_printf(out, "%s", uuid_str); } -void bch_scnmemcpy(struct printbuf *, const char *, size_t); - int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); int bch2_strtoll_h(const char *, long long *); @@ -345,8 +339,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res) _r; \ }) -#define snprint(buf, size, var) \ - snprintf(buf, size, \ +#define snprint(out, var) \ + prt_printf(out, \ type_is(var, int) ? "%i\n" \ : type_is(var, unsigned) ? "%u\n" \ : type_is(var, long) ? "%li\n" \ @@ -356,16 +350,15 @@ static inline int bch2_strtoul_h(const char *cp, long *res) : type_is(var, char *) ? 
"%s\n" \ : "%i\n", var) -void bch2_hprint(struct printbuf *, s64); - bool bch2_is_zero(const void *, size_t); -void bch2_string_opt_to_text(struct printbuf *, - const char * const [], size_t); - -void bch2_flags_to_text(struct printbuf *, const char * const[], u64); u64 bch2_read_flag_list(char *, const char * const[]); +void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); + +void bch2_print_string_as_lines(const char *prefix, const char *lines); +int bch2_prt_backtrace(struct printbuf *, struct task_struct *); + #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) @@ -388,14 +381,18 @@ struct time_stat_buffer { struct time_stats { spinlock_t lock; - u64 count; /* all fields are in nanoseconds */ - u64 average_duration; - u64 average_frequency; u64 max_duration; + u64 min_duration; + u64 max_freq; + u64 min_freq; u64 last_event; struct quantiles quantiles; + struct mean_and_variance duration_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; struct time_stat_buffer __percpu *buffer; }; @@ -463,7 +460,7 @@ struct bch_pd_controller { void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); void bch2_pd_controller_init(struct bch_pd_controller *); -size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); +void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); #define sysfs_pd_controller_attribute(name) \ rw_attribute(name##_rate); \ @@ -487,7 +484,7 @@ do { \ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ \ if (attr == &sysfs_##name##_rate_debug) \ - return bch2_pd_controller_print_debug(var, buf); \ + bch2_pd_controller_debug_to_text(out, var); \ } while (0) #define sysfs_pd_controller_store(name, var) \ @@ -700,6 +697,31 @@ do { \ #define array_remove_item(_array, _nr, _pos) \ 
array_remove_items(_array, _nr, _pos, 1) +static inline void __move_gap(void *array, size_t element_size, + size_t nr, size_t size, + size_t old_gap, size_t new_gap) +{ + size_t gap_end = old_gap + size - nr; + + if (new_gap < old_gap) { + size_t move = old_gap - new_gap; + + memmove(array + element_size * (gap_end - move), + array + element_size * (old_gap - move), + element_size * move); + } else if (new_gap > old_gap) { + size_t move = new_gap - old_gap; + + memmove(array + element_size * old_gap, + array + element_size * gap_end, + element_size * move); + } +} + +/* Move the gap in a gap buffer: */ +#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ + __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) + #define bubble_sort(_base, _nr, _cmp) \ do { \ ssize_t _i, _end; \ @@ -768,13 +790,4 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } -#ifdef __KERNEL__ -static inline void uuid_unparse_lower(u8 *uuid, char *out) -{ - sprintf(out, "%plU", uuid); -} -#else -#include -#endif - #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index a2d6bb7..5143b60 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h index c099cdc..53a694d 100644 --- a/libbcachefs/vstructs.h +++ b/libbcachefs/vstructs.h @@ -20,7 +20,7 @@ ({ \ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ \ - (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ + (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ }) #define vstruct_bytes(_s) \ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 4d7db64..4fc1c3a 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -69,32 +69,51 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) +int 
bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*xattr.v)); + return -EINVAL; + } if (bkey_val_u64s(k.k) < xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))) - return "value too small"; + le16_to_cpu(xattr.v->x_val_len))) { + prt_printf(err, "value too small (%zu < %u)", + bkey_val_u64s(k.k), + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))); + return -EINVAL; + } + /* XXX why +4 ? */ if (bkey_val_u64s(k.k) > xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)) - return "value too big"; + le16_to_cpu(xattr.v->x_val_len) + 4)) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)); + return -EINVAL; + } handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (!handler) - return "invalid type"; + if (!handler) { + prt_printf(err, "invalid type (%u)", xattr.v->x_type); + return -EINVAL; + } - if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) - return "xattr name has invalid characters"; + if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { + prt_printf(err, "xattr name has invalid characters"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, @@ -105,17 +124,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (handler && handler->prefix) - pr_buf(out, "%s", handler->prefix); + prt_printf(out, "%s", handler->prefix); else if (handler) - pr_buf(out, "(type %u)", xattr.v->x_type); + prt_printf(out, "(type 
%u)", xattr.v->x_type); else - pr_buf(out, "(unknown type %u)", xattr.v->x_type); + prt_printf(out, "(unknown type %u)", xattr.v->x_type); - bch_scnmemcpy(out, xattr.v->x_name, - xattr.v->x_name_len); - pr_buf(out, ":"); - bch_scnmemcpy(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + prt_printf(out, "%.*s:%.*s", + xattr.v->x_name_len, + xattr.v->x_name, + le16_to_cpu(xattr.v->x_val_len), + (char *) xattr_val(xattr.v)); } static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, @@ -311,13 +330,9 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), 0, k, ret) { - BUG_ON(k.k->p.inode < inum); - - if (k.k->p.inode > inum) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), + POS(inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_xattr) continue; @@ -329,23 +344,25 @@ retry: offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); if (ret) - return ret; + goto out; ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); if (ret) - return ret; + goto out; ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); if (ret) - return ret; + goto out; return buf.used; +out: + return bch2_err_class(ret); } static int bch2_xattr_get_handler(const struct xattr_handler *handler, @@ -354,8 +371,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; - return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); + ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); + return bch2_err_class(ret); } static int bch2_xattr_set_handler(const struct xattr_handler *handler, @@ -367,11 +386,13 @@ static int 
bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + int ret; - return bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); + return bch2_err_class(ret); } static const struct xattr_handler bch_xattr_user_handler = { @@ -426,9 +447,8 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; int id, inode_opt_id; - char buf[512]; - struct printbuf out = PBUF(buf); - unsigned val_len; + struct printbuf out = PRINTBUF; + int ret; u64 v; id = bch2_opt_lookup(name); @@ -449,16 +469,21 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, return -ENODATA; v = bch2_opt_get_by_id(&opts, id); - bch2_opt_to_text(&out, c, opt, v, 0); + bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); - val_len = out.pos - buf; + ret = out.pos; - if (buffer && val_len > size) - return -ERANGE; + if (out.allocation_failure) { + ret = -ENOMEM; + } else if (buffer) { + if (out.pos > size) + ret = -ERANGE; + else + memcpy(buffer, out.buf, out.pos); + } - if (buffer) - memcpy(buffer, buf, val_len); - return val_len; + printbuf_exit(&out); + return ret; } static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, @@ -525,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, NULL, opt, buf, &v); + ret = bch2_opt_parse(c, opt, buf, &v, NULL); kfree(buf); if (ret < 0) diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index f4f8965..66d7a1e 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; 
-const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr (struct bkey_ops) { \ diff --git a/linux/bio.c b/linux/bio.c index 8422c26..93a791c 100644 --- a/linux/bio.c +++ b/linux/bio.c @@ -120,29 +120,30 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) } } -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); - bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; + return 0; } -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs) { - struct bio *b; + struct bio *bio; + + bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); + if (!bio) + return NULL; - b = bio_alloc_bioset(gfp_mask, 0, bs); - if (!b) + if (__bio_clone(bio, bio_src, gfp) < 0) { + bio_put(bio); return NULL; + } + bio->bi_io_vec = bio_src->bi_io_vec; - __bio_clone_fast(b, bio); - return b; + return bio; } struct bio *bio_split(struct bio *bio, int sectors, @@ -153,15 +154,7 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); - /* - * Discards need a mutable bio_vec to accommodate the payload - * required by the DSM TRIM and UNMAP commands. 
- */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - split = bio_clone_bioset(bio, gfp, bs); - else - split = bio_clone_fast(bio, gfp, bs); - + split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return NULL; @@ -188,9 +181,16 @@ void bio_advance(struct bio *bio, unsigned bytes) static void bio_free(struct bio *bio) { - unsigned front_pad = bio->bi_pool ? bio->bi_pool->front_pad : 0; + struct bio_set *bs = bio->bi_pool; + + if (bs) { + if (bio->bi_max_vecs > BIO_INLINE_VECS) + mempool_free(bio->bi_io_vec, &bs->bvec_pool); - kfree((void *) bio - front_pad); + mempool_free((void *) bio - bs->front_pad, &bs->bio_pool); + } else { + kfree(bio); + } } void bio_put(struct bio *bio) @@ -282,64 +282,114 @@ again: bio->bi_end_io(bio); } -void bio_reset(struct bio *bio) +void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf) { unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; + bio->bi_bdev = bdev; + bio->bi_opf = opf; + bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +struct bio *bio_kmalloc(unsigned int nr_iovecs, gfp_t gfp_mask) { - unsigned front_pad = bs ? bs->front_pad : 0; struct bio *bio; - void *p; - - p = kmalloc(front_pad + - sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - if (unlikely(!p)) + bio = kmalloc(sizeof(struct bio) + + sizeof(struct bio_vec) * nr_iovecs, gfp_mask); + if (unlikely(!bio)) return NULL; + bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, 0); + bio->bi_pool = NULL; + return bio; +} - bio = p + front_pad; - bio_init(bio, bio->bi_inline_vecs, nr_iovecs); - bio->bi_pool = bs; +static struct bio_vec *bvec_alloc(mempool_t *pool, int *nr_vecs, + gfp_t gfp_mask) +{ + *nr_vecs = roundup_pow_of_two(*nr_vecs); + /* + * Try a slab allocation first for all smaller allocations. 
If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_VECS entries. + */ + if (*nr_vecs < BIO_MAX_VECS) { + struct bio_vec *bvl; - return bio; + bvl = kmalloc(sizeof(*bvl) * *nr_vecs, gfp_mask); + if (likely(bvl)) + return bvl; + *nr_vecs = BIO_MAX_VECS; + } + + return mempool_alloc(pool, gfp_mask); } -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, +struct bio *bio_alloc_bioset(struct block_device *bdev, + unsigned nr_iovecs, + unsigned opf, + gfp_t gfp_mask, struct bio_set *bs) { - struct bvec_iter iter; - struct bio_vec bv; struct bio *bio; + void *p; - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) + if (nr_iovecs > BIO_MAX_VECS) + return NULL; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (unlikely(!p)) return NULL; - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_opf = bio_src->bi_opf; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { + struct bio_vec *bvl = NULL; + + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + if (unlikely(!bvl)) + goto err_free; + + bio_init(bio, bdev, bvl, nr_iovecs, opf); + } else if (nr_iovecs) { + bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + } else { + bio_init(bio, bdev, NULL, 0, opf); } + bio->bi_pool = bs; return bio; + +err_free: + mempool_free(p, &bs->bio_pool); + return NULL; +} + +void bioset_exit(struct bio_set *bs) +{ + mempool_exit(&bs->bio_pool); + mempool_exit(&bs->bvec_pool); +} + +int bioset_init(struct bio_set *bs, + unsigned int pool_size, + unsigned int front_pad, + int flags) 
+{ + int ret; + + bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; + + ret = mempool_init_kmalloc_pool(&bs->bio_pool, pool_size, bs->front_pad + + sizeof(struct bio) + bs->back_pad) ?: + mempool_init_kmalloc_pool(&bs->bvec_pool, pool_size, + sizeof(struct bio_vec) * BIO_MAX_VECS); + if (ret) + bioset_exit(bs); + return ret; } diff --git a/linux/blkdev.c b/linux/blkdev.c index 762e5aa..9b3ea93 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -113,7 +113,7 @@ int submit_bio_wait(struct bio *bio) int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, - gfp_t gfp_mask, unsigned long flags) + gfp_t gfp_mask) { return 0; } @@ -128,12 +128,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev) BUG_ON(ret); if (!S_ISBLK(statbuf.st_mode)) - return statbuf.st_blksize >> 9; + return statbuf.st_blksize; - ret = ioctl(bdev->bd_fd, BLKPBSZGET, &blksize); - BUG_ON(ret); - - return blksize >> 9; + xioctl(bdev->bd_fd, BLKPBSZGET, &blksize); + return blksize; } sector_t get_capacity(struct gendisk *disk) @@ -168,7 +166,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; - int fd, sync_fd, flags = O_DIRECT; + int fd, sync_fd, buffered_fd, flags = 0; if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE)) flags = O_RDWR; @@ -183,16 +181,12 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, flags |= O_EXCL; #endif - fd = open(path, flags); + fd = open(path, flags|O_DIRECT); if (fd < 0) return ERR_PTR(-errno); - sync_fd = open(path, flags|O_SYNC); - if (sync_fd < 0) { - assert(0); - close(fd); - return ERR_PTR(-errno); - } + sync_fd = xopen(path, flags|O_DIRECT|O_SYNC); + buffered_fd = xopen(path, flags); bdev = malloc(sizeof(*bdev)); memset(bdev, 0, sizeof(*bdev)); @@ -203,6 +197,7 @@ struct block_device *blkdev_get_by_path(const char *path, 
fmode_t mode, bdev->bd_dev = xfstat(fd).st_rdev; bdev->bd_fd = fd; bdev->bd_sync_fd = sync_fd; + bdev->bd_buffered_fd = buffered_fd; bdev->bd_holder = holder; bdev->bd_disk = &bdev->__bd_disk; bdev->bd_disk->bdi = &bdev->bd_disk->__bdi; diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c index 7857017..41f1bcd 100644 --- a/linux/generic-radix-tree.c +++ b/linux/generic-radix-tree.c @@ -3,6 +3,7 @@ #include #include #include +#include #define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) @@ -37,12 +38,12 @@ static inline size_t genradix_depth_size(unsigned depth) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) -unsigned genradix_root_to_depth(struct genradix_root *r) +static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } -struct genradix_node *genradix_root_to_node(struct genradix_root *r) +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } @@ -76,6 +77,27 @@ void *__genradix_ptr(struct __genradix *radix, size_t offset) } EXPORT_SYMBOL(__genradix_ptr); +static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) +{ + struct genradix_node *node; + + node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); + + /* + * We're using pages (not slab allocations) directly for kernel data + * structures, so we need to explicitly inform kmemleak of them in order + * to avoid false positive memory leak reports. 
+ */ + kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); + return node; +} + +static inline void genradix_free_node(struct genradix_node *node) +{ + kmemleak_free(node); + free_page((unsigned long)node); +} + /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: @@ -98,8 +120,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, break; if (!new_node) { - new_node = (void *) - __get_free_page(gfp_mask|__GFP_ZERO); + new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } @@ -122,8 +143,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, n = READ_ONCE(*p); if (!n) { if (!new_node) { - new_node = (void *) - __get_free_page(gfp_mask|__GFP_ZERO); + new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } @@ -134,7 +154,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, } if (new_node) - free_page((unsigned long) new_node); + genradix_free_node(new_node); return &n->data[offset]; } @@ -193,6 +213,64 @@ restart: } EXPORT_SYMBOL(__genradix_iter_peek); +void *__genradix_iter_peek_prev(struct genradix_iter *iter, + struct __genradix *radix, + size_t objs_per_page, + size_t obj_size_plus_page_remainder) +{ + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + +restart: + r = READ_ONCE(radix->root); + if (!r) + return NULL; + + n = genradix_root_to_node(r); + level = genradix_root_to_depth(r); + + if (ilog2(iter->offset) >= genradix_depth_shift(level)) { + iter->offset = genradix_depth_size(level); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + } + + while (level) { + level--; + + i = (iter->offset >> genradix_depth_shift(level)) & + (GENRADIX_ARY - 1); + + while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + iter->offset = 
round_down(iter->offset, objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + if (!iter->offset) + return NULL; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + + if (!i) + goto restart; + --i; + } + + n = n->children[i]; + } + + return &n->data[iter->offset & (PAGE_SIZE - 1)]; +} +EXPORT_SYMBOL(__genradix_iter_peek_prev); + static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { @@ -203,7 +281,7 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level) genradix_free_recurse(n->children[i], level - 1); } - free_page((unsigned long) n); + genradix_free_node(n); } int __genradix_prealloc(struct __genradix *radix, size_t size, diff --git a/linux/int_sqrt.c b/linux/int_sqrt.c new file mode 100644 index 0000000..a8170bb --- /dev/null +++ b/linux/int_sqrt.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele. + */ + +#include +#include +#include +#include + +/** + * int_sqrt - computes the integer square root + * @x: integer of which to calculate the sqrt + * + * Computes: floor(sqrt(x)) + */ +unsigned long int_sqrt(unsigned long x) +{ + unsigned long b, m, y = 0; + + if (x <= 1) + return x; + + m = 1UL << (__fls(x) & ~1UL); + while (m != 0) { + b = y + m; + y >>= 1; + + if (x >= b) { + x -= b; + y += m; + } + m >>= 2; + } + + return y; +} +EXPORT_SYMBOL(int_sqrt); + +#if BITS_PER_LONG < 64 +/** + * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input + * is expected. 
+ * @x: 64bit integer of which to calculate the sqrt + */ +u32 int_sqrt64(u64 x) +{ + u64 b, m, y = 0; + + if (x <= ULONG_MAX) + return int_sqrt((unsigned long) x); + + m = 1ULL << ((fls64(x) - 1) & ~1ULL); + while (m != 0) { + b = y + m; + y >>= 1; + + if (x >= b) { + x -= b; + y += m; + } + m >>= 2; + } + + return y; +} +EXPORT_SYMBOL(int_sqrt64); +#endif diff --git a/linux/kthread.c b/linux/kthread.c index 41bfca2..3c7bdb8 100644 --- a/linux/kthread.c +++ b/linux/kthread.c @@ -71,8 +71,10 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), p->thread_fn = thread_fn; p->thread_data = thread_data; p->state = TASK_UNINTERRUPTIBLE; + p->signal = &p->_signal; atomic_set(&p->usage, 1); init_completion(&p->exited); + init_rwsem(&p->_signal.exec_update_lock); pthread_attr_t attr; pthread_attr_init(&attr); diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c new file mode 100644 index 0000000..643e311 --- /dev/null +++ b/linux/mean_and_variance.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions for incremental mean and variance. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Copyright © 2022 Daniel B. Hill + * + * Author: Daniel B. Hill + * + * Description: + * + * This is includes some incremental algorithms for mean and variance calculation + * + * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + * + * Create a struct and if it's the weighted variant set the w field (weight = 2^k). + * + * Use mean_and_variance[_weighted]_update() on the struct to update it's state. 
+ * + * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation + * is deferred to these functions for performance reasons. + * + * see lib/math/mean_and_variance_test.c for examples of usage. + * + * DO NOT access the mean and variance fields of the weighted variants directly. + * DO NOT change the weight after calling update. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/** + * fast_divpow2() - fast approximation for n / (1 << d) + * @n: numerator + * @d: the power of 2 denominator. + * + * note: this rounds towards 0. + */ +inline s64 fast_divpow2(s64 n, u8 d) +{ + return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; +} + +/** + * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 + * and return it. + * @s1: the mean_and_variance to update. + * @v1: the new sample. + * + * see linked pdf equation 12. + */ +struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1) +{ + struct mean_and_variance s2; + u64 v2 = abs(v1); + + s2.n = s1.n + 1; + s2.sum = s1.sum + v1; + s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2)); + return s2; +} +EXPORT_SYMBOL_GPL(mean_and_variance_update); + +/** + * mean_and_variance_get_mean() - get mean from @s + */ +s64 mean_and_variance_get_mean(struct mean_and_variance s) +{ + return div64_u64(s.sum, s.n); +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); + +/** + * mean_and_variance_get_variance() - get variance from @s1 + * + * see linked pdf equation 12. 
+ */ +u64 mean_and_variance_get_variance(struct mean_and_variance s1) +{ + u128 s2 = u128_div(s1.sum_squares, s1.n); + u64 s3 = abs(mean_and_variance_get_mean(s1)); + + return u128_to_u64(u128_sub(s2, u128_square(s3))); +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); + +/** + * mean_and_variance_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_get_stddev(struct mean_and_variance s) +{ + return int_sqrt64(mean_and_variance_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); + +/** + * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() + * @s1: .. + * @s2: .. + * + * see linked pdf: function derived from equations 140-143 where alpha = 2^w. + * values are stored bitshifted for performance and added precision. + */ +struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, + s64 x) +{ + struct mean_and_variance_weighted s2; + // previous weighted variance. + u64 var_w0 = s1.variance; + u8 w = s2.w = s1.w; + // new value weighted. + s64 x_w = x << w; + s64 diff_w = x_w - s1.mean; + s64 diff = fast_divpow2(diff_w, w); + // new mean weighted. 
+ s64 u_w1 = s1.mean + diff; + + BUG_ON(w % 2 != 0); + + if (!s1.init) { + s2.mean = x_w; + s2.variance = 0; + } else { + s2.mean = u_w1; + s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; + } + s2.init = true; + + return s2; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); + +/** + * mean_and_variance_weighted_get_mean() - get mean from @s + */ +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +{ + return fast_divpow2(s.mean, s.w); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); + +/** + * mean_and_variance_weighted_get_variance() -- get variance from @s + */ +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +{ + // always positive don't need fast divpow2 + return s.variance >> s.w; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); + +/** + * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +{ + return int_sqrt64(mean_and_variance_weighted_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); + +MODULE_AUTHOR("Daniel B. 
Hill"); +MODULE_LICENSE("GPL"); diff --git a/linux/pretty-printers.c b/linux/pretty-printers.c new file mode 100644 index 0000000..addbac9 --- /dev/null +++ b/linux/pretty-printers.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: LGPL-2.1+ +/* Copyright (C) 2022 Kent Overstreet */ + +#include +#include +#include +#include + +/** + * prt_string_option - Given a list of strings, print out the list and indicate + * which option is selected, with square brackets (sysfs style) + * + * @out: The printbuf to output to + * @list: List of strings to choose from + * @selected: The option to highlight, with square brackets + */ +void prt_string_option(struct printbuf *out, + const char * const list[], + size_t selected) +{ + size_t i; + + for (i = 0; list[i]; i++) { + if (i) + prt_char(out, ' '); + if (i == selected) + prt_char(out, '['); + prt_str(out, list[i]); + if (i == selected) + prt_char(out, ']'); + } +} +EXPORT_SYMBOL(prt_string_option); + +/** + * prt_bitflags: Given a bitmap and a list of names for each bit, print out which + * bits are on, comma separated + * + * @out: The printbuf to output to + * @list: List of names for each bit + * @flags: Bits to print + */ +void prt_bitflags(struct printbuf *out, + const char * const list[], u64 flags) +{ + unsigned bit, nr = 0; + bool first = true; + + while (list[nr]) + nr++; + + while (flags && (bit = __ffs(flags)) < nr) { + if (!first) + prt_char(out, ','); + first = false; + prt_str(out, list[bit]); + flags ^= 1 << bit; + } +} +EXPORT_SYMBOL(prt_bitflags); diff --git a/linux/printbuf.c b/linux/printbuf.c new file mode 100644 index 0000000..5cf79d4 --- /dev/null +++ b/linux/printbuf.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: LGPL-2.1+ +/* Copyright (C) 2022 Kent Overstreet */ + +#include +#include +#include +#include +#include +#include + +static inline unsigned printbuf_linelen(struct printbuf *buf) +{ + return buf->pos - buf->last_newline; +} + +int printbuf_make_room(struct printbuf *out, unsigned extra) +{ + 
unsigned new_size; + char *buf; + + if (!out->heap_allocated) + return 0; + + /* Reserved space for terminating nul: */ + extra += 1; + + if (out->pos + extra < out->size) + return 0; + + new_size = roundup_pow_of_two(out->size + extra); + + /* + * Note: output buffer must be freeable with kfree(), it's not required + * that the user use printbuf_exit(). + */ + buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + + if (!buf) { + out->allocation_failure = true; + return -ENOMEM; + } + + out->buf = buf; + out->size = new_size; + return 0; +} +EXPORT_SYMBOL(printbuf_make_room); + +/** + * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null + * terminated + */ +const char *printbuf_str(const struct printbuf *buf) +{ + /* + * If we've written to a printbuf then it's guaranteed to be a null + * terminated string - but if we haven't, then we might not have + * allocated a buffer at all: + */ + return buf->pos + ? buf->buf + : ""; +} +EXPORT_SYMBOL(printbuf_str); + +/** + * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it + * against accidental use. + */ +void printbuf_exit(struct printbuf *buf) +{ + if (buf->heap_allocated) { + kfree(buf->buf); + buf->buf = ERR_PTR(-EINTR); /* poison value */ + } +} +EXPORT_SYMBOL(printbuf_exit); + +void printbuf_tabstops_reset(struct printbuf *buf) +{ + buf->nr_tabstops = 0; +} +EXPORT_SYMBOL(printbuf_tabstops_reset); + +void printbuf_tabstop_pop(struct printbuf *buf) +{ + if (buf->nr_tabstops) + --buf->nr_tabstops; +} +EXPORT_SYMBOL(printbuf_tabstop_pop); + +/* + * printbuf_tabstop_push - add a tabstop, n spaces from the previous tabstop + * + * @buf: printbuf to control + * @spaces: number of spaces from previous tabstop + * + * In the future this function may allocate memory if setting more than + * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start + * of line. 
+ */ +int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) +{ + unsigned prev_tabstop = buf->nr_tabstops + ? buf->_tabstops[buf->nr_tabstops - 1] + : 0; + + if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) + return -EINVAL; + + buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; + buf->has_indent_or_tabstops = true; + return 0; +} +EXPORT_SYMBOL(printbuf_tabstop_push); + +/** + * printbuf_indent_add - add to the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to add to the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces more spaces. + */ +void printbuf_indent_add(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) + spaces = 0; + + buf->indent += spaces; + prt_chars(buf, ' ', spaces); + + buf->has_indent_or_tabstops = true; +} +EXPORT_SYMBOL(printbuf_indent_add); + +/** + * printbuf_indent_sub - subtract from the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to subtract from the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces less spaces. 
+ */ +void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(spaces > buf->indent)) + spaces = buf->indent; + + if (buf->last_newline + buf->indent == buf->pos) { + buf->pos -= spaces; + printbuf_nul_terminate(buf); + } + buf->indent -= spaces; + + if (!buf->indent && !buf->nr_tabstops) + buf->has_indent_or_tabstops = false; +} +EXPORT_SYMBOL(printbuf_indent_sub); + +void prt_newline(struct printbuf *buf) +{ + unsigned i; + + printbuf_make_room(buf, 1 + buf->indent); + + __prt_char(buf, '\n'); + + buf->last_newline = buf->pos; + + for (i = 0; i < buf->indent; i++) + __prt_char(buf, ' '); + + printbuf_nul_terminate(buf); + + buf->last_field = buf->pos; + buf->cur_tabstop = 0; +} +EXPORT_SYMBOL(prt_newline); + +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} + +static void __prt_tab(struct printbuf *out) +{ + int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); + + prt_chars(out, ' ', spaces); + + out->last_field = out->pos; + out->cur_tabstop++; +} + +/** + * prt_tab - Advance printbuf to the next tabstop + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by printing spaces. 
+ */ +void prt_tab(struct printbuf *out) +{ + if (WARN_ON(!cur_tabstop(out))) + return; + + __prt_tab(out); +} +EXPORT_SYMBOL(prt_tab); + +static void __prt_tab_rjust(struct printbuf *buf) +{ + unsigned move = buf->pos - buf->last_field; + int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); + + if (pad > 0) { + printbuf_make_room(buf, pad); + + if (buf->last_field + pad < buf->size) + memmove(buf->buf + buf->last_field + pad, + buf->buf + buf->last_field, + min(move, buf->size - 1 - buf->last_field - pad)); + + if (buf->last_field < buf->size) + memset(buf->buf + buf->last_field, ' ', + min((unsigned) pad, buf->size - buf->last_field)); + + buf->pos += pad; + printbuf_nul_terminate(buf); + } + + buf->last_field = buf->pos; + buf->cur_tabstop++; +} + +/** + * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * previous output + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by inserting spaces immediately after the + * previous tabstop, right justifying previously outputted text. 
+ */ +void prt_tab_rjust(struct printbuf *buf) +{ + if (WARN_ON(!cur_tabstop(buf))) + return; + + __prt_tab_rjust(buf); +} +EXPORT_SYMBOL(prt_tab_rjust); + +/** + * prt_bytes_indented - Print an array of chars, handling embedded control characters + * + * @out: printbuf to output to + * @str: string to print + * @count: number of bytes to print + * + * The following control characters are handled as follows: + * \n: prt_newline newline that obeys current indent level + * \t: prt_tab advance to next tabstop + * \r: prt_tab_rjust advance to next tabstop, with right justification + */ +void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) +{ + const char *unprinted_start = str; + const char *end = str + count; + + if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { + prt_bytes(out, str, count); + return; + } + + while (str != end) { + switch (*str) { + case '\n': + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + prt_newline(out); + break; + case '\t': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab(out); + } + break; + case '\r': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab_rjust(out); + } + break; + } + + str++; + } + + prt_bytes(out, unprinted_start, str - unprinted_start); +} +EXPORT_SYMBOL(prt_bytes_indented); + +/** + * prt_human_readable_u64 - Print out a u64 in human readable units + * + * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + */ +void prt_human_readable_u64(struct printbuf *buf, u64 v) +{ + printbuf_make_room(buf, 10); + buf->pos += string_get_size(v, 1, !buf->si_units, + buf->buf + buf->pos, + printbuf_remaining_size(buf)); +} +EXPORT_SYMBOL(prt_human_readable_u64); + +/** + * prt_human_readable_s64 - Print out a s64 in human readable units + * + * Units of 2^10 (default) or 
10^3 are controlled via @buf->si_units + */ +void prt_human_readable_s64(struct printbuf *buf, s64 v) +{ + if (v < 0) + prt_char(buf, '-'); + prt_human_readable_u64(buf, abs(v)); +} +EXPORT_SYMBOL(prt_human_readable_s64); + +/** + * prt_units_u64 - Print out a u64 according to printbuf unit options + * + * Units are either raw (default), or human readable units (controlled via + * @buf->human_readable_units) + */ +void prt_units_u64(struct printbuf *out, u64 v) +{ + if (out->human_readable_units) + prt_human_readable_u64(out, v); + else + prt_printf(out, "%llu", v); +} +EXPORT_SYMBOL(prt_units_u64); + +/** + * prt_units_s64 - Print out a s64 according to printbuf unit options + * + * Units are either raw (default), or human readable units (controlled via + * @buf->human_readable_units) + */ +void prt_units_s64(struct printbuf *out, s64 v) +{ + if (v < 0) + prt_char(out, '-'); + prt_units_u64(out, abs(v)); +} +EXPORT_SYMBOL(prt_units_s64); diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c new file mode 100644 index 0000000..df9567c --- /dev/null +++ b/linux/printbuf_userspace.c @@ -0,0 +1,30 @@ + +#include +#include + +void prt_vprintf(struct printbuf *out, const char *fmt, va_list args) +{ + int len; + + do { + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + va_end(args2); /* pairs with va_copy() above */ + } while (len + 1 >= printbuf_remaining(out) && + !printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void prt_printf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); +} diff --git a/linux/ratelimit.c b/linux/ratelimit.c new file mode 100644 index 0000000..21a6d6c --- /dev/null +++ b/linux/ratelimit.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ratelimit.c - Do something with rate limit. 
+ * + * Isolated from kernel/printk.c by Dave Young + * + * 2008-05-01 rewrite the function and use a ratelimit_state data struct as + * parameter. Now every user can use their own standalone ratelimit_state. + */ + +#include +#include +#include + +/* + * __ratelimit - rate limiting + * @rs: ratelimit_state data + * @func: name of calling function + * + * This enforces a rate limit: not more than @rs->burst callbacks + * in every @rs->interval + * + * RETURNS: + * 0 means callbacks will be suppressed. + * 1 means go ahead and do it. + */ +int ___ratelimit(struct ratelimit_state *rs, const char *func) +{ + int ret; + + if (!rs->interval) + return 1; + + /* + * If we contend on this state's lock then almost + * by definition we are too busy to print a message, + * in addition to the one that will be printed by + * the entity that is holding the lock already: + */ + if (!raw_spin_trylock(&rs->lock)) + return 0; + + if (!rs->begin) + rs->begin = jiffies; + + if (time_is_before_jiffies(rs->begin + rs->interval)) { + if (rs->missed) { + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + printk(KERN_WARNING + "%s: %d callbacks suppressed\n", + func, rs->missed); + rs->missed = 0; + } + } + rs->begin = jiffies; + rs->printed = 0; + } + if (rs->burst && rs->burst > rs->printed) { + rs->printed++; + ret = 1; + } else { + rs->missed++; + ret = 0; + } + raw_spin_unlock(&rs->lock); + + return ret; +} +EXPORT_SYMBOL(___ratelimit); diff --git a/linux/shrinker.c b/linux/shrinker.c index f6c979a..23e288d 100644 --- a/linux/shrinker.c +++ b/linux/shrinker.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -10,7 +11,7 @@ static LIST_HEAD(shrinker_list); static DEFINE_MUTEX(shrinker_lock); -int register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
{ mutex_lock(&shrinker_lock); list_add_tail(&shrinker->list, &shrinker_list); @@ -39,46 +40,71 @@ static u64 parse_meminfo_line(const char *line) return v << 10; } -static struct meminfo read_meminfo(void) +void si_meminfo(struct sysinfo *val) { - struct meminfo ret = { 0 }; size_t len, n = 0; char *line = NULL; const char *v; FILE *f; + memset(val, 0, sizeof(*val)); + val->mem_unit = 1; + f = fopen("/proc/meminfo", "r"); if (!f) - return ret; + return; while ((len = getline(&line, &n, f)) != -1) { if ((v = strcmp_prefix(line, "MemTotal:"))) - ret.total = parse_meminfo_line(v); + val->totalram = parse_meminfo_line(v); if ((v = strcmp_prefix(line, "MemAvailable:"))) - ret.available = parse_meminfo_line(v); + val->freeram = parse_meminfo_line(v); } fclose(f); free(line); +} + +static void run_shrinkers_allocation_failed(gfp_t gfp_mask) +{ + struct shrinker *shrinker; + + mutex_lock(&shrinker_lock); + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { .gfp_mask = gfp_mask, }; + + unsigned long have = shrinker->count_objects(shrinker, &sc); - return ret; + sc.nr_to_scan = have / 8; + + shrinker->scan_objects(shrinker, &sc); + } + mutex_unlock(&shrinker_lock); } -void run_shrinkers(void) +void run_shrinkers(gfp_t gfp_mask, bool allocation_failed) { struct shrinker *shrinker; - struct meminfo info; + struct sysinfo info; s64 want_shrink; + if (!(gfp_mask & GFP_KERNEL)) + return; + /* Fast out if there are no shrinkers to run. 
*/ if (list_empty(&shrinker_list)) return; - info = read_meminfo(); + if (allocation_failed) { + run_shrinkers_allocation_failed(gfp_mask); + return; + } + + si_meminfo(&info); - if (info.total && info.available) { - want_shrink = (info.total >> 2) - info.available; + if (info.totalram && info.freeram) { + want_shrink = (info.totalram >> 2) - info.freeram; if (want_shrink <= 0) return; @@ -92,7 +118,8 @@ void run_shrinkers(void) mutex_lock(&shrinker_lock); list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { - .nr_to_scan = want_shrink >> PAGE_SHIFT + .gfp_mask = gfp_mask, + .nr_to_scan = want_shrink >> PAGE_SHIFT }; shrinker->scan_objects(shrinker, &sc); diff --git a/linux/six.c b/linux/six.c index fca1208..39f7ea7 100644 --- a/linux/six.c +++ b/linux/six.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,9 +17,11 @@ #define EBUG_ON(cond) do {} while (0) #endif -#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) +#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) #define six_release(l) lock_release(l, _RET_IP_) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ u64 lock_val; @@ -65,14 +68,15 @@ struct six_lock_vals { } static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - union six_lock_state old) + union six_lock_state old, + struct task_struct *owner) { if (type != SIX_LOCK_intent) return; if (!old.intent_lock) { EBUG_ON(lock->owner); - lock->owner = current; + lock->owner = owner; } else { EBUG_ON(lock->owner != current); } @@ -88,64 +92,21 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; -}; - /* This is probably up there with the more evil things I've done */ #define waitlist_bitnr(id) 
ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - unsigned waitlist_id) -{ - if (waitlist_id == SIX_LOCK_write) { - if (state.write_locking && !state.read_lock) { - struct task_struct *p = READ_ONCE(lock->owner); - if (p) - wake_up_process(p); - } - } else { - struct list_head *wait_list = &lock->wait_list[waitlist_id]; - struct six_lock_waiter *w, *next; - - if (!(state.waiters & (1 << waitlist_id))) - return; - - clear_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, wait_list, list) { - list_del_init(&w->list); - - if (wake_up_process(w->task) && - waitlist_id != SIX_LOCK_read) { - if (!list_empty(wait_list)) - set_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - break; - } - } - - raw_spin_unlock(&lock->wait_lock); - } -} - -static __always_inline bool do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - bool try) +static int __do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + struct task_struct *task, + bool try) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old, new; - bool ret; + int ret; u64 v; - EBUG_ON(type == SIX_LOCK_write && lock->owner != current); + EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); - EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); /* @@ -164,7 +125,6 @@ static __always_inline bool do_six_trylock_type(struct six_lock *lock, */ if (type == SIX_LOCK_read && lock->readers) { -retry: preempt_disable(); this_cpu_inc(*lock->readers); /* signal that we own lock */ @@ -181,38 +141,21 @@ retry: * lock, issue a wakeup because we might have caused a * spurious trylock failure: */ - if (old.write_locking) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - 
wake_up_process(p); - } - - /* - * If we failed from the lock path and the waiting bit wasn't - * set, set it: - */ - if (!try && !ret) { - v = old.v; - - do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) - goto retry; - - if (new.waiters & (1 << type)) - break; - - new.waiters |= 1 << type; - } while ((v = atomic64_cmpxchg(&lock->state.counter, - old.v, new.v)) != old.v); - } + if (old.write_locking) + ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); smp_mb__after_atomic(); + } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { + atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), + &lock->state.counter); + /* + * pairs with barrier after unlock and before checking + * for readers in unlock path + */ + smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); @@ -229,7 +172,8 @@ retry: if (try && !ret) { old.v = atomic64_add_return(v, &lock->state.counter); - six_lock_wakeup(lock, old, SIX_LOCK_read); + if (old.waiters & (1 << SIX_LOCK_read)) + ret = -1 - SIX_LOCK_read; } else { atomic64_add(v, &lock->state.counter); } @@ -243,8 +187,7 @@ retry: if (type == SIX_LOCK_write) new.write_locking = 0; - } else if (!try && type != SIX_LOCK_write && - !(new.waiters & (1 << type))) + } else if (!try && !(new.waiters & (1 << type))) new.waiters |= 1 << type; else break; /* waiting bit already set */ @@ -256,14 +199,84 @@ retry: EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); } - if (ret) - six_set_owner(lock, type, old); + if (ret > 0) + six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); + EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking)); return ret; } +static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +{ + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; 
+again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; + + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock_type(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + __list_del(w->list.prev, w->list.next); + task = w->task; + /* + * Do no writes to @w besides setting lock_acquired - otherwise + * we would need a memory barrier: + */ + barrier(); + w->lock_acquired = true; + wake_up_process(task); + } + + clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} + +static inline void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && state.read_lock) + return; + + if (!(state.waiters & (1 << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +static bool do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + bool try) +{ + int ret; + + ret = __do_six_trylock_type(lock, type, current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 0; +} + __always_inline __flatten static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) { @@ -271,7 +284,7 @@ static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) return false; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; } @@ -304,15 +317,11 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, * Similar to the lock path, we may have caused a spurious write * lock fail and need to issue a wakeup: */ - if (old.write_locking) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - wake_up_process(p); - } + 
if (old.write_locking) + six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return ret; } @@ -327,41 +336,34 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, old.v, old.v + l[type].lock_val)) != old.v); - six_set_owner(lock, type, old); + six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; } #ifdef CONFIG_LOCK_SPIN_ON_OWNER -static inline int six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait) { - struct task_struct *owner; - int retval = 1; + struct task_struct *owner, *task = current; - if (need_resched()) - return 0; + switch (wait->lock_want) { + case SIX_LOCK_read: + break; + case SIX_LOCK_intent: + if (lock->wait_list.next != &wait->list) + return false; + break; + case SIX_LOCK_write: + return false; + } rcu_read_lock(); owner = READ_ONCE(lock->owner); - if (owner) - retval = owner->on_cpu; - rcu_read_unlock(); - /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. - */ - return retval; -} - -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner) -{ - bool ret = true; - rcu_read_lock(); - while (lock->owner == owner) { + while (owner && lock->owner == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. 
If that fails, @@ -370,85 +372,27 @@ static inline bool six_spin_on_owner(struct six_lock *lock, */ barrier(); - if (!owner->on_cpu || need_resched()) { - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; - - if (type == SIX_LOCK_write) - return false; - - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; - - if (!osq_lock(&lock->osq)) - goto fail; - - while (1) { - struct task_struct *owner; - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner)) - break; - - if (do_six_trylock_type(lock, type, false)) { - osq_unlock(&lock->osq); - preempt_enable(); - return true; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let + * If we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(task))) + if (wait->lock_acquired || + !owner->on_cpu || + rt_task(task) || + need_resched()) break; - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ cpu_relax(); } + rcu_read_unlock(); - osq_unlock(&lock->osq); -fail: - preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. 
- */ - if (need_resched()) - schedule(); - - return false; + return wait->lock_acquired; } #else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait) { return false; } @@ -457,10 +401,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type noinline static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p) { union six_lock_state old; - struct six_lock_waiter wait; int ret = 0; if (type == SIX_LOCK_write) { @@ -469,47 +413,73 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty smp_mb__after_atomic(); } - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (ret) - goto out_before_sleep; + lock_contended(&lock->dep_map, _RET_IP_); - if (six_optimistic_spin(lock, type)) - goto out_before_sleep; + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; - lock_contended(&lock->dep_map, _RET_IP_); + raw_spin_lock(&lock->wait_lock); + if (!(lock->state.waiters & (1 << type))) + set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); + /* + * Retry taking the lock after taking waitlist lock, have raced with an + * unlock: + */ + ret = __do_six_trylock_type(lock, type, current, false); + if (ret <= 0) { + wait->start_time = local_clock(); - INIT_LIST_HEAD(&wait.list); - wait.task = current; + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + + list_add_tail(&wait->list, &lock->wait_list); + } + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(ret > 0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + 
__six_lock_wakeup(lock, -ret - 1); + ret = 0; + } + + if (six_optimistic_spin(lock, wait)) + goto out; while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (type == SIX_LOCK_write) - EBUG_ON(lock->owner != current); - else if (list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_add_tail(&wait.list, &lock->wait_list[type]); - raw_spin_unlock(&lock->wait_lock); - } - if (do_six_trylock_type(lock, type, false)) + if (wait->lock_acquired) break; ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (ret) + if (unlikely(ret)) { + raw_spin_lock(&lock->wait_lock); + if (!wait->lock_acquired) + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + + if (wait->lock_acquired) + do_six_unlock_type(lock, type); break; + } schedule(); } __set_current_state(TASK_RUNNING); - - if (!list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_del_init(&wait.list); - raw_spin_unlock(&lock->wait_lock); - } -out_before_sleep: - if (ret && type == SIX_LOCK_write) { +out: + if (ret && type == SIX_LOCK_write && lock->state.write_locking) { old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), &lock->state.counter); six_lock_wakeup(lock, old, SIX_LOCK_read); @@ -518,17 +488,20 @@ out_before_sleep: return ret; } -__always_inline -static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) +__always_inline __flatten +static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) { int ret; + wait->start_time = 0; + if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); ret = do_six_trylock_type(lock, type, true) ? 
0 - : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); + : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map); @@ -538,28 +511,23 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, return ret; } +__always_inline +static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p); +} + __always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state state; - EBUG_ON(type == SIX_LOCK_write && - !(lock->state.v & __SIX_LOCK_HELD_intent)); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map); - - if (type == SIX_LOCK_intent) { - EBUG_ON(lock->owner != current); - - if (lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - + if (type == SIX_LOCK_intent) lock->owner = NULL; - } if (type == SIX_LOCK_read && lock->readers) { @@ -576,6 +544,27 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) six_lock_wakeup(lock, state, l[type].unlock_wakeup); } +__always_inline __flatten +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + EBUG_ON(type == SIX_LOCK_write && + !(lock->state.v & __SIX_LOCK_HELD_intent)); + EBUG_ON((type == SIX_LOCK_write || + type == SIX_LOCK_intent) && + lock->owner != current); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map); + + if (type == SIX_LOCK_intent && + lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + do_six_unlock_type(lock, type); +} + #define __SIX_LOCK(type) \ bool six_trylock_##type(struct six_lock *lock) \ { \ @@ -596,6 +585,14 @@ int 
six_lock_##type(struct six_lock *lock, \ } \ EXPORT_SYMBOL_GPL(six_lock_##type); \ \ +int six_lock_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p)\ +{ \ + return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\ +} \ +EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \ + \ void six_unlock_##type(struct six_lock *lock) \ { \ __six_unlock_type(lock, SIX_LOCK_##type); \ @@ -639,7 +636,7 @@ bool six_lock_tryupgrade(struct six_lock *lock) if (lock->readers) this_cpu_dec(*lock->readers); - six_set_owner(lock, SIX_LOCK_intent, old); + six_set_owner(lock, SIX_LOCK_intent, old, current); return true; } @@ -671,7 +668,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - six_acquire(&lock->dep_map, 0); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); /* XXX: assert already locked, and that we don't overflow: */ @@ -698,47 +695,20 @@ EXPORT_SYMBOL_GPL(six_lock_increment); void six_lock_wakeup_all(struct six_lock *lock) { + union six_lock_state state = lock->state; struct six_lock_waiter *w; - raw_spin_lock(&lock->wait_lock); + six_lock_wakeup(lock, state, SIX_LOCK_read); + six_lock_wakeup(lock, state, SIX_LOCK_intent); + six_lock_wakeup(lock, state, SIX_LOCK_write); - list_for_each_entry(w, &lock->wait_list[0], list) - wake_up_process(w->task); - list_for_each_entry(w, &lock->wait_list[1], list) + raw_spin_lock(&lock->wait_lock); + list_for_each_entry(w, &lock->wait_list, list) wake_up_process(w->task); - raw_spin_unlock(&lock->wait_lock); } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -struct free_pcpu_rcu { - struct rcu_head rcu; - void __percpu *p; -}; - -static void free_pcpu_rcu_fn(struct rcu_head *_rcu) -{ - struct free_pcpu_rcu *rcu = - container_of(_rcu, struct free_pcpu_rcu, rcu); - - free_percpu(rcu->p); - kfree(rcu); -} - -void six_lock_pcpu_free_rcu(struct six_lock *lock) -{ - struct free_pcpu_rcu 
*rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); - - if (!rcu) - return; - - rcu->p = lock->readers; - lock->readers = NULL; - - call_rcu(&rcu->rcu, free_pcpu_rcu_fn); -} -EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); - void six_lock_pcpu_free(struct six_lock *lock) { BUG_ON(lock->readers && pcpu_read_count(lock)); @@ -757,3 +727,27 @@ void six_lock_pcpu_alloc(struct six_lock *lock) #endif } EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); + +/* + * Returns lock held counts, for both read and intent + */ +struct six_lock_count six_lock_counts(struct six_lock *lock) +{ + struct six_lock_count ret; + + ret.n[SIX_LOCK_read] = 0; + ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = lock->state.seq & 1; + + if (!lock->readers) + ret.n[SIX_LOCK_read] += lock->state.read_lock; + else { + int cpu; + + for_each_possible_cpu(cpu) + ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu); + } + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_counts); diff --git a/linux/string.c b/linux/string.c index fd2797e..a32a899 100644 --- a/linux/string.c +++ b/linux/string.c @@ -21,8 +21,10 @@ #include #include +#include #include +#include #include #include @@ -62,6 +64,31 @@ size_t strlcpy(char *dest, const char *src, size_t size) return ret; } +ssize_t strscpy(char *dest, const char *src, size_t count) +{ + long res = 0; + + if (count == 0 || WARN_ON_ONCE(count > INT_MAX)) + return -E2BIG; + + while (count) { + char c; + + c = src[res]; + dest[res] = c; + if (!c) + return res; + res++; + count--; + } + + /* Hit buffer length without finding a NUL; force NUL-termination. 
*/ + if (res) + dest[res-1] = '\0'; + + return -E2BIG; +} + void memzero_explicit(void *s, size_t count) { memset(s, 0, count); diff --git a/linux/string_helpers.c b/linux/string_helpers.c new file mode 100644 index 0000000..29c498a --- /dev/null +++ b/linux/string_helpers.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Helpers for formatting and printing strings + * + * Copyright 31 August 2008 James Bottomley + * Copyright (C) 2013, Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * string_get_size - get the size in the specified units + * @size: The size to be converted in blocks + * @blk_size: Size of the block (use 1 for size in bytes) + * @units: units to use (powers of 1000 or 1024) + * @buf: buffer to format to + * @len: length of buffer + * + * This function returns a string formatted to 3 significant figures + * giving the size in the required units. @buf should have room for + * at least 9 bytes and will always be zero terminated. + * + */ +int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, + char *buf, int len) +{ + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" + }; + static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, + [STRING_UNITS_2] = units_2, + }; + static const unsigned int divisor[] = { + [STRING_UNITS_10] = 1000, + [STRING_UNITS_2] = 1024, + }; + static const unsigned int rounding[] = { 500, 50, 5 }; + int i = 0, j; + u32 remainder = 0, sf_cap; + char tmp[12]; + const char *unit; + + tmp[0] = '\0'; + + if (blk_size == 0) + size = 0; + if (size == 0) + goto out; + + /* This is Napier's algorithm. 
Reduce the original block size to + * + * coefficient * divisor[units]^i + * + * we do the reduction so both coefficients are just under 32 bits so + * that multiplying them together won't overflow 64 bits and we keep + * as much precision as possible in the numbers. + * + * Note: it's safe to throw away the remainders here because all the + * precision is in the coefficients. + */ + while (blk_size >> 32) { + do_div(blk_size, divisor[units]); + i++; + } + + while (size >> 32) { + do_div(size, divisor[units]); + i++; + } + + /* now perform the actual multiplication keeping i as the sum of the + * two logarithms */ + size *= blk_size; + + /* and logarithmically reduce it until it's just under the divisor */ + while (size >= divisor[units]) { + remainder = do_div(size, divisor[units]); + i++; + } + + /* work out in j how many digits of precision we need from the + * remainder */ + sf_cap = size; + for (j = 0; sf_cap*10 < 1000; j++) + sf_cap *= 10; + + if (units == STRING_UNITS_2) { + /* express the remainder as a decimal. 
It's currently the + * numerator of a fraction whose denominator is + * divisor[units], which is 1 << 10 for STRING_UNITS_2 */ + remainder *= 1000; + remainder >>= 10; + } + + /* add a 5 to the digit below what will be printed to ensure + * an arithmetical round up and carry it through to size */ + remainder += rounding[j]; + if (remainder >= 1000) { + remainder -= 1000; + size += 1; + } + + if (j) { + snprintf(tmp, sizeof(tmp), ".%03u", remainder); + tmp[j+1] = '\0'; + } + + out: + if (i >= ARRAY_SIZE(units_2)) + unit = "UNK"; + else + unit = units_str[units][i]; + + return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit); +} +EXPORT_SYMBOL(string_get_size); diff --git a/linux/timer.c b/linux/timer.c index eb93786..7d519a4 100644 --- a/linux/timer.c +++ b/linux/timer.c @@ -93,9 +93,11 @@ do { \ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used); \ - heap_sift_down(h, _i, cmp); \ - heap_sift(h, _i, cmp); \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used); \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ } while (0) #define heap_pop(h, d, cmp) \ diff --git a/linux/zstd_compress_module.c b/linux/zstd_compress_module.c new file mode 100644 index 0000000..35cc5cb --- /dev/null +++ b/linux/zstd_compress_module.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include +#include +#include +#include + +#define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ + size_t const __ret = (ret); \ + if (ZSTD_isError(__ret)) \ + return __ret; \ + } while (0) + +static size_t zstd_cctx_init(zstd_cctx *cctx, const zstd_parameters *parameters, + unsigned long long pledged_src_size) +{ + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_reset( + cctx, ZSTD_reset_session_and_parameters)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setPledgedSrcSize( + cctx, pledged_src_size)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_windowLog, parameters->cParams.windowLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_hashLog, parameters->cParams.hashLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_chainLog, parameters->cParams.chainLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_searchLog, parameters->cParams.searchLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_minMatch, parameters->cParams.minMatch)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_targetLength, parameters->cParams.targetLength)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_strategy, parameters->cParams.strategy)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_contentSizeFlag, parameters->fParams.contentSizeFlag)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_checksumFlag, parameters->fParams.checksumFlag)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_dictIDFlag, !parameters->fParams.noDictIDFlag)); + return 0; +} + +int zstd_min_clevel(void) +{ + return ZSTD_minCLevel(); +} +EXPORT_SYMBOL(zstd_min_clevel); + +int zstd_max_clevel(void) +{ + return ZSTD_maxCLevel(); +} +EXPORT_SYMBOL(zstd_max_clevel); + +size_t zstd_compress_bound(size_t src_size) +{ + return ZSTD_compressBound(src_size); +} +EXPORT_SYMBOL(zstd_compress_bound); + +zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size) +{ + return ZSTD_getParams(level, 
estimated_src_size, 0); +} +EXPORT_SYMBOL(zstd_get_params); + +size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) +{ + return ZSTD_estimateCCtxSize_usingCParams(*cparams); +} +EXPORT_SYMBOL(zstd_cctx_workspace_bound); + +zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) +{ + if (workspace == NULL) + return NULL; + return ZSTD_initStaticCCtx(workspace, workspace_size); +} +EXPORT_SYMBOL(zstd_init_cctx); + +size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size, const zstd_parameters *parameters) +{ + ZSTD_FORWARD_IF_ERR(zstd_cctx_init(cctx, parameters, src_size)); + return ZSTD_compress2(cctx, dst, dst_capacity, src, src_size); +} +EXPORT_SYMBOL(zstd_compress_cctx); + +size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams) +{ + return ZSTD_estimateCStreamSize_usingCParams(*cparams); +} +EXPORT_SYMBOL(zstd_cstream_workspace_bound); + +zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters, + unsigned long long pledged_src_size, void *workspace, size_t workspace_size) +{ + zstd_cstream *cstream; + + if (workspace == NULL) + return NULL; + + cstream = ZSTD_initStaticCStream(workspace, workspace_size); + if (cstream == NULL) + return NULL; + + /* 0 means unknown in linux zstd API but means 0 in new zstd API */ + if (pledged_src_size == 0) + pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN; + + if (ZSTD_isError(zstd_cctx_init(cstream, parameters, pledged_src_size))) + return NULL; + + return cstream; +} +EXPORT_SYMBOL(zstd_init_cstream); + +size_t zstd_reset_cstream(zstd_cstream *cstream, + unsigned long long pledged_src_size) +{ + return ZSTD_resetCStream(cstream, pledged_src_size); +} +EXPORT_SYMBOL(zstd_reset_cstream); + +size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output, + zstd_in_buffer *input) +{ + return ZSTD_compressStream(cstream, output, input); +} +EXPORT_SYMBOL(zstd_compress_stream); + +size_t 
zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output) +{ + return ZSTD_flushStream(cstream, output); +} +EXPORT_SYMBOL(zstd_flush_stream); + +size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output) +{ + return ZSTD_endStream(cstream, output); +} +EXPORT_SYMBOL(zstd_end_stream); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Compressor"); diff --git a/linux/zstd_decompress_module.c b/linux/zstd_decompress_module.c new file mode 100644 index 0000000..7e8cd44 --- /dev/null +++ b/linux/zstd_decompress_module.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include +#include +#include +#include + +/* Common symbols. zstd_compress must depend on zstd_decompress. */ + +unsigned int zstd_is_error(size_t code) +{ + return ZSTD_isError(code); +} +EXPORT_SYMBOL(zstd_is_error); + +zstd_error_code zstd_get_error_code(size_t code) +{ + return ZSTD_getErrorCode(code); +} +EXPORT_SYMBOL(zstd_get_error_code); + +const char *zstd_get_error_name(size_t code) +{ + return ZSTD_getErrorName(code); +} +EXPORT_SYMBOL(zstd_get_error_name); + +/* Decompression symbols. 
*/ + +size_t zstd_dctx_workspace_bound(void) +{ + return ZSTD_estimateDCtxSize(); +} +EXPORT_SYMBOL(zstd_dctx_workspace_bound); + +zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size) +{ + if (workspace == NULL) + return NULL; + return ZSTD_initStaticDCtx(workspace, workspace_size); +} +EXPORT_SYMBOL(zstd_init_dctx); + +size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size) +{ + return ZSTD_decompressDCtx(dctx, dst, dst_capacity, src, src_size); +} +EXPORT_SYMBOL(zstd_decompress_dctx); + +size_t zstd_dstream_workspace_bound(size_t max_window_size) +{ + return ZSTD_estimateDStreamSize(max_window_size); +} +EXPORT_SYMBOL(zstd_dstream_workspace_bound); + +zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace, + size_t workspace_size) +{ + if (workspace == NULL) + return NULL; + (void)max_window_size; + return ZSTD_initStaticDStream(workspace, workspace_size); +} +EXPORT_SYMBOL(zstd_init_dstream); + +size_t zstd_reset_dstream(zstd_dstream *dstream) +{ + return ZSTD_resetDStream(dstream); +} +EXPORT_SYMBOL(zstd_reset_dstream); + +size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + zstd_in_buffer *input) +{ + return ZSTD_decompressStream(dstream, output, input); +} +EXPORT_SYMBOL(zstd_decompress_stream); + +size_t zstd_find_frame_compressed_size(const void *src, size_t src_size) +{ + return ZSTD_findFrameCompressedSize(src, src_size); +} +EXPORT_SYMBOL(zstd_find_frame_compressed_size); + +size_t zstd_get_frame_header(zstd_frame_header *header, const void *src, + size_t src_size) +{ + return ZSTD_getFrameHeader(header, src, src_size); +} +EXPORT_SYMBOL(zstd_get_frame_header); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Decompressor"); diff --git a/nix/overlay.nix b/nix/overlay.nix index 42d3fb2..8138f20 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -4,7 +4,6 @@ final: prev: { tools = final.callPackage ../default.nix { 
testWithValgrind = false; filter = filter.lib; - lastModified = builtins.substring 0 8 self.lastModifiedDate; versionString = self.version; }; toolsValgrind = final.bcachefs.tools.override { diff --git a/qcow2.c b/qcow2.c index 7cf4992..d01fa94 100644 --- a/qcow2.c +++ b/qcow2.c @@ -94,7 +94,7 @@ void qcow2_write_image(int infd, int outfd, ranges *data, ranges_sort_merge(data); /* Write data: */ - darray_foreach(r, *data) + darray_for_each(*data, r) for (src_offset = r->start; src_offset < r->end; src_offset += block_size) { diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..fc7929d --- /dev/null +++ b/shell.nix @@ -0,0 +1,18 @@ +{ kversion ? "linux_5_15" +, pkgs ? import {} }: + +with pkgs; + +let + tools = pkgs.callPackage ./default.nix { doCheck = false ;} ; +in +mkShell { + buildInputs = [ + linuxKernel.packages.${kversion}.perf + gdb + ccls # code completion in neovim/emacs + ]; + inputsFrom = [ + tools + ]; +} diff --git a/tests/valgrind-suppressions.txt b/tests/valgrind-suppressions.txt index d83e052..612a08e 100644 --- a/tests/valgrind-suppressions.txt +++ b/tests/valgrind-suppressions.txt @@ -15,3 +15,24 @@ ... fun:call_rcu_data_init } +{ + urcu_memb_call_rcu + Memcheck:Leak + match-leak-kinds: possible + ... + fun:pthread_create* + obj:/*/liburcu.so.* + ... + fun:urcu_memb_call_rcu +} +{ + pthread_create + Memcheck:Leak + match-leak-kinds: possible + fun:calloc + ... + fun:allocate_stack + fun:pthread_create* + fun:kthread_create + fun:bch2_rebalance_start +} diff --git a/tools-util.c b/tools-util.c index 9491779..f29d202 100644 --- a/tools-util.c +++ b/tools-util.c @@ -126,63 +126,19 @@ struct stat xstat(const char *path) return statbuf; } -/* Formatting: */ - -int printf_pad(unsigned pad, const char * fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = vprintf(fmt, args); - va_end(args); - - while (ret++ < pad) - putchar(' '); - - return ret; -} +/* File parsing (i.e. 
sysfs) */ -struct units_buf __pr_units(s64 _v, enum units units) +void write_file_str(int dirfd, const char *path, const char *str) { - struct units_buf ret; - char *out = ret.b, *end = out + sizeof(ret.b); - u64 v = _v; + int fd = xopenat(dirfd, path, O_WRONLY); + ssize_t wrote, len = strlen(str); - if (_v < 0) { - out += scnprintf(out, end - out, "-"); - v = -_v; - } - - switch (units) { - case BYTES: - snprintf(out, end - out, "%llu", v << 9); - break; - case SECTORS: - snprintf(out, end - out, "%llu", v); - break; - case HUMAN_READABLE: - v <<= 9; - - if (v >= 1024) { - int exp = log(v) / log(1024); - snprintf(out, end - out, "%.1f%c", - v / pow(1024, exp), - "KMGTPE"[exp-1]); - } else { - snprintf(out, end - out, "%llu", v); - } - - break; - } - - return ret; + wrote = write(fd, str, len); + if (wrote != len) + die("read error: %m"); + close(fd); } -/* Argument parsing stuff: */ - -/* File parsing (i.e. sysfs) */ - char *read_file_str(int dirfd, const char *path) { int fd = xopenat(dirfd, path, O_RDONLY); @@ -331,22 +287,21 @@ static int range_cmp(const void *_l, const void *_r) void ranges_sort_merge(ranges *r) { struct range *t, *i; - ranges tmp = { NULL }; + ranges tmp = { 0 }; - sort(&darray_item(*r, 0), darray_size(*r), - sizeof(darray_item(*r, 0)), range_cmp, NULL); + sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL); /* Merge contiguous ranges: */ - darray_foreach(i, *r) { - t = tmp.size ? &tmp.item[tmp.size - 1] : NULL; + darray_for_each(*r, i) { + t = tmp.nr ? 
&tmp.data[tmp.nr - 1] : NULL; if (t && t->end >= i->start) t->end = max(t->end, i->end); else - darray_append(tmp, *i); + darray_push(&tmp, *i); } - darray_free(*r); + darray_exit(r); *r = tmp; } @@ -354,7 +309,7 @@ void ranges_roundup(ranges *r, unsigned block_size) { struct range *i; - darray_foreach(i, *r) { + darray_for_each(*r, i) { i->start = round_down(i->start, block_size); i->end = round_up(i->end, block_size); } @@ -364,7 +319,7 @@ void ranges_rounddown(ranges *r, unsigned block_size) { struct range *i; - darray_foreach(i, *r) { + darray_for_each(*r, i) { i->start = round_up(i->start, block_size); i->end = round_down(i->end, block_size); i->end = max(i->end, i->start); diff --git a/tools-util.h b/tools-util.h index 9468f07..d1122f5 100644 --- a/tools-util.h +++ b/tools-util.h @@ -18,7 +18,7 @@ #include #include #include -#include "ccan/darray/darray.h" +#include "libbcachefs/darray.h" #define noreturn __attribute__((noreturn)) @@ -53,22 +53,7 @@ struct stat xstat(const char *); _ret; \ }) -int printf_pad(unsigned pad, const char * fmt, ...); - -enum units { - BYTES, - SECTORS, - HUMAN_READABLE, -}; - -struct units_buf __pr_units(s64, enum units); - -struct units_buf { - char b[20]; -}; - -#define pr_units(_v, _u) &(__pr_units(_v, _u).b[0]) - +void write_file_str(int, const char *, const char *); char *read_file_str(int, const char *); u64 read_file_u64(int, const char *); @@ -86,14 +71,14 @@ struct range { u64 end; }; -typedef darray(struct range) ranges; +typedef DARRAY(struct range) ranges; static inline void range_add(ranges *data, u64 offset, u64 size) { - darray_append(*data, (struct range) { + darray_push(data, ((struct range) { .start = offset, .end = offset + size - }); + })); } void ranges_sort_merge(ranges *); @@ -109,9 +94,9 @@ struct hole_iter { static inline struct range hole_iter_next(struct hole_iter *iter) { struct range r = { - .start = iter->idx ? iter->r.item[iter->idx - 1].end : 0, - .end = iter->idx < iter->r.size - ? 
iter->r.item[iter->idx].start : iter->end, + .start = iter->idx ? iter->r.data[iter->idx - 1].end : 0, + .end = iter->idx < iter->r.nr + ? iter->r.data[iter->idx].start : iter->end, }; BUG_ON(r.start > r.end); @@ -122,7 +107,7 @@ static inline struct range hole_iter_next(struct hole_iter *iter) #define for_each_hole(_iter, _ranges, _end, _i) \ for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \ - (_iter.idx <= _iter.r.size && \ + (_iter.idx <= _iter.r.nr && \ (_i = hole_iter_next(&_iter), true));) #include -- 2.39.2