git.sesse.net Git - bcachefs-tools-debian/commitdiff
New upstream release
author    Jonathan Carter <jcc@debian.org>
Mon, 31 Oct 2022 09:53:37 +0000 (11:53 +0200)
committer Jonathan Carter <jcc@debian.org>
Mon, 31 Oct 2022 09:53:37 +0000 (11:53 +0200)
211 files changed:
Makefile
Makefile.compiler
bcachefs.8
bcachefs.c
ccan/darray/LICENSE [deleted file]
ccan/darray/_info [deleted file]
ccan/darray/darray.h [deleted file]
cmd_attr.c
cmd_data.c
cmd_device.c
cmd_dump.c [new file with mode: 0644]
cmd_format.c
cmd_fs.c
cmd_key.c
cmd_list.c [moved from cmd_debug.c with 57% similarity]
cmd_list_journal.c [new file with mode: 0644]
cmd_migrate.c
cmd_option.c [new file with mode: 0644]
cmds.h
crypto.c
crypto.h
debian/bcachefs-tools.postinst [new file with mode: 0644]
debian/bcachefs-tools.postrm [new file with mode: 0644]
debian/changelog
debian/control
debian/files
default.nix
include/linux/bio.h
include/linux/bitops.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/bug.h
include/linux/errname.h [new file with mode: 0644]
include/linux/freezer.h
include/linux/generic-radix-tree.h
include/linux/jiffies.h
include/linux/kernel.h
include/linux/kmemleak.h [new file with mode: 0644]
include/linux/kobject.h
include/linux/list.h
include/linux/mean_and_variance.h [new file with mode: 0644]
include/linux/mm.h [new file with mode: 0644]
include/linux/prandom.h [new file with mode: 0644]
include/linux/prefetch.h
include/linux/pretty-printers.h [new file with mode: 0644]
include/linux/printbuf.h [new file with mode: 0644]
include/linux/printk.h
include/linux/random.h
include/linux/rwsem.h
include/linux/sched.h
include/linux/shrinker.h
include/linux/six.h
include/linux/slab.h
include/linux/spinlock.h
include/linux/string.h
include/linux/string_helpers.h [new file with mode: 0644]
include/linux/sysfs.h
include/linux/types.h
include/linux/vmalloc.h
include/linux/zstd.h
include/linux/zstd_errors.h [new file with mode: 0644]
include/trace/events/bcachefs.h
libbcachefs.c
libbcachefs.h
libbcachefs/acl.c
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/backpointers.c [new file with mode: 0644]
libbcachefs/backpointers.h [new file with mode: 0644]
libbcachefs/bbpos.h [new file with mode: 0644]
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_buf.h
libbcachefs/bkey_cmp.h [new file with mode: 0644]
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bkey_sort.c
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.c [new file with mode: 0644]
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/clock.c
libbcachefs/compress.c
libbcachefs/counters.c [new file with mode: 0644]
libbcachefs/counters.h [new file with mode: 0644]
libbcachefs/darray.h [new file with mode: 0644]
libbcachefs/data_update.c [new file with mode: 0644]
libbcachefs/data_update.h [new file with mode: 0644]
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/disk_groups.c
libbcachefs/disk_groups.h
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/errcode.c [new file with mode: 0644]
libbcachefs/errcode.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/extent_update.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-common.c
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_sb.c [new file with mode: 0644]
libbcachefs/journal_sb.h [new file with mode: 0644]
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/keylist.c
libbcachefs/lru.c [new file with mode: 0644]
libbcachefs/lru.h [new file with mode: 0644]
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/movinggc.h
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/quota.c
libbcachefs/quota.h
libbcachefs/rebalance.c
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/siphash.c
libbcachefs/str_hash.h
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/subvolume_types.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/sysfs.h
libbcachefs/tests.c
libbcachefs/trace.c
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/varint.c
libbcachefs/vstructs.h
libbcachefs/xattr.c
libbcachefs/xattr.h
linux/bio.c
linux/blkdev.c
linux/generic-radix-tree.c
linux/int_sqrt.c [new file with mode: 0644]
linux/kthread.c
linux/mean_and_variance.c [new file with mode: 0644]
linux/pretty-printers.c [new file with mode: 0644]
linux/printbuf.c [new file with mode: 0644]
linux/printbuf_userspace.c [new file with mode: 0644]
linux/ratelimit.c [new file with mode: 0644]
linux/shrinker.c
linux/six.c
linux/string.c
linux/string_helpers.c [new file with mode: 0644]
linux/timer.c
linux/zstd_compress_module.c [new file with mode: 0644]
linux/zstd_decompress_module.c [new file with mode: 0644]
nix/overlay.nix
qcow2.c
shell.nix [new file with mode: 0644]
tests/valgrind-suppressions.txt
tools-util.c
tools-util.h

index e49534e6c66323504c037119bbf509efa860f6b2..d460a6d3d0b3e6ab742ec670f950b65c2fb403ad 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ PREFIX?=/usr/local
 PKG_CONFIG?=pkg-config
 INSTALL=install
 
-CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC                             \
+CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC                     \
        -Wno-pointer-sign                                       \
        -fno-strict-aliasing                                    \
        -fno-delete-null-pointer-checks                         \
@@ -47,7 +47,7 @@ CFLAGS+=$(call cc-disable-warning, zero-length-array)
 CFLAGS+=$(call cc-disable-warning, shift-overflow)
 CFLAGS+=$(call cc-disable-warning, enum-conversion)
 
-PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev"
+PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev libkeyutils"
 ifdef BCACHEFS_FUSE
        PKGCONFIG_LIBS+="fuse3 >= 3.7"
        CFLAGS+=-DBCACHEFS_FUSE
@@ -189,6 +189,22 @@ update-bcachefs-sources:
        git add include/linux/list_nulls.h
        cp $(LINUX_DIR)/include/linux/poison.h include/linux/
        git add include/linux/poison.h
+       cp $(LINUX_DIR)/include/linux/generic-radix-tree.h include/linux/
+       git add include/linux/generic-radix-tree.h
+       cp $(LINUX_DIR)/lib/generic-radix-tree.c linux/
+       git add linux/generic-radix-tree.c
+       cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/
+       git add include/linux/kmemleak.h
+       cp $(LINUX_DIR)/include/linux/printbuf.h include/linux/
+       git add include/linux/printbuf.h
+       cp $(LINUX_DIR)/lib/printbuf.c linux/
+       git add linux/printbuf.c
+       cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
+       git add linux/mean_and_variance.c
+       cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
+       git add include/linux/mean_and_variance.h
+       cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
+       git add linux/int_sqrt.c
        cp $(LINUX_DIR)/scripts/Makefile.compiler ./
        git add Makefile.compiler
        $(RM) libbcachefs/*.mod.c
index 86ecd2ac874c394104e2c7d2aa32735941ba8c0b..94d0d40cddb3d614facd505e46d4d83cbc7f7e10 100644 (file)
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -21,8 +21,8 @@ TMPOUT = $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_$$$$
 # automatically cleaned up.
 try-run = $(shell set -e;              \
        TMP=$(TMPOUT)/tmp;              \
-       mkdir -p $(TMPOUT);             \
        trap "rm -rf $(TMPOUT)" EXIT;   \
+       mkdir -p $(TMPOUT);             \
        if ($(1)) >/dev/null 2>&1;      \
        then echo "$(2)";               \
        else echo "$(3)";               \
index 874068c8e95d313832a725e6ba5e589d42d8ed51..d5c4e8906e793a84939daa2b35a9253abd04acb3 100644 (file)
--- a/bcachefs.8
+++ b/bcachefs.8
@@ -99,7 +99,7 @@ Format one or a list of devices with bcachefs data structures.
 You need to do this before you create a volume.
 .Pp
 Device specific options must come before corresponding devices, e.g.
-.Dl bcachefs format --group=ssd /dev/sda --label=hdd /dev/sdb
+.Dl bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb
 .Bl -tag -width Ds
 .It Fl b , Fl -block Ns = Ns Ar size
 block size, in bytes (e.g. 4k)
@@ -231,8 +231,9 @@ Force, if data redundancy will be degraded
 .El
 .It Nm Ic device Ic evacuate Ar device
 Move data off of a given device
-.It Nm Ic device Ic set-state Oo Ar options Oc Ar device Ar new-state
+.It Nm Ic device Ic set-state Oo Ar options Oc Ar new-state Ar device
 .Bl -tag -width Ds
+.It Ar  new-state Ns = Ns ( Ar rw | ro | failed | spare )
 .It Fl f , Fl -force
 Force, if data redundancy will be degraded
 .El
index 4f2cd55111a2285123eabcee7852598a7830837b..31d96287f7dc8343ef5b0f80f71cb3116ae282fc 100644 (file)
--- a/bcachefs.c
+++ b/bcachefs.c
@@ -33,6 +33,7 @@ static void usage(void)
             "Superblock commands:\n"
             "  format                   Format a new filesystem\n"
             "  show-super               Dump superblock information to stdout\n"
+            "  set-option               Set a filesystem option\n"
             "\n"
             "Repair:\n"
             "  fsck                     Check an existing filesystem for errors\n"
@@ -59,9 +60,9 @@ static void usage(void)
             "  device resize-journal    Resize journal on a device\n"
             "\n"
             "Commands for managing subvolumes and snapshots:\n"
-            "  subvolume create     Create a new subvolume\n"
-            "  subvolume delete     Delete an existing subvolume\n"
-            "  subvolume snapshot   Create a snapshot\n"
+            "  subvolume create         Create a new subvolume\n"
+            "  subvolume delete         Delete an existing subvolume\n"
+            "  subvolume snapshot       Create a snapshot\n"
             "\n"
             "Commands for managing filesystem data:\n"
             "  data rereplicate         Rereplicate degraded data\n"
@@ -199,6 +200,8 @@ int main(int argc, char *argv[])
                return cmd_version(argc, argv);
        if (!strcmp(cmd, "show-super"))
                return cmd_show_super(argc, argv);
+       if (!strcmp(cmd, "set-option"))
+               return cmd_set_option(argc, argv);
 
        if (argc < 2) {
                printf("%s: missing command\n", argv[0]);
@@ -235,6 +238,8 @@ int main(int argc, char *argv[])
                return cmd_list(argc, argv);
        if (!strcmp(cmd, "list_journal"))
                return cmd_list_journal(argc, argv);
+       if (!strcmp(cmd, "kill_btree_node"))
+               return cmd_kill_btree_node(argc, argv);
 
        if (!strcmp(cmd, "setattr"))
                return cmd_setattr(argc, argv);
diff --git a/ccan/darray/LICENSE b/ccan/darray/LICENSE
deleted file mode 100644 (file)
index 89de354..0000000
--- a/ccan/darray/LICENSE
+++ /dev/null
@@ -1,17 +0,0 @@
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/ccan/darray/_info b/ccan/darray/_info
deleted file mode 100644 (file)
index b6d5e4b..0000000
--- a/ccan/darray/_info
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "config.h"
-#include <stdio.h>
-#include <string.h>
-
-#include "ccan/darray/darray.h"
-
-/**
- * darray - Generic resizable arrays
- *
- * darray is a set of macros for managing dynamically-allocated arrays.
- * It removes the tedium of managing realloc'd arrays with pointer, size, and
- * allocated size.
- *
- * Example:
- * #include <ccan/darray/darray.h>
- * #include <stdio.h>
- * 
- * int main(void) {
- *     darray(int) numbers = darray_new();
- *     char buffer[32];
- *     
- *     for (;;) {
- *             int *i;
- *             darray_foreach(i, numbers)
- *                     printf("%d ", *i);
- *             if (darray_size(numbers) > 0)
- *                     puts("");
- *             
- *             printf("darray> ");
- *             fgets(buffer, sizeof(buffer), stdin);
- *             if (*buffer == '\0' || *buffer == '\n')
- *                     break;
- *             
- *             darray_append(numbers, atoi(buffer));
- *     }
- *     
- *     darray_free(numbers);
- *     
- *     return 0;
- * }
- *
- * Author: Joey Adams <joeyadams3.14159@gmail.com>
- * License: MIT
- * Version: 0.2
- */
-int main(int argc, char *argv[])
-{
-       if (argc != 2)
-               return 1;
-
-       if (strcmp(argv[1], "depends") == 0) {
-               /* Nothing. */
-               return 0;
-       }
-
-       return 1;
-}
diff --git a/ccan/darray/darray.h b/ccan/darray/darray.h
deleted file mode 100644 (file)
index 7511241..0000000
--- a/ccan/darray/darray.h
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef CCAN_DARRAY_H
-#define CCAN_DARRAY_H
-
-#include <stdlib.h>
-#include <string.h>
-#include "config.h"
-
-/*
- * SYNOPSIS
- *
- * Life cycle of a darray (dynamically-allocated array):
- *
- *     darray(int) a = darray_new();
- *     darray_free(a);
- *
- *     struct {darray(int) a;} foo;
- *     darray_init(foo.a);
- *     darray_free(foo.a);
- *
- * Typedefs for darrays of common types:
- *
- *     darray_char, darray_schar, darray_uchar
- *     darray_short, darray_int, darray_long
- *     darray_ushort, darray_uint, darray_ulong
- *
- * Access:
- *
- *     T      darray_item(darray(T) arr, size_t index);
- *     size_t darray_size(darray(T) arr);
- *     size_t darray_alloc(darray(T) arr);
- *     bool   darray_empty(darray(T) arr);
- *
- * Insertion (single item):
- *
- *     void   darray_append(darray(T) arr, T item);
- *     void   darray_prepend(darray(T) arr, T item);
- *     void   darray_push(darray(T) arr, T item); // same as darray_append
- *
- * Insertion (multiple items):
- *
- *     void   darray_append_items(darray(T) arr, T *items, size_t count);
- *     void   darray_prepend_items(darray(T) arr, T *items, size_t count);
- *
- *     void   darray_appends(darray(T) arr, [T item, [...]]);
- *     void   darray_prepends(darray(T) arr, [T item, [...]]);
- *
- *     // Same functionality as above, but does not require typeof.
- *     void   darray_appends_t(darray(T) arr, #T, [T item, [...]]);
- *     void   darray_prepends_t(darray(T) arr, #T, [T item, [...]]);
- *
- * Removal:
- *
- *     T      darray_pop(darray(T) arr | darray_size(arr) != 0);
- *     T*     darray_pop_check(darray(T*) arr);
- *     void   darray_remove(darray(T) arr, size_t index);
- *
- * Replacement:
- *
- *     void   darray_from_items(darray(T) arr, T *items, size_t count);
- *     void   darray_from_c(darray(T) arr, T c_array[N]);
- *
- * String buffer:
- *
- *     void   darray_append_string(darray(char) arr, const char *str);
- *     void   darray_append_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- *     void   darray_prepend_string(darray(char) arr, const char *str);
- *     void   darray_prepend_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- *     void   darray_from_string(darray(T) arr, const char *str);
- *     void   darray_from_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * Size management:
- *
- *     void   darray_resize(darray(T) arr, size_t newSize);
- *     void   darray_resize0(darray(T) arr, size_t newSize);
- *
- *     void   darray_realloc(darray(T) arr, size_t newAlloc);
- *     void   darray_growalloc(darray(T) arr, size_t newAlloc);
- *
- *     void   darray_make_room(darray(T) arr, size_t room);
- *
- * Traversal:
- *
- *     darray_foreach(T *&i, darray(T) arr) {...}
- *     darray_foreach_reverse(T *&i, darray(T) arr) {...}
- *
- * Except for darray_foreach, darray_foreach_reverse, and darray_remove,
- * all macros evaluate their non-darray arguments only once.
- */
-
-/*** Life cycle ***/
-
-#define darray(type) struct {type *item; size_t size; size_t alloc;}
-
-#define darray_new() {0,0,0}
-#define darray_init(arr) do {(arr).item=0; (arr).size=0; (arr).alloc=0;} while(0)
-#define darray_free(arr) do {free((arr).item);} while(0)
-
-
-/*
- * Typedefs for darrays of common types.  These are useful
- * when you want to pass a pointer to an darray(T) around.
- *
- * The following will produce an incompatible pointer warning:
- *
- *     void foo(darray(int) *arr);
- *     darray(int) arr = darray_new();
- *     foo(&arr);
- *
- * The workaround:
- *
- *     void foo(darray_int *arr);
- *     darray_int arr = darray_new();
- *     foo(&arr);
- */
-
-typedef darray(char)           darray_char;
-typedef darray(signed char)    darray_schar;
-typedef darray(unsigned char)  darray_uchar;
-
-typedef darray(short)          darray_short;
-typedef darray(int)            darray_int;
-typedef darray(long)           darray_long;
-
-typedef darray(unsigned short) darray_ushort;
-typedef darray(unsigned int)   darray_uint;
-typedef darray(unsigned long)  darray_ulong;
-
-
-/*** Access ***/
-
-#define darray_item(arr, i) ((arr).item[i])
-#define darray_size(arr)    ((arr).size)
-#define darray_alloc(arr)   ((arr).alloc)
-#define darray_empty(arr)   ((arr).size == 0)
-
-
-/*** Insertion (single item) ***/
-
-#define darray_append(arr, ...) do { \
-               darray_resize(arr, (arr).size+1); \
-               (arr).item[(arr).size-1] = (__VA_ARGS__); \
-       } while(0)
-#define darray_prepend(arr, ...) do { \
-               darray_resize(arr, (arr).size+1); \
-               memmove((arr).item+1, (arr).item, ((arr).size-1)*sizeof(*(arr).item)); \
-               (arr).item[0] = (__VA_ARGS__); \
-       } while(0)
-#define darray_push(arr, ...) darray_append(arr, __VA_ARGS__)
-
-
-/*** Insertion (multiple items) ***/
-
-#define darray_append_items(arr, items, count) do { \
-               size_t __count = (count), __oldSize = (arr).size; \
-               darray_resize(arr, __oldSize + __count); \
-               memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \
-       } while(0)
-
-#define darray_prepend_items(arr, items, count) do { \
-               size_t __count = (count), __oldSize = (arr).size; \
-               darray_resize(arr, __count + __oldSize); \
-               memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \
-               memcpy((arr).item, items, __count * sizeof(*(arr).item)); \
-       } while(0)
-
-#define darray_append_items_nullterminate(arr, items, count) do { \
-               size_t __count = (count), __oldSize = (arr).size; \
-               darray_resize(arr, __oldSize + __count + 1); \
-               memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \
-               (arr).item[--(arr).size] = 0; \
-       } while(0)
-
-#define darray_prepend_items_nullterminate(arr, items, count) do { \
-               size_t __count = (count), __oldSize = (arr).size; \
-               darray_resize(arr, __count + __oldSize + 1); \
-               memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \
-               memcpy((arr).item, items, __count * sizeof(*(arr).item)); \
-               (arr).item[--(arr).size] = 0; \
-       } while(0)
-
-#if HAVE_TYPEOF
-#define darray_appends(arr, ...) darray_appends_t(arr, typeof((*(arr).item)), __VA_ARGS__)
-#define darray_prepends(arr, ...) darray_prepends_t(arr, typeof((*(arr).item)), __VA_ARGS__)
-#endif
-
-#define darray_appends_t(arr, type, ...) do { \
-               type __src[] = {__VA_ARGS__}; \
-               darray_append_items(arr, __src, sizeof(__src)/sizeof(*__src)); \
-       } while(0)
-#define darray_prepends_t(arr, type, ...) do { \
-               type __src[] = {__VA_ARGS__}; \
-               darray_prepend_items(arr, __src, sizeof(__src)/sizeof(*__src)); \
-       } while(0)
-
-
-/*** Removal ***/
-
-/* Warning: Do not call darray_pop on an empty darray. */
-#define darray_pop(arr) ((arr).item[--(arr).size])
-#define darray_pop_check(arr) ((arr).size ? darray_pop(arr) : NULL)
-/* Warning, slow: Requires copying all elements after removed item. */
-#define darray_remove(arr, index) do { \
-       if (index < arr.size-1)    \
-               memmove(&(arr).item[index], &(arr).item[index+1], ((arr).size-1-i)*sizeof(*(arr).item)); \
-       (arr).size--;  \
-       } while(0)
-
-
-/*** Replacement ***/
-
-#define darray_from_items(arr, items, count) do {size_t __count = (count); darray_resize(arr, __count); memcpy((arr).item, items, __count*sizeof(*(arr).item));} while(0)
-#define darray_from_c(arr, c_array) darray_from_items(arr, c_array, sizeof(c_array)/sizeof(*(c_array)))
-
-
-/*** String buffer ***/
-
-#define darray_append_string(arr, str) do {const char *__str = (str); darray_append_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0)
-#define darray_append_lit(arr, stringLiteral) do {darray_append_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0)
-
-#define darray_prepend_string(arr, str) do { \
-               const char *__str = (str); \
-               darray_prepend_items_nullterminate(arr, __str, strlen(__str)); \
-       } while(0)
-#define darray_prepend_lit(arr, stringLiteral) \
-       darray_prepend_items_nullterminate(arr, stringLiteral, sizeof(stringLiteral) - 1)
-
-#define darray_from_string(arr, str) do {const char *__str = (str); darray_from_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0)
-#define darray_from_lit(arr, stringLiteral) do {darray_from_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0)
-
-
-/*** Size management ***/
-
-#define darray_resize(arr, newSize) darray_growalloc(arr, (arr).size = (newSize))
-#define darray_resize0(arr, newSize) do { \
-               size_t __oldSize = (arr).size, __newSize = (newSize); \
-               (arr).size = __newSize; \
-               if (__newSize > __oldSize) { \
-                       darray_growalloc(arr, __newSize); \
-                       memset(&(arr).item[__oldSize], 0, (__newSize - __oldSize) * sizeof(*(arr).item)); \
-               } \
-       } while(0)
-
-#define darray_realloc(arr, newAlloc) do { \
-               (arr).item = realloc((arr).item, ((arr).alloc = (newAlloc)) * sizeof(*(arr).item)); \
-       } while(0)
-#define darray_growalloc(arr, need) do { \
-               size_t __need = (need); \
-               if (__need > (arr).alloc) \
-                       darray_realloc(arr, darray_next_alloc((arr).alloc, __need)); \
-       } while(0)
-
-#if HAVE_STATEMENT_EXPR==1
-#define darray_make_room(arr, room) ({size_t newAlloc = (arr).size+(room); if ((arr).alloc<newAlloc) darray_realloc(arr, newAlloc); (arr).item+(arr).size; })
-#endif
-
-static inline size_t darray_next_alloc(size_t alloc, size_t need)
-{
-       if (alloc == 0)
-               alloc = 1;
-       while (alloc < need)
-               alloc *= 2;
-       return alloc;
-}
-
-
-/*** Traversal ***/
-
-/*
- * darray_foreach(T *&i, darray(T) arr) {...}
- *
- * Traverse a darray.  `i` must be declared in advance as a pointer to an item.
- */
-#define darray_foreach(i, arr) \
-       for ((i) = &(arr).item[0]; (i) < &(arr).item[(arr).size]; (i)++)
-
-/*
- * darray_foreach_reverse(T *&i, darray(T) arr) {...}
- *
- * Like darray_foreach, but traverse in reverse order.
- */
-#define darray_foreach_reverse(i, arr) \
-       for ((i) = &(arr).item[(arr).size]; (i)-- > &(arr).item[0]; )
-
-
-#endif /* CCAN_DARRAY_H */
-
-/*
-
-darray_growalloc(arr, newAlloc) sees if the darray can currently hold newAlloc items;
-       if not, it increases the alloc to satisfy this requirement, allocating slack
-       space to avoid having to reallocate for every size increment.
-
-darray_from_string(arr, str) copies a string to an darray_char.
-
-darray_push(arr, item) pushes an item to the end of the darray.
-darray_pop(arr) pops it back out.  Be sure there is at least one item in the darray before calling.
-darray_pop_check(arr) does the same as darray_pop, but returns NULL if there are no more items left in the darray.
-
-darray_make_room(arr, room) ensures there's 'room' elements of space after the end of the darray, and it returns a pointer to this space.
-Currently requires HAVE_STATEMENT_EXPR, but I plan to remove this dependency by creating an inline function.
-
-The following require HAVE_TYPEOF==1 :
-
-darray_appends(arr, item0, item1...) appends a collection of comma-delimited items to the darray.
-darray_prepends(arr, item0, item1...) prepends a collection of comma-delimited items to the darray.\
-
-
-Examples:
-
-       darray(int)  arr;
-       int        *i;
-       
-       darray_appends(arr, 0,1,2,3,4);
-       darray_appends(arr, -5,-4,-3,-2,-1);
-       darray_foreach(i, arr)
-               printf("%d ", *i);
-       printf("\n");
-       
-       darray_free(arr);
-       
-
-       typedef struct {int n,d;} Fraction;
-       darray(Fraction) fractions;
-       Fraction        *i;
-       
-       darray_appends(fractions, {3,4}, {3,5}, {2,1});
-       darray_foreach(i, fractions)
-               printf("%d/%d\n", i->n, i->d);
-       
-       darray_free(fractions);
-*/
index 736554c76383f646c6194a85f301f0c70fe8e25f..9e7f56398a5e69812506f97181e5b79931a2aab5 100644 (file)
--- a/cmd_attr.c
+++ b/cmd_attr.c
@@ -87,7 +87,7 @@ static void setattr_usage(void)
 
        bch2_opts_usage(OPT_INODE);
        puts("  -h            Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_setattr(int argc, char *argv[])
index d78598d5abb7ce7cefa478461e24351c1195376d..160eb918b8947a6e7d265860e26e7e55f91e17af 100644 (file)
--- a/cmd_data.c
+++ b/cmd_data.c
@@ -18,7 +18,7 @@ int data_usage(void)
             "  rereplicate                     Rereplicate degraded data\n"
             "  job                             Kick off low level data jobs\n"
             "\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        return 0;
 }
 
@@ -32,7 +32,7 @@ static void data_rereplicate_usage(void)
             "\n"
             "Options:\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
@@ -77,7 +77,7 @@ static void data_job_usage(void)
             "  -s inode:offset       start position\n"
             "  -e inode:offset       end position\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
index ef2dfa14654d2a3c6d5f37f48678f611b9c5fc3c..e3c5d513bf3026111ca5c49d68a1744041326172 100644 (file)
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -53,7 +53,7 @@ static void device_add_usage(void)
             "  -f, --force                 Use device even if it appears to already be formatted\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_device_add(int argc, char *argv[])
@@ -147,7 +147,7 @@ static void device_remove_usage(void)
             "  -F, --force-metadata        Force removal, even if some metadata\n"
             "                              couldn't be migrated\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
@@ -214,7 +214,7 @@ static void device_online_usage(void)
             "Options:\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_device_online(int argc, char *argv[])
@@ -251,7 +251,7 @@ static void device_offline_usage(void)
             "  -f, --force                 Force, if data redundancy will be degraded\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_device_offline(int argc, char *argv[])
@@ -295,7 +295,7 @@ static void device_evacuate_usage(void)
             "Options:\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_device_evacuate(int argc, char *argv[])
@@ -350,7 +350,7 @@ static void device_set_state_usage(void)
             "      --force-if-data-lost    Force, if data will be lost\n"
             "  -o, --offline               Set state of an offline device\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
@@ -418,9 +418,12 @@ int cmd_device_set_state(int argc, char *argv[])
 
                le64_add_cpu(&sb.sb->seq, 1);
 
-               bch2_super_write(sb.bdev->bd_fd, sb.sb);
+               bch2_super_write(sb.bdev->bd_buffered_fd, sb.sb);
+               ret = fsync(sb.bdev->bd_buffered_fd);
+               if (ret)
+                       fprintf(stderr, "error writing superblock: fsync error (%m)");
                bch2_free_super(&sb);
-               return 0;
+               return ret;
        }
 
        char *fs_path = arg_pop();
@@ -451,7 +454,7 @@ static void device_resize_usage(void)
             "\n"
             "Options:\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
@@ -559,7 +562,7 @@ static void device_resize_journal_usage(void)
             "\n"
             "Options:\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
diff --git a/cmd_dump.c b/cmd_dump.c
new file mode 100644 (file)
index 0000000..4e3d721
--- /dev/null
+++ b/cmd_dump.c
@@ -0,0 +1,182 @@
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "qcow2.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/extents.h"
+#include "libbcachefs/super.h"
+
+static void dump_usage(void)
+{
+       puts("bcachefs dump - dump filesystem metadata\n"
+            "Usage: bcachefs dump [OPTION]... <devices>\n"
+            "\n"
+            "Options:\n"
+            "  -o output     Output qcow2 image(s)\n"
+            "  -f            Force; overwrite when needed\n"
+            "  -j            Dump entire journal, not just dirty entries\n"
+            "  -h            Display this help and exit\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
+                           bool entire_journal)
+{
+       struct bch_sb *sb = ca->disk_sb.sb;
+       ranges data = { 0 };
+       unsigned i;
+       int ret;
+
+       /* Superblock: */
+       range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+                 sizeof(struct bch_sb_layout));
+
+       for (i = 0; i < sb->layout.nr_superblocks; i++)
+               range_add(&data,
+                         le64_to_cpu(sb->layout.sb_offset[i]) << 9,
+                         vstruct_bytes(sb));
+
+       /* Journal: */
+       for (i = 0; i < ca->journal.nr; i++)
+               if (entire_journal ||
+                   ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
+                       u64 bucket = ca->journal.buckets[i];
+
+                       range_add(&data,
+                                 bucket_bytes(ca) * bucket,
+                                 bucket_bytes(ca));
+               }
+
+       /* Btree: */
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               const struct bch_extent_ptr *ptr;
+               struct bkey_ptrs_c ptrs;
+               struct btree_trans trans;
+               struct btree_iter iter;
+               struct btree *b;
+
+               bch2_trans_init(&trans, c, 0, 0);
+
+               __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+                       struct btree_node_iter iter;
+                       struct bkey u;
+                       struct bkey_s_c k;
+
+                       for_each_btree_node_key_unpack(b, k, &iter, &u) {
+                               ptrs = bch2_bkey_ptrs_c(k);
+
+                               bkey_for_each_ptr(ptrs, ptr)
+                                       if (ptr->dev == ca->dev_idx)
+                                               range_add(&data,
+                                                         ptr->offset << 9,
+                                                         btree_bytes(c));
+                       }
+               }
+
+               if (ret)
+                       die("error %s walking btree nodes", strerror(-ret));
+
+               b = c->btree_roots[i].b;
+               if (!btree_node_fake(b)) {
+                       ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+
+                       bkey_for_each_ptr(ptrs, ptr)
+                               if (ptr->dev == ca->dev_idx)
+                                       range_add(&data,
+                                                 ptr->offset << 9,
+                                                 btree_bytes(c));
+               }
+
+               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_exit(&trans);
+       }
+
+       qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,
+                         max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
+       darray_exit(&data);
+}
+
+int cmd_dump(int argc, char *argv[])
+{
+       struct bch_opts opts = bch2_opts_empty();
+       struct bch_dev *ca;
+       char *out = NULL;
+       unsigned i, nr_devices = 0;
+       bool force = false, entire_journal = false;
+       int fd, opt;
+
+       opt_set(opts, nochanges,        true);
+       opt_set(opts, norecovery,       true);
+       opt_set(opts, degraded,         true);
+       opt_set(opts, errors,           BCH_ON_ERROR_continue);
+       opt_set(opts, fix_errors,       FSCK_OPT_NO);
+
+       while ((opt = getopt(argc, argv, "o:fjvh")) != -1)
+               switch (opt) {
+               case 'o':
+                       out = optarg;
+                       break;
+               case 'f':
+                       force = true;
+                       break;
+               case 'j':
+                       entire_journal = true;
+                       break;
+               case 'v':
+                       opt_set(opts, verbose, true);
+                       break;
+               case 'h':
+                       dump_usage();
+                       exit(EXIT_SUCCESS);
+               }
+       args_shift(optind);
+
+       if (!out)
+               die("Please supply output filename");
+
+       if (!argc)
+               die("Please supply device(s) to check");
+
+       struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+       if (IS_ERR(c))
+               die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
+
+       down_read(&c->gc_lock);
+
+       for_each_online_member(ca, c, i)
+               nr_devices++;
+
+       BUG_ON(!nr_devices);
+
+       for_each_online_member(ca, c, i) {
+               int flags = O_WRONLY|O_CREAT|O_TRUNC;
+
+               if (!force)
+                       flags |= O_EXCL;
+
+               if (!c->devs[i])
+                       continue;
+
+               char *path = nr_devices > 1
+                       ? mprintf("%s.%u.qcow2", out, i)
+                       : mprintf("%s.qcow2", out);
+               fd = xopen(path, flags, 0600);
+               free(path);
+
+               dump_one_device(c, ca, fd, entire_journal);
+               close(fd);
+       }
+
+       up_read(&c->gc_lock);
+
+       bch2_fs_stop(c);
+       return 0;
+}
index cc16b31fabca66991bbb7d29172303bce4c685d8..4debc285f9101a3df0e8e0c474fbce43a9d00814 100644 (file)
--- a/cmd_format.c
+++ b/cmd_format.c
 
 #include <uuid/uuid.h>
 
-#include "ccan/darray/darray.h"
-
 #include "cmds.h"
 #include "libbcachefs.h"
 #include "crypto.h"
+#include "libbcachefs/darray.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super-io.h"
 #include "libbcachefs/util.h"
@@ -46,6 +45,7 @@ x(0,  version,                required_argument)      \
 x(0,   no_initialize,          no_argument)            \
 x('f', force,                  no_argument)            \
 x('q', quiet,                  no_argument)            \
+x('v', verbose,                no_argument)            \
 x('h', help,                   no_argument)
 
 static void usage(void)
@@ -73,12 +73,13 @@ static void usage(void)
             "\n"
             "  -f, --force\n"
             "  -q, --quiet                 Only print errors\n"
+            "  -v, --verbose               Verbose filesystem initialization\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
             "Device specific options must come before corresponding devices, e.g.\n"
             "  bcachefs format --label cache /dev/sdb /dev/sdc\n"
             "\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 enum {
@@ -112,23 +113,20 @@ u64 read_flag_list_or_die(char *opt, const char * const list[],
 
 int cmd_format(int argc, char *argv[])
 {
-       darray(struct dev_opts) devices;
-       darray(char *) device_paths;
+       DARRAY(struct dev_opts) devices = { 0 };
+       DARRAY(char *) device_paths = { 0 };
        struct format_opts opts = format_opts_default();
        struct dev_opts dev_opts = dev_opts_default(), *dev;
-       bool force = false, no_passphrase = false, quiet = false, initialize = true;
+       bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false;
        unsigned v;
        int opt;
 
-       darray_init(devices);
-       darray_init(device_paths);
-
        struct bch_opt_strs fs_opt_strs =
                bch2_cmdline_opts_get(&argc, argv, OPT_FORMAT);
        struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
 
        while ((opt = getopt_long(argc, argv,
-                                 "-L:U:g:fqh",
+                                 "-L:U:g:fqhv",
                                  format_opts,
                                  NULL)) != -1)
                switch (opt) {
@@ -199,15 +197,17 @@ int cmd_format(int argc, char *argv[])
                        initialize = false;
                        break;
                case O_no_opt:
-                       darray_append(device_paths, optarg);
+                       darray_push(&device_paths, optarg);
                        dev_opts.path = optarg;
-                       darray_append(devices, dev_opts);
+                       darray_push(&devices, dev_opts);
                        dev_opts.size = 0;
                        break;
                case O_quiet:
                case 'q':
                        quiet = true;
                        break;
+               case 'v':
+                       verbose = true;
                case O_help:
                case 'h':
                        usage();
@@ -218,7 +218,7 @@ int cmd_format(int argc, char *argv[])
                        break;
                }
 
-       if (darray_empty(devices))
+       if (!devices.nr)
                die("Please supply a device");
 
        if (opts.encrypted && !no_passphrase) {
@@ -226,18 +226,26 @@ int cmd_format(int argc, char *argv[])
                initialize = false;
        }
 
-       darray_foreach(dev, devices)
+       darray_for_each(devices, dev)
                dev->fd = open_for_format(dev->path, force);
 
        struct bch_sb *sb =
                bch2_format(fs_opt_strs,
                            fs_opts,
                            opts,
-                           devices.item, darray_size(devices));
+                           devices.data, devices.nr);
        bch2_opt_strs_free(&fs_opt_strs);
 
-       if (!quiet)
-               bch2_sb_print(sb, false, 1 << BCH_SB_FIELD_members, HUMAN_READABLE);
+       if (!quiet) {
+               struct printbuf buf = PRINTBUF;
+
+               buf.human_readable_units = true;
+
+               bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members);
+               printf("%s", buf.buf);
+
+               printbuf_exit(&buf);
+       }
        free(sb);
 
        if (opts.passphrase) {
@@ -245,24 +253,29 @@ int cmd_format(int argc, char *argv[])
                free(opts.passphrase);
        }
 
-       darray_free(devices);
+       darray_exit(&devices);
 
        if (initialize) {
+               struct bch_opts mount_opts = bch2_opts_empty();
+
+
+               opt_set(mount_opts, verbose, verbose);
+
                /*
                 * Start the filesystem once, to allocate the journal and create
                 * the root directory:
                 */
-               struct bch_fs *c = bch2_fs_open(device_paths.item,
-                                               darray_size(device_paths),
-                                               bch2_opts_empty());
+               struct bch_fs *c = bch2_fs_open(device_paths.data,
+                                               device_paths.nr,
+                                               mount_opts);
                if (IS_ERR(c))
-                       die("error opening %s: %s", device_paths.item[0],
+                       die("error opening %s: %s", device_paths.data[0],
                            strerror(-PTR_ERR(c)));
 
                bch2_fs_stop(c);
        }
 
-       darray_free(device_paths);
+       darray_exit(&device_paths);
 
        return 0;
 }
@@ -276,7 +289,7 @@ static void show_super_usage(void)
             "  -f, --fields=(fields)       list of sections to print\n"
             "  -l, --layout                print superblock layout\n"
             "  -h, --help                  display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
        exit(EXIT_SUCCESS);
 }
 
@@ -325,7 +338,14 @@ int cmd_show_super(int argc, char *argv[])
        if (ret)
                die("Error opening %s: %s", dev, strerror(-ret));
 
-       bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE);
+       struct printbuf buf = PRINTBUF;
+
+       buf.human_readable_units = true;
+
+       bch2_sb_to_text(&buf, sb.sb, print_layout, fields);
+       printf("%s", buf.buf);
+
        bch2_free_super(&sb);
+       printbuf_exit(&buf);
        return 0;
 }
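The cmd_format.c hunks above are representative of the tree-wide switch from the removed ccan/darray to the in-tree libbcachefs/darray.h: darray_append()/darray_foreach()/darray_free() become darray_push()/darray_for_each()/darray_exit(), and the .item pointer plus darray_size() become the .data and .nr fields. Below is a minimal, self-contained sketch of that idiom; the DEMO_DARRAY() shape and call patterns mirror what the hunks use, but the demo_* macro bodies are simplified stand-ins written for illustration, not the real header.

#include <stdio.h>
#include <stdlib.h>

/* Same shape as the DARRAY() uses above: a .data pointer plus a .nr count. */
#define DEMO_DARRAY(type) struct { type *data; size_t nr; }

/* Stand-in for darray_push(&arr, item): grow by one and append.
 * (No realloc error handling in this demo.) */
#define demo_darray_push(arr, item) do {                                \
        (arr)->data = realloc((arr)->data,                              \
                              ((arr)->nr + 1) * sizeof(*(arr)->data));  \
        (arr)->data[(arr)->nr++] = (item);                              \
} while (0)

/* Stand-in for darray_for_each(arr, it): iterate with an element pointer. */
#define demo_darray_for_each(arr, it) \
        for ((it) = (arr).data; (it) < (arr).data + (arr).nr; (it)++)

/* Stand-in for darray_exit(&arr): free the backing storage. */
#define demo_darray_exit(arr) free((arr)->data)

int main(void)
{
        DEMO_DARRAY(const char *) device_paths = { 0 };
        const char **path;

        /* old: darray_append(device_paths, optarg);
         * new: darray_push(&device_paths, optarg); */
        demo_darray_push(&device_paths, "/dev/sda");
        demo_darray_push(&device_paths, "/dev/sdb");

        /* old: darray_foreach(path, device_paths)
         * new: darray_for_each(device_paths, path) */
        demo_darray_for_each(device_paths, path)
                printf("%s\n", *path);

        /* old: darray_free(device_paths);  new: darray_exit(&device_paths); */
        demo_darray_exit(&device_paths);
        return 0;
}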
index f8c46429af60344a448849eaf29980df33d27308..007c8d87a64fca68422f2de6f904d2221cfa97f3 100644 (file)
--- a/cmd_fs.c
+++ b/cmd_fs.c
@@ -4,66 +4,96 @@
 
 #include <uuid/uuid.h>
 
-#include "ccan/darray/darray.h"
-
 #include "linux/sort.h"
 
 #include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/darray.h"
 #include "libbcachefs/opts.h"
 
 #include "cmds.h"
 #include "libbcachefs.h"
 
-static void print_dev_usage_type(const char *type,
-                                unsigned bucket_size,
-                                u64 buckets, u64 sectors,
-                                enum units units)
+static void __dev_usage_type_to_text(struct printbuf *out,
+                                    const char *type,
+                                    unsigned bucket_size,
+                                    u64 buckets, u64 sectors, u64 frag)
 {
-       u64 frag = max((s64) buckets * bucket_size - (s64) sectors, 0LL);
+       prt_printf(out, "%s:", type);
+       prt_tab(out);
+
+       prt_units_u64(out, sectors << 9);
+       prt_tab_rjust(out);
+
+       prt_printf(out, "%llu", buckets);
+       prt_tab_rjust(out);
+
+       if (frag) {
+               prt_units_u64(out, frag << 9);
+               prt_tab_rjust(out);
+       }
+       prt_newline(out);
+}
 
-       printf_pad(20, "  %s:", type);
-       printf(" %15s %15llu %15s\n",
-              pr_units(sectors, units),
-              buckets,
-              pr_units(frag, units));
+static void dev_usage_type_to_text(struct printbuf *out,
+                                  struct bch_ioctl_dev_usage *u,
+                                  enum bch_data_type type)
+{
+       __dev_usage_type_to_text(out, bch2_data_types[type],
+                       u->bucket_size,
+                       u->d[type].buckets,
+                       u->d[type].sectors,
+                       u->d[type].fragmented);
 }
 
-static void print_dev_usage(struct bchfs_handle fs,
-                           struct dev_name *d,
-                           enum units units)
+static void dev_usage_to_text(struct printbuf *out,
+                             struct bchfs_handle fs,
+                             struct dev_name *d)
 {
        struct bch_ioctl_dev_usage u = bchu_dev_usage(fs, d->idx);
        unsigned i;
 
-       printf("\n");
-       printf_pad(20, "%s (device %u):", d->label ?: "(no label)", d->idx);
-       printf("%30s%16s\n", d->dev ?: "(device not found)", bch2_member_states[u.state]);
-
-       printf("%-20s%16s%16s%16s\n",
-              "", "data", "buckets", "fragmented");
-
-       for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++)
-               print_dev_usage_type(bch2_data_types[i],
-                                    u.bucket_size,
-                                    u.buckets[i],
-                                    u.sectors[i],
-                                    units);
-
-       print_dev_usage_type("erasure coded",
-                            u.bucket_size,
-                            u.ec_buckets,
-                            u.ec_sectors,
-                            units);
-
-       printf_pad(20, "  available:");
-       printf(" %15s %15llu\n",
-              pr_units(u.available_buckets * u.bucket_size, units),
-              u.available_buckets);
-
-       printf_pad(20, "  capacity:");
-       printf(" %15s %15llu\n",
-              pr_units(u.nr_buckets * u.bucket_size, units),
-              u.nr_buckets);
+       prt_newline(out);
+       prt_printf(out, "%s (device %u):", d->label ?: "(no label)", d->idx);
+       prt_tab(out);
+       prt_str(out, d->dev ?: "(device not found)");
+       prt_tab_rjust(out);
+
+       prt_str(out, bch2_member_states[u.state]);
+       prt_tab_rjust(out);
+
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+       prt_tab(out);
+
+       prt_str(out, "data");
+       prt_tab_rjust(out);
+
+       prt_str(out, "buckets");
+       prt_tab_rjust(out);
+
+       prt_str(out, "fragmented");
+       prt_tab_rjust(out);
+
+       prt_newline(out);
+
+       for (i = 0; i < BCH_DATA_NR; i++)
+               dev_usage_type_to_text(out, &u, i);
+       __dev_usage_type_to_text(out, "erasure coded",
+                                u.bucket_size,
+                                u.buckets_ec, u.buckets_ec * u.bucket_size, 0);
+
+       prt_str(out, "capacity:");
+       prt_tab(out);
+
+       prt_units_u64(out, (u.nr_buckets * u.bucket_size) << 9);
+       prt_tab_rjust(out);
+       prt_printf(out, "%llu", u.nr_buckets);
+       prt_tab_rjust(out);
+
+       printbuf_indent_sub(out, 2);
+
+       prt_newline(out);
 }
 
 static int dev_by_label_cmp(const void *_l, const void *_r)
@@ -81,15 +111,16 @@ static struct dev_name *dev_idx_to_name(dev_names *dev_names, unsigned idx)
 {
        struct dev_name *dev;
 
-       darray_foreach(dev, *dev_names)
+       darray_for_each(*dev_names, dev)
                if (dev->idx == idx)
                        return dev;
 
        return NULL;
 }
 
-static void print_replicas_usage(const struct bch_replicas_usage *r,
-                                dev_names *dev_names, enum units units)
+static void replicas_usage_to_text(struct printbuf *out,
+                                  const struct bch_replicas_usage *r,
+                                  dev_names *dev_names)
 {
        unsigned i;
 
@@ -113,10 +144,18 @@ static void print_replicas_usage(const struct bch_replicas_usage *r,
        *d++ = ']';
        *d++ = '\0';
 
-       printf_pad(16, "%s: ", bch2_data_types[r->r.data_type]);
-       printf_pad(16, "%u/%u ", r->r.nr_required, r->r.nr_devs);
-       printf_pad(32, "%s ", devs);
-       printf(" %s\n", pr_units(r->sectors, units));
+       prt_printf(out, "%s: ", bch2_data_types[r->r.data_type]);
+       prt_tab(out);
+
+       prt_printf(out, "%u/%u ", r->r.nr_required, r->r.nr_devs);
+       prt_tab(out);
+
+       prt_printf(out, "%s ", devs);
+       prt_tab(out);
+
+       prt_units_u64(out, r->sectors << 9);
+       prt_tab_rjust(out);
+       prt_newline(out);
 }
 
 #define for_each_usage_replica(_u, _r)                                 \
@@ -125,10 +164,9 @@ static void print_replicas_usage(const struct bch_replicas_usage *r,
             _r = replicas_usage_next(_r),                              \
             BUG_ON((void *) _r > (void *) (_u)->replicas + (_u)->replica_entries_bytes))
 
-static void print_fs_usage(const char *path, enum units units)
+static void fs_usage_to_text(struct printbuf *out, const char *path)
 {
        unsigned i;
-       char uuid[40];
 
        struct bchfs_handle fs = bcache_fs_open(path);
 
@@ -137,60 +175,102 @@ static void print_fs_usage(const char *path, enum units units)
 
        struct bch_ioctl_fs_usage *u = bchu_fs_usage(fs);
 
-       uuid_unparse(fs.uuid.b, uuid);
-       printf("Filesystem %s:\n", uuid);
+       prt_str(out, "Filesystem: ");
+       pr_uuid(out, fs.uuid.b);
+       prt_newline(out);
+
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 20);
+       printbuf_tabstop_push(out, 16);
 
-       printf("%-20s%12s\n", "Size:", pr_units(u->capacity, units));
-       printf("%-20s%12s\n", "Used:", pr_units(u->used, units));
+       prt_str(out, "Size:");
+       prt_tab(out);
+       prt_units_u64(out, u->capacity << 9);
+       prt_tab_rjust(out);
+       prt_newline(out);
 
-       printf("%-20s%12s\n", "Online reserved:", pr_units(u->online_reserved, units));
+       prt_str(out, "Used:");
+       prt_tab(out);
+       prt_units_u64(out, u->used << 9);
+       prt_tab_rjust(out);
+       prt_newline(out);
 
-       printf("\n");
-       printf("%-16s%-16s%s\n", "Data type", "Required/total", "Devices");
+       prt_str(out, "Online reserved:");
+       prt_tab(out);
+       prt_units_u64(out, u->online_reserved << 9);
+       prt_tab_rjust(out);
+       prt_newline(out);
+
+       prt_newline(out);
+
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 16);
+       printbuf_tabstop_push(out, 16);
+       printbuf_tabstop_push(out, 18);
+       printbuf_tabstop_push(out, 18);
+
+       prt_str(out, "Data type");
+       prt_tab(out);
+
+       prt_str(out, "Required/total");
+       prt_tab(out);
+
+       prt_str(out, "Devices");
+       prt_newline(out);
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                if (!u->persistent_reserved[i])
                        continue;
 
-               printf_pad(16, "%s: ", "reserved");
-               printf_pad(16, "%u/%u ", 1, i);
-               printf_pad(32, "[] ");
-               printf("%s\n", pr_units(u->persistent_reserved[i], units));
+               prt_str(out, "reserved:");
+               prt_tab(out);
+               prt_printf(out, "%u/%u ", 1, i);
+               prt_tab(out);
+               prt_str(out, "[] ");
+               prt_units_u64(out, u->persistent_reserved[i] << 9);
+               prt_tab_rjust(out);
+               prt_newline(out);
        }
 
        struct bch_replicas_usage *r;
 
        for_each_usage_replica(u, r)
                if (r->r.data_type < BCH_DATA_user)
-                       print_replicas_usage(r, &dev_names, units);
+                       replicas_usage_to_text(out, r, &dev_names);
 
        for_each_usage_replica(u, r)
                if (r->r.data_type == BCH_DATA_user &&
                    r->r.nr_required <= 1)
-                       print_replicas_usage(r, &dev_names, units);
+                       replicas_usage_to_text(out, r, &dev_names);
 
        for_each_usage_replica(u, r)
                if (r->r.data_type == BCH_DATA_user &&
                    r->r.nr_required > 1)
-                       print_replicas_usage(r, &dev_names, units);
+                       replicas_usage_to_text(out, r, &dev_names);
 
        for_each_usage_replica(u, r)
                if (r->r.data_type > BCH_DATA_user)
-                       print_replicas_usage(r, &dev_names, units);
+                       replicas_usage_to_text(out, r, &dev_names);
 
        free(u);
 
-       sort(&darray_item(dev_names, 0), darray_size(dev_names),
-            sizeof(darray_item(dev_names, 0)), dev_by_label_cmp, NULL);
+       sort(dev_names.data, dev_names.nr,
+            sizeof(dev_names.data[0]), dev_by_label_cmp, NULL);
+
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 16);
+       printbuf_tabstop_push(out, 20);
+       printbuf_tabstop_push(out, 16);
+       printbuf_tabstop_push(out, 14);
 
-       darray_foreach(dev, dev_names)
-               print_dev_usage(fs, dev, units);
+       darray_for_each(dev_names, dev)
+               dev_usage_to_text(out, fs, dev);
 
-       darray_foreach(dev, dev_names) {
+       darray_for_each(dev_names, dev) {
                free(dev->dev);
                free(dev->label);
        }
-       darray_free(dev_names);
+       darray_exit(&dev_names);
 
        bcache_fs_close(fs);
 }
@@ -209,24 +289,33 @@ int fs_usage(void)
 
 int cmd_fs_usage(int argc, char *argv[])
 {
-       enum units units = BYTES;
+       bool human_readable = false;
+       struct printbuf buf = PRINTBUF;
        char *fs;
        int opt;
 
        while ((opt = getopt(argc, argv, "h")) != -1)
                switch (opt) {
                case 'h':
-                       units = HUMAN_READABLE;
+                       human_readable = true;
                        break;
                }
        args_shift(optind);
 
        if (!argc) {
-               print_fs_usage(".", units);
+               printbuf_reset(&buf);
+               buf.human_readable_units = human_readable;
+               fs_usage_to_text(&buf, ".");
+               printf("%s", buf.buf);
        } else {
-               while ((fs = arg_pop()))
-                       print_fs_usage(fs, units);
+               while ((fs = arg_pop())) {
+                       printbuf_reset(&buf);
+                       buf.human_readable_units = human_readable;
+                       fs_usage_to_text(&buf, fs);
+                       printf("%s", buf.buf);
+               }
        }
 
+       printbuf_exit(&buf);
        return 0;
 }
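The cmd_fs.c hunks above drop the printf()-based column formatting in favour of printbuf tabstops: push the column widths once, then emit label / tab / right-justified value per row. A minimal sketch of that row pattern, using only the helpers visible in this diff (the helper name usage_row is illustrative, not from the patch):

	#include <linux/printbuf.h>
	#include <linux/types.h>

	/* One row of fs_usage_to_text() output: the value is a sector count,
	 * converted to bytes and right-justified at the next tabstop. */
	static void usage_row(struct printbuf *out, const char *name, u64 sectors)
	{
		prt_str(out, name);
		prt_tab(out);
		prt_units_u64(out, sectors << 9);
		prt_tab_rjust(out);
		prt_newline(out);
	}
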
index 6052cb0061762b8ed8d6a1f4c56dfc8c7e54ee41..63b0541cb15a038ff879d5873d9ca0f311f8636f 100644 (file)
--- a/cmd_key.c
+++ b/cmd_key.c
@@ -14,20 +14,26 @@ static void unlock_usage(void)
             "\n"
             "Options:\n"
             "  -c                     Check if a device is encrypted\n"
+            "  -k (session|user|user_session)\n"
+            "                         Keyring to add to (default: user)\n"
             "  -h                     Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_unlock(int argc, char *argv[])
 {
+       const char *keyring = "user";
        bool check = false;
        int opt;
 
-       while ((opt = getopt(argc, argv, "ch")) != -1)
+       while ((opt = getopt(argc, argv, "ck:h")) != -1)
                switch (opt) {
                case 'c':
                        check = true;
                        break;
+               case 'k':
+                       keyring = strdup(optarg);
+                       break;
                case 'h':
                        unlock_usage();
                        exit(EXIT_SUCCESS);
@@ -59,7 +65,7 @@ int cmd_unlock(int argc, char *argv[])
 
        char *passphrase = read_passphrase("Enter passphrase: ");
 
-       bch2_add_key(sb.sb, passphrase);
+       bch2_add_key(sb.sb, "user", keyring, passphrase);
 
        bch2_free_super(&sb);
        memzero_explicit(passphrase, strlen(passphrase));
similarity index 57%
rename from cmd_debug.c
rename to cmd_list.c
index 6ff58a96642bc2cabaee6f5ed2224b3a2db19616..382153da9d43b39fef1221450715c32e1756ccc1 100644 (file)
 #include "tools-util.h"
 
 #include "libbcachefs/bcachefs.h"
-#include "libbcachefs/bset.h"
 #include "libbcachefs/btree_cache.h"
 #include "libbcachefs/btree_io.h"
 #include "libbcachefs/btree_iter.h"
-#include "libbcachefs/buckets.h"
 #include "libbcachefs/checksum.h"
 #include "libbcachefs/error.h"
-#include "libbcachefs/journal.h"
-#include "libbcachefs/journal_io.h"
+#include "libbcachefs/extents.h"
 #include "libbcachefs/super.h"
 
-static void dump_usage(void)
-{
-       puts("bcachefs dump - dump filesystem metadata\n"
-            "Usage: bcachefs dump [OPTION]... <devices>\n"
-            "\n"
-            "Options:\n"
-            "  -o output     Output qcow2 image(s)\n"
-            "  -f            Force; overwrite when needed\n"
-            "  -h            Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
-{
-       struct bch_sb *sb = ca->disk_sb.sb;
-       ranges data;
-       unsigned i;
-       int ret;
-
-       darray_init(data);
-
-       /* Superblock: */
-       range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
-                 sizeof(struct bch_sb_layout));
-
-       for (i = 0; i < sb->layout.nr_superblocks; i++)
-               range_add(&data,
-                         le64_to_cpu(sb->layout.sb_offset[i]) << 9,
-                         vstruct_bytes(sb));
-
-       /* Journal: */
-       for (i = 0; i < ca->journal.nr; i++)
-               if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
-                       u64 bucket = ca->journal.buckets[i];
-
-                       range_add(&data,
-                                 bucket_bytes(ca) * bucket,
-                                 bucket_bytes(ca));
-               }
-
-       /* Btree: */
-       for (i = 0; i < BTREE_ID_NR; i++) {
-               const struct bch_extent_ptr *ptr;
-               struct bkey_ptrs_c ptrs;
-               struct btree_trans trans;
-               struct btree_iter iter;
-               struct btree *b;
-
-               bch2_trans_init(&trans, c, 0, 0);
-
-               __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
-                       struct btree_node_iter iter;
-                       struct bkey u;
-                       struct bkey_s_c k;
-
-                       for_each_btree_node_key_unpack(b, k, &iter, &u) {
-                               ptrs = bch2_bkey_ptrs_c(k);
-
-                               bkey_for_each_ptr(ptrs, ptr)
-                                       if (ptr->dev == ca->dev_idx)
-                                               range_add(&data,
-                                                         ptr->offset << 9,
-                                                         btree_bytes(c));
-                       }
-               }
-
-               if (ret)
-                       die("error %s walking btree nodes", strerror(-ret));
-
-               b = c->btree_roots[i].b;
-               if (!btree_node_fake(b)) {
-                       ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
-
-                       bkey_for_each_ptr(ptrs, ptr)
-                               if (ptr->dev == ca->dev_idx)
-                                       range_add(&data,
-                                                 ptr->offset << 9,
-                                                 btree_bytes(c));
-               }
-
-               bch2_trans_iter_exit(&trans, &iter);
-               bch2_trans_exit(&trans);
-       }
-
-       qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
-                         max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
-       darray_free(data);
-}
-
-int cmd_dump(int argc, char *argv[])
-{
-       struct bch_opts opts = bch2_opts_empty();
-       struct bch_dev *ca;
-       char *out = NULL;
-       unsigned i, nr_devices = 0;
-       bool force = false;
-       int fd, opt;
-
-       opt_set(opts, nochanges,        true);
-       opt_set(opts, norecovery,       true);
-       opt_set(opts, degraded,         true);
-       opt_set(opts, errors,           BCH_ON_ERROR_continue);
-       opt_set(opts, fix_errors,       FSCK_OPT_NO);
-
-       while ((opt = getopt(argc, argv, "o:fvh")) != -1)
-               switch (opt) {
-               case 'o':
-                       out = optarg;
-                       break;
-               case 'f':
-                       force = true;
-                       break;
-               case 'v':
-                       opt_set(opts, verbose, true);
-                       break;
-               case 'h':
-                       dump_usage();
-                       exit(EXIT_SUCCESS);
-               }
-       args_shift(optind);
-
-       if (!out)
-               die("Please supply output filename");
-
-       if (!argc)
-               die("Please supply device(s) to check");
-
-       struct bch_fs *c = bch2_fs_open(argv, argc, opts);
-       if (IS_ERR(c))
-               die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
-       down_read(&c->gc_lock);
-
-       for_each_online_member(ca, c, i)
-               nr_devices++;
-
-       BUG_ON(!nr_devices);
-
-       for_each_online_member(ca, c, i) {
-               int flags = O_WRONLY|O_CREAT|O_TRUNC;
-
-               if (!force)
-                       flags |= O_EXCL;
-
-               if (!c->devs[i])
-                       continue;
-
-               char *path = nr_devices > 1
-                       ? mprintf("%s.%u", out, i)
-                       : strdup(out);
-               fd = xopen(path, flags, 0600);
-               free(path);
-
-               dump_one_device(c, ca, fd);
-               close(fd);
-       }
-
-       up_read(&c->gc_lock);
-
-       bch2_fs_stop(c);
-       return 0;
-}
-
 static void list_keys(struct bch_fs *c, enum btree_id btree_id,
                      struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       char buf[512];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -200,12 +34,15 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
                if (bkey_cmp(k.k->p, end) > 0)
                        break;
 
-               bch2_bkey_val_to_text(&PBUF(buf), c, k);
-               puts(buf);
+               printbuf_reset(&buf);
+               bch2_bkey_val_to_text(&buf, c, k);
+               puts(buf.buf);
        }
        bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
+
+       printbuf_exit(&buf);
 }
 
 static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigned level,
@@ -214,7 +51,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne
        struct btree_trans trans;
        struct btree_iter iter;
        struct btree *b;
-       char buf[4096];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -223,8 +60,9 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
-               bch2_btree_node_to_text(&PBUF(buf), c, b);
-               puts(buf);
+               printbuf_reset(&buf);
+               bch2_btree_node_to_text(&buf, c, b);
+               puts(buf.buf);
        }
        bch2_trans_iter_exit(&trans, &iter);
 
@@ -232,6 +70,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne
                die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
+       printbuf_exit(&buf);
 }
 
 static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
@@ -240,7 +79,7 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
        struct btree_trans trans;
        struct btree_iter iter;
        struct btree *b;
-       char buf[4096];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -249,8 +88,9 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
-               fputs(buf, stdout);
+               printbuf_reset(&buf);
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+               fputs(buf.buf, stdout);
                putchar('\n');
        }
        bch2_trans_iter_exit(&trans, &iter);
@@ -259,6 +99,7 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
                die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
+       printbuf_exit(&buf);
 }
 
 static void print_node_ondisk(struct bch_fs *c, struct btree *b)
@@ -268,6 +109,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
        struct bch_dev *ca;
        struct bio *bio;
        unsigned offset = 0;
+       int ret;
 
        if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
                printf("error getting device to read from\n");
@@ -280,17 +122,19 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
                return;
        }
 
-       n_ondisk = malloc(btree_bytes(c));
+       n_ondisk = aligned_alloc(block_bytes(c), btree_bytes(c));
 
-       bio = bio_alloc_bioset(GFP_NOIO,
-                       buf_pages(n_ondisk, btree_bytes(c)),
-                       &c->btree_bio);
-       bio_set_dev(bio, ca->disk_sb.bdev);
-       bio->bi_opf             = REQ_OP_READ|REQ_META;
+       bio = bio_alloc_bioset(ca->disk_sb.bdev,
+                              buf_pages(n_ondisk, btree_bytes(c)),
+                              REQ_OP_READ|REQ_META,
+                              GFP_NOIO,
+                              &c->btree_bio);
        bio->bi_iter.bi_sector  = pick.ptr.offset;
        bch2_bio_map(bio, n_ondisk, btree_bytes(c));
 
-       submit_bio_wait(bio);
+       ret = submit_bio_wait(bio);
+       if (ret)
+               die("error reading btree node: %i", ret);
 
        bio_put(bio);
        percpu_ref_put(&ca->io_ref);
@@ -306,7 +150,8 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
                        i = &n_ondisk->keys;
 
                        if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
-                               die("unknown checksum type");
+                               die("unknown checksum type at offset %u: %llu",
+                                   offset, BSET_CSUM_TYPE(i));
 
                        nonce = btree_nonce(i, offset << 9);
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
@@ -326,7 +171,8 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
                                break;
 
                        if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
-                               die("unknown checksum type");
+                               die("unknown checksum type at offset %u: %llu",
+                                   offset, BSET_CSUM_TYPE(i));
 
                        nonce = btree_nonce(i, offset << 9);
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
@@ -347,10 +193,14 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
 
                for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
                        struct bkey u;
-                       char buf[4096];
+                       struct printbuf buf = PRINTBUF;
+
+                       printbuf_indent_add(&buf, 4);
+
+                       bch2_bkey_val_to_text(&buf, c, bkey_disassemble(b, k, &u));
+                       fprintf(stdout, "%s\n", buf.buf);
 
-                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_disassemble(b, k, &u));
-                       fprintf(stdout, "    %s\n", buf);
+                       printbuf_exit(&buf);
                }
        }
 
@@ -363,7 +213,7 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
        struct btree_trans trans;
        struct btree_iter iter;
        struct btree *b;
-       char buf[4096];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -372,8 +222,9 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
-               fputs(buf, stdout);
+               printbuf_reset(&buf);
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+               fputs(buf.buf, stdout);
                putchar('\n');
 
                print_node_ondisk(c, b);
@@ -384,6 +235,7 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
                die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
+       printbuf_exit(&buf);
 }
 
 static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned level,
@@ -395,7 +247,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
        struct bkey unpacked;
        struct bkey_s_c k;
        struct btree *b;
-       char buf[4096];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -404,13 +256,15 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
-               bch2_btree_node_to_text(&PBUF(buf), c, b);
-               fputs(buf, stdout);
+               printbuf_reset(&buf);
+               bch2_btree_node_to_text(&buf, c, b);
+               fputs(buf.buf, stdout);
 
                for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) {
-                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       printbuf_reset(&buf);
+                       bch2_bkey_val_to_text(&buf, c, k);
                        putchar('\t');
-                       puts(buf);
+                       puts(buf.buf);
                }
        }
        bch2_trans_iter_exit(&trans, &iter);
@@ -419,6 +273,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
                die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
+       printbuf_exit(&buf);
 }
 
 static void list_keys_usage(void)
@@ -437,7 +292,7 @@ static void list_keys_usage(void)
             "  -f                                    Check (fsck) the filesystem first\n"
             "  -v                                    Verbose mode\n"
             "  -h                                    Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 #define LIST_MODES()           \
@@ -551,70 +406,3 @@ int cmd_list(int argc, char *argv[])
        bch2_fs_stop(c);
        return 0;
 }
-
-static void list_journal_usage(void)
-{
-       puts("bcachefs list_journal - print contents of journal\n"
-            "Usage: bcachefs list_journal [OPTION]... <devices>\n"
-            "\n"
-            "Options:\n"
-            "  -a            Read entire journal, not just dirty entries\n"
-            "  -h            Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_list_journal(int argc, char *argv[])
-{
-       struct bch_opts opts = bch2_opts_empty();
-       int opt;
-
-       opt_set(opts, nochanges,        true);
-       opt_set(opts, norecovery,       true);
-       opt_set(opts, degraded,         true);
-       opt_set(opts, errors,           BCH_ON_ERROR_continue);
-       opt_set(opts, fix_errors,       FSCK_OPT_YES);
-       opt_set(opts, keep_journal,     true);
-
-       while ((opt = getopt(argc, argv, "ah")) != -1)
-               switch (opt) {
-               case 'a':
-                       opt_set(opts, read_entire_journal, true);
-                       break;
-               case 'h':
-                       list_journal_usage();
-                       exit(EXIT_SUCCESS);
-               }
-       args_shift(optind);
-
-       if (!argc)
-               die("Please supply device(s) to open");
-
-       struct bch_fs *c = bch2_fs_open(argv, argc, opts);
-       if (IS_ERR(c))
-               die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
-       struct journal_replay *p;
-       struct jset_entry *entry;
-
-       list_for_each_entry(p, &c->journal_entries, list) {
-               printf("journal entry   %8llu\n"
-                      "    version     %8u\n"
-                      "    last seq    %8llu\n"
-                      ,
-                      le64_to_cpu(p->j.seq),
-                      le32_to_cpu(p->j.version),
-                      le64_to_cpu(p->j.last_seq));
-
-               vstruct_for_each(&p->j, entry) {
-                       char _buf[4096];
-                       struct printbuf buf = PBUF(_buf);
-
-                       printbuf_indent_push(&buf, 2);
-                       bch2_journal_entry_to_text(&buf, c, entry);
-                       printf("%s\n", _buf);
-               }
-       }
-
-       bch2_fs_stop(c);
-       return 0;
-}
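The cmd_list.c conversion above replaces fixed-size char buffers and PBUF() with heap-allocated printbufs, and the same shape repeats in every listing function. A sketch of the pattern, where for_each_thing() and thing_to_text() are placeholders rather than real helpers:

	struct printbuf buf = PRINTBUF;		/* empty, heap-backed buffer */

	for_each_thing(thing) {			/* placeholder iterator */
		printbuf_reset(&buf);		/* reuse the allocation each pass */
		thing_to_text(&buf, thing);	/* placeholder *_to_text() helper */
		puts(buf.buf);
	}

	printbuf_exit(&buf);			/* free once, after the loop */
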
diff --git a/cmd_list_journal.c b/cmd_list_journal.c
new file mode 100644 (file)
index 0000000..869d334
--- /dev/null
@@ -0,0 +1,246 @@
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "qcow2.h"
+#include "tools-util.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/journal_io.h"
+#include "libbcachefs/journal_seq_blacklist.h"
+#include "libbcachefs/super.h"
+
+static void list_journal_usage(void)
+{
+       puts("bcachefs list_journal - print contents of journal\n"
+            "Usage: bcachefs list_journal [OPTION]... <devices>\n"
+            "\n"
+            "Options:\n"
+            "  -a            Read entire journal, not just dirty entries\n"
+            "  -n            Number of journal entries to print, starting from the most recent\n"
+            "  -v            Verbose mode\n"
+            "  -h            Display this help and exit\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void star_start_of_lines(char *buf)
+{
+       char *p = buf;
+
+       if (*p == ' ')
+               *p = '*';
+
+       while ((p = strstr(p, "\n ")))
+               p[1] = '*';
+}
+
+int cmd_list_journal(int argc, char *argv[])
+{
+       struct bch_opts opts = bch2_opts_empty();
+       u32 nr_entries = U32_MAX;
+       int opt;
+
+       opt_set(opts, nochanges,        true);
+       opt_set(opts, norecovery,       true);
+       opt_set(opts, degraded,         true);
+       opt_set(opts, errors,           BCH_ON_ERROR_continue);
+       opt_set(opts, fix_errors,       FSCK_OPT_YES);
+       opt_set(opts, keep_journal,     true);
+       opt_set(opts, read_journal_only,true);
+
+       while ((opt = getopt(argc, argv, "an:vh")) != -1)
+               switch (opt) {
+               case 'a':
+                       opt_set(opts, read_entire_journal, true);
+                       break;
+               case 'n':
+                       if (kstrtouint(optarg, 10, &nr_entries))
+                               die("invalid nr_entries: %s", optarg);
+                       opt_set(opts, read_entire_journal, true);
+                       break;
+               case 'v':
+                       opt_set(opts, verbose, true);
+                       break;
+               case 'h':
+                       list_journal_usage();
+                       exit(EXIT_SUCCESS);
+               }
+       args_shift(optind);
+
+       if (!argc)
+               die("Please supply device(s) to open");
+
+       struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+       if (IS_ERR(c))
+               die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
+
+       struct journal_replay *p, **_p;
+       struct genradix_iter iter;
+       struct jset_entry *entry;
+       struct printbuf buf = PRINTBUF;
+
+       genradix_for_each(&c->journal_entries, iter, _p) {
+               p = *_p;
+               if (!p)
+                       continue;
+
+               if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq))
+                       continue;
+
+               bool blacklisted =
+                       bch2_journal_seq_is_blacklisted(c,
+                                       le64_to_cpu(p->j.seq), false);
+
+               if (blacklisted)
+                       printf("blacklisted ");
+
+               printf("journal entry       %llu\n", le64_to_cpu(p->j.seq));
+
+               printbuf_reset(&buf);
+
+               prt_printf(&buf,
+                      "  version         %u\n"
+                      "  last seq        %llu\n"
+                      "  flush           %u\n"
+                      "  written at      ",
+                      le32_to_cpu(p->j.version),
+                      le64_to_cpu(p->j.last_seq),
+                      !JSET_NO_FLUSH(&p->j));
+               bch2_journal_ptrs_to_text(&buf, c, p);
+
+               if (blacklisted)
+                       star_start_of_lines(buf.buf);
+               printf("%s\n", buf.buf);
+
+               vstruct_for_each(&p->j, entry) {
+                       printbuf_reset(&buf);
+
+                       /*
+                        * log entries denote the start of a new transaction
+                        * commit:
+                        */
+                       if (entry->type == BCH_JSET_ENTRY_log && !entry->level)
+                               prt_newline(&buf);
+                       printbuf_indent_add(&buf, 4);
+                       bch2_journal_entry_to_text(&buf, c, entry);
+
+                       if (blacklisted)
+                               star_start_of_lines(buf.buf);
+                       printf("%s\n", buf.buf);
+               }
+       }
+
+       printbuf_exit(&buf);
+       bch2_fs_stop(c);
+       return 0;
+}
+
+static void kill_btree_node_usage(void)
+{
+       puts("bcachefs kill_btree_node - make btree nodes unreadable\n"
+            "Usage: bcachefs kill_btree_node [OPTION]... <devices>\n"
+            "\n"
+            "Options:\n"
+            "  -b (extents|inodes|dirents|xattrs)    Btree to delete from\n"
+            "  -l level                              Level to delete from (0 == leaves)\n"
+            "  -i index                              Index of btree node to kill\n"
+            "  -h                                    Display this help and exit\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_kill_btree_node(int argc, char *argv[])
+{
+       struct bch_opts opts = bch2_opts_empty();
+       enum btree_id btree_id = 0;
+       unsigned level = 0;
+       u64 node_index = 0;
+       int opt;
+
+       opt_set(opts, read_only,        true);
+
+       while ((opt = getopt(argc, argv, "b:l:i:h")) != -1)
+               switch (opt) {
+               case 'b':
+                       btree_id = read_string_list_or_die(optarg,
+                                               bch2_btree_ids, "btree id");
+                       break;
+               case 'l':
+                       if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH)
+                               die("invalid level");
+                       break;
+               case 'i':
+                       if (kstrtoull(optarg, 10, &node_index))
+                               die("invalid index %s", optarg);
+                       break;
+               case 'h':
+                       kill_btree_node_usage();
+                       exit(EXIT_SUCCESS);
+               }
+       args_shift(optind);
+
+       if (!argc)
+               die("Please supply device(s)");
+
+       struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+       if (IS_ERR(c))
+               die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
+
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct btree *b;
+       int ret;
+       void *zeroes;
+
+       ret = posix_memalign(&zeroes, c->opts.block_size, c->opts.block_size);
+       if (ret)
+               die("error %s from posix_memalign", strerror(ret));
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
+               if (b->c.level != level)
+                       continue;
+
+               if (!node_index) {
+                       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+                       const struct bch_extent_ptr *ptr;
+
+                       struct printbuf buf = PRINTBUF;
+
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+                       bch_info(c, "killing btree node %s", buf.buf);
+                       printbuf_exit(&buf);
+
+                       bkey_for_each_ptr(ptrs, ptr) {
+                               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+                               ret = pwrite(ca->disk_sb.bdev->bd_fd, zeroes,
+                                            c->opts.block_size, ptr->offset << 9);
+                               if (ret != c->opts.block_size) {
+                                       bch_err(c, "pwrite error: expected %u got %i %s",
+                                               c->opts.block_size, ret, strerror(errno));
+                                       ret = EXIT_FAILURE;
+                                       goto done;
+                               }
+                       }
+                       goto done;
+               }
+
+               node_index--;
+       }
+       if (ret)
+               bch_err(c, "error %i walking btree nodes", ret);
+       else
+               bch_err(c, "node at specified index not found");
+       ret = EXIT_FAILURE;
+done:
+       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_exit(&trans);
+
+       bch2_fs_stop(c);
+       return ret;
+}
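In cmd_list_journal() above, -n limits output to the newest entries by comparing each entry's sequence number with the filesystem's current journal sequence. Pulled out as a sketch, assuming c->journal.seq holds the newest sequence read:

	/* True if this entry is within nr_entries of the newest journal entry. */
	static bool in_last_n(struct bch_fs *c, struct jset *j, u32 nr_entries)
	{
		return le64_to_cpu(j->seq) + nr_entries >=
			atomic64_read(&c->journal.seq);
	}
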
index 4da3ab1b58c6e0125f68c24e1771db4335d6b244..3ba51c0c2ebd5d280adc7062c9d96e28473157e1 100644 (file)
@@ -122,7 +122,7 @@ static void update_inode(struct bch_fs *c,
        struct bkey_inode_buf packed;
        int ret;
 
-       bch2_inode_pack(c, &packed, inode);
+       bch2_inode_pack(&packed, inode);
        packed.inode.k.p.snapshot = U32_MAX;
        ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
                                NULL, NULL, 0);
@@ -257,7 +257,7 @@ static void write_data(struct bch_fs *c,
 
        closure_init_stack(&cl);
 
-       bio_init(&op.wbio.bio, bv, ARRAY_SIZE(bv));
+       bio_init(&op.wbio.bio, NULL, bv, ARRAY_SIZE(bv), 0);
        bch2_bio_map(&op.wbio.bio, buf, len);
 
        bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts));
@@ -530,7 +530,7 @@ static ranges reserve_new_fs_space(const char *file_path, unsigned block_size,
 
        struct fiemap_iter iter;
        struct fiemap_extent e;
-       ranges extents = { NULL };
+       ranges extents = { 0 };
 
        fiemap_for_each(fd, iter, e) {
                if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
@@ -603,7 +603,7 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
 
        update_inode(c, &root_inode);
 
-       darray_free(s.extents);
+       darray_exit(&s.extents);
        genradix_free(&s.hardlinks);
 }
 
@@ -613,7 +613,7 @@ static void find_superblock_space(ranges extents,
 {
        struct range *i;
 
-       darray_foreach(i, extents) {
+       darray_for_each(extents, i) {
                u64 start = round_up(max(256ULL << 10, i->start),
                                     dev->bucket_size << 9);
                u64 end = round_down(i->end,
@@ -641,7 +641,7 @@ static void migrate_usage(void)
             "      --no_passphrase    Don't encrypt master encryption key\n"
             "  -F                     Force, even if metadata file already exists\n"
             "  -h                     Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 static const struct option migrate_opts[] = {
@@ -691,7 +691,7 @@ static int migrate_fs(const char            *fs_path,
        u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
 
        if (format_opts.passphrase)
-               bch2_add_key(sb, format_opts.passphrase);
+               bch2_add_key(sb, "user", "user", format_opts.passphrase);
 
        free(sb);
 
@@ -799,7 +799,7 @@ static void migrate_superblock_usage(void)
             "  -d device     Device to create superblock for\n"
             "  -o offset     Offset of existing superblock\n"
             "  -h            Display this help and exit\n"
-            "Report bugs to <linux-bcache@vger.kernel.org>");
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
 int cmd_migrate_superblock(int argc, char *argv[])
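cmd_migrate.c picks up the same darray rename seen in cmd_fs.c: darray_foreach()/darray_free() become darray_for_each()/darray_exit(), with the container passed by name to the iterator and by pointer on exit. A sketch of iterating a ranges darray, assuming struct range still carries u64 start/end fields as used in find_superblock_space():

	ranges extents = { 0 };
	struct range *i;

	/* ... extents filled in via fiemap_for_each()/range_add() as above ... */

	darray_for_each(extents, i)
		printf("extent %llu..%llu\n",
		       (unsigned long long) i->start,
		       (unsigned long long) i->end);

	darray_exit(&extents);
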
diff --git a/cmd_option.c b/cmd_option.c
new file mode 100644 (file)
index 0000000..86768e5
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * Authors: Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * GPLv2
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/super-io.h"
+
+static void set_option_usage(void)
+{
+       puts("bcachefs set-option - set a filesystem option\n"
+            "Usage: bcachefs set-option [OPTION]... device\n"
+            "\n"
+            "Options:\n");
+       bch2_opts_usage(OPT_MOUNT);
+       puts("  -h, --help                  display this help and exit\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+       exit(EXIT_SUCCESS);
+}
+
+int cmd_set_option(int argc, char *argv[])
+{
+       struct bch_opt_strs new_opt_strs = bch2_cmdline_opts_get(&argc, argv, OPT_MOUNT);
+       struct bch_opts new_opts = bch2_parse_opts(new_opt_strs);
+       struct bch_opts open_opts = bch2_opts_empty();
+       unsigned i;
+       int opt, ret = 0;
+
+       opt_set(open_opts, nostart, true);
+
+       while ((opt = getopt(argc, argv, "h")) != -1)
+               switch (opt) {
+               case 'h':
+                       set_option_usage();
+                       break;
+               }
+       args_shift(optind);
+
+       if (!argc) {
+               fprintf(stderr, "Please supply device(s)\n");
+               exit(EXIT_FAILURE);
+       }
+
+       for (i = 0; i < argc; i++)
+               if (dev_mounted(argv[i]))
+                       goto online;
+
+       struct bch_fs *c = bch2_fs_open(argv, argc, open_opts);
+       if (IS_ERR(c)) {
+               fprintf(stderr, "error opening %s: %s\n", argv[0], strerror(-PTR_ERR(c)));
+               exit(EXIT_FAILURE);
+       }
+
+       for (i = 0; i < bch2_opts_nr; i++) {
+               u64 v = bch2_opt_get_by_id(&new_opts, i);
+
+               if (!bch2_opt_defined_by_id(&new_opts, i))
+                       continue;
+
+               ret = bch2_opt_check_may_set(c, i, v);
+               if (ret < 0) {
+                       fprintf(stderr, "error setting %s: %i\n",
+                               bch2_opt_table[i].attr.name, ret);
+                       break;
+               }
+
+               bch2_opt_set_sb(c, bch2_opt_table + i, v);
+               bch2_opt_set_by_id(&c->opts, i, v);
+       }
+
+       bch2_fs_stop(c);
+       return ret;
+online:
+       {
+               unsigned dev_idx;
+               struct bchfs_handle fs = bchu_fs_open_by_dev(argv[i], &dev_idx);
+
+               for (i = 0; i < bch2_opts_nr; i++) {
+                       if (!new_opt_strs.by_id[i])
+                               continue;
+
+                       char *path = mprintf("options/%s", bch2_opt_table[i].attr.name);
+
+                       write_file_str(fs.sysfs_fd, path, new_opt_strs.by_id[i]);
+                       free(path);
+               }
+       }
+       return 0;
+}
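cmd_set_option() takes two paths: an offline filesystem is opened with nostart and each option is written into the superblock via bch2_opt_set_sb(); if any device is mounted, the option string is written to sysfs instead. A sketch of that online write, mirroring the code at the end of the function (set_opt_online is an illustrative name, not part of the patch):

	/* Set one mount option on a mounted filesystem through sysfs. */
	static void set_opt_online(struct bchfs_handle fs,
				   const char *name, const char *val)
	{
		char *path = mprintf("options/%s", name);

		write_file_str(fs.sysfs_fd, path, val);
		free(path);
	}
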
diff --git a/cmds.h b/cmds.h
index 52db63f3040fc332812d8992fc3f88efcbd6cecb..c18a87fd6de86031a345b311688f5e40dc0b6cde 100644 (file)
--- a/cmds.h
+++ b/cmds.h
@@ -11,6 +11,7 @@
 
 int cmd_format(int argc, char *argv[]);
 int cmd_show_super(int argc, char *argv[]);
+int cmd_set_option(int argc, char *argv[]);
 
 #if 0
 int cmd_assemble(int argc, char *argv[]);
@@ -45,6 +46,7 @@ int cmd_fsck(int argc, char *argv[]);
 int cmd_dump(int argc, char *argv[]);
 int cmd_list(int argc, char *argv[]);
 int cmd_list_journal(int argc, char *argv[]);
+int cmd_kill_btree_node(int argc, char *argv[]);
 
 int cmd_migrate(int argc, char *argv[]);
 int cmd_migrate_superblock(int argc, char *argv[]);
index 43753a3e8902e019371d5258b37681ff320211b2..4e4d15a90fe90d7fdbf051cd122c85353c162f69 100644 (file)
--- a/crypto.c
+++ b/crypto.c
@@ -133,10 +133,23 @@ void bch2_passphrase_check(struct bch_sb *sb, const char *passphrase,
                die("incorrect passphrase");
 }
 
-void bch2_add_key(struct bch_sb *sb, const char *passphrase)
+void bch2_add_key(struct bch_sb *sb,
+                 const char *type,
+                 const char *keyring_str,
+                 const char *passphrase)
 {
        struct bch_key passphrase_key;
        struct bch_encrypted_key sb_key;
+       int keyring;
+
+       if (!strcmp(keyring_str, "session"))
+               keyring = KEY_SPEC_SESSION_KEYRING;
+       else if (!strcmp(keyring_str, "user"))
+               keyring = KEY_SPEC_USER_KEYRING;
+       else if (!strcmp(keyring_str, "user_session"))
+               keyring = KEY_SPEC_USER_SESSION_KEYRING;
+       else
+               die("unknown keyring %s", keyring_str);
 
        bch2_passphrase_check(sb, passphrase,
                              &passphrase_key,
@@ -147,12 +160,10 @@ void bch2_add_key(struct bch_sb *sb, const char *passphrase)
 
        char *description = mprintf("bcachefs:%s", uuid);
 
-       if (add_key("logon", description,
-                   &passphrase_key, sizeof(passphrase_key),
-                   KEY_SPEC_USER_KEYRING) < 0 ||
-           add_key("user", description,
+       if (add_key(type,
+                   description,
                    &passphrase_key, sizeof(passphrase_key),
-                   KEY_SPEC_USER_KEYRING) < 0)
+                   keyring) < 0)
                die("add_key error: %m");
 
        memzero_explicit(description, strlen(description));
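bch2_add_key() now takes the key type and destination keyring explicitly, so the single add_key(2) call above can target whichever keyring the caller selected; cmd_unlock passes its -k argument and cmd_migrate keeps the old behaviour with "user", "user". A hedged call-site sketch for the unlock path:

	/* Add the passphrase-derived key as a "user" key to the session keyring. */
	char *passphrase = read_passphrase("Enter passphrase: ");

	bch2_add_key(sb.sb, "user", "session", passphrase);

	memzero_explicit(passphrase, strlen(passphrase));
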
index 7f523c057cbbf3c9ad6a0b8a57b16b2296d95c52..baea6d86e84a9f2005fb309afa7df59868c5ef06 100644 (file)
--- a/crypto.h
+++ b/crypto.h
@@ -15,7 +15,7 @@ struct bch_key derive_passphrase(struct bch_sb_field_crypt *, const char *);
 bool bch2_sb_is_encrypted(struct bch_sb *);
 void bch2_passphrase_check(struct bch_sb *, const char *,
                           struct bch_key *, struct bch_encrypted_key *);
-void bch2_add_key(struct bch_sb *, const char *);
+void bch2_add_key(struct bch_sb *, const char *, const char *, const char *);
 void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
                       const char *);
 
diff --git a/debian/bcachefs-tools.postinst b/debian/bcachefs-tools.postinst
new file mode 100644 (file)
index 0000000..483b961
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+set -e
+
+case "$1" in
+    configure)
+       if which update-initramfs >/dev/null; then
+           update-initramfs -u
+       fi
+    ;;
+esac
+
diff --git a/debian/bcachefs-tools.postrm b/debian/bcachefs-tools.postrm
new file mode 100644 (file)
index 0000000..6b6fe8a
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+set -e
+
+case "$1" in
+    remove)
+       if which update-initramfs >/dev/null; then
+           update-initramfs -u
+       fi
+    ;;
+esac
+
index 3cb088260aa8438a82faf6136e719ae8755148d3..a5c17b2a2b4171add4de1da48aa64bb9049532df 100644 (file)
@@ -1,3 +1,10 @@
+bcachefs-tools (23-1) unstable; urgency=medium
+
+  * New upstream release
+  * Update standards version to 4.6.1
+
+ -- Jonathan Carter <jcc@debian.org>  Mon, 31 Oct 2022 11:45:25 +0200
+
 bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium
 
   * New upstream snapshot
index 3a9d3aa627f78af899f909b969d681e0a042c909..0ece55345dc699118fe9a43f0c45413dcc2e0745 100644 (file)
@@ -2,7 +2,7 @@ Source: bcachefs-tools
 Maintainer: Jonathan Carter <jcc@debian.org>
 Section: utils
 Priority: optional
-Standards-Version: 4.6.0
+Standards-Version: 4.6.1
 Rules-Requires-Root: no
 Build-Depends: debhelper-compat (= 13),
                pkg-config,
index 2ea4bfc7b4cc3a829312a1b02edbcc8bc32b511a..1af54c83d794911c5606292af19d114fe87f87a9 100644 (file)
@@ -1 +1 @@
-bcachefs-tools_0.1+git20220216.a1e928a-1_source.buildinfo utils optional
+bcachefs-tools_23-1_source.buildinfo utils optional
index 48f2aa93bacabffcd38f70ad59fda0ce64391157..a693194ed1cb1752a0c2da1d2b8191138f2c0313 100644 (file)
@@ -1,6 +1,5 @@
 { lib
-, filter
-
+, doCheck ? true
 , stdenv
 , pkg-config
 , attr
@@ -20,8 +19,7 @@
 , docutils
 , nixosTests
 
-, lastModified
-, versionString ? lastModified
+, versionString ? "0.1"
 
 , inShell ? false
 , debugMode ? inShell
@@ -39,20 +37,8 @@ stdenv.mkDerivation {
 
        version = "v0.1-flake-${versionString}";
        VERSION = "v0.1-flake-${versionString}";
-       
-       src = filter.filter {
-               name = "bcachefs-tools";
-               root = ./.;
-               exclude = [
-                       ./rust-src
-                       
-                       ./.git
-                       ./nix
-                       
-                       ./flake.nix
-                       ./flake.lock
-               ];
-       };
+
+       src = (lib.cleanSource (builtins.path { name = "bcachefs-tools-src"; path = ./. ;} ));
 
        postPatch = "patchShebangs --build doc/macro2rst.py";
 
@@ -95,7 +81,7 @@ stdenv.mkDerivation {
                "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools"
        ];
 
-       doCheck = true; # needs bcachefs module loaded on builder
+       doCheck = doCheck; # needs bcachefs module loaded on builder
 
        checkInputs = [
                python39Packages.pytest
@@ -116,7 +102,7 @@ stdenv.mkDerivation {
                        rm tests/test_fuse.py
                '';
 
-       dontStrip = debugMode == true;
+       dontStrip = debugMode;
        passthru = {
                bcachefs_revision = let 
                        file = builtins.readFile ./.bcachefs_revision;
index cdbbcb390984d6a336396fbe473ba4c8eb214c34..0ad5a87dc8e893037fafd4b216cbc58518c2a390 100644 (file)
@@ -212,23 +212,19 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
 
 struct bio_set {
        unsigned int front_pad;
+       unsigned int back_pad;
+       mempool_t bio_pool;
+       mempool_t bvec_pool;
 };
 
-static inline void bioset_exit(struct bio_set *bs) {}
 
 static inline void bioset_free(struct bio_set *bs)
 {
        kfree(bs);
 }
 
-static inline int bioset_init(struct bio_set *bs,
-                             unsigned pool_size,
-                             unsigned front_pad,
-                             int flags)
-{
-       bs->front_pad = front_pad;
-       return 0;
-}
+void bioset_exit(struct bio_set *);
+int bioset_init(struct bio_set *, unsigned, unsigned, int);
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
@@ -237,31 +233,22 @@ enum {
        BIOSET_NEED_RESCUER     = 1 << 1,
 };
 
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+struct bio *bio_alloc_bioset(struct block_device *, unsigned,
+                            unsigned, gfp_t, struct bio_set *);
 extern void bio_put(struct bio *);
 
 int bio_add_page(struct bio *, struct page *, unsigned, unsigned);
 
-extern void __bio_clone_fast(struct bio *, struct bio *);
-extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
-
-static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-       return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
-       return bio_clone_bioset(bio, gfp_mask, NULL);
+struct bio *bio_alloc_clone(struct block_device *, struct bio *,
+                           gfp_t, struct bio_set *);
 
-}
+struct bio *bio_kmalloc(unsigned int, gfp_t);
 
 extern void bio_endio(struct bio *);
 
 extern void bio_advance(struct bio *, unsigned);
 
-extern void bio_reset(struct bio *);
+extern void bio_reset(struct bio *, struct block_device *, unsigned);
 void bio_chain(struct bio *, struct bio *);
 
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
@@ -428,20 +415,15 @@ static inline void bio_inc_remaining(struct bio *bio)
        atomic_inc(&bio->__bi_remaining);
 }
 
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-       return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
-static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
-{
-       return bio_clone_bioset(bio, gfp_mask, NULL);
-}
-
-static inline void bio_init(struct bio *bio, struct bio_vec *table,
-             unsigned short max_vecs)
+static inline void bio_init(struct bio *bio,
+                           struct block_device *bdev,
+                           struct bio_vec *table,
+                           unsigned short max_vecs,
+                           unsigned int opf)
 {
        memset(bio, 0, sizeof(*bio));
+       bio->bi_bdev = bdev;
+       bio->bi_opf = opf;
        atomic_set(&bio->__bi_remaining, 1);
        atomic_set(&bio->__bi_cnt, 1);
 
index 2fe736e95b86cc333b50019e049c0490891ddfbb..62a3f4040a73a6ea812f7b72bd691a4f38143de4 100644 (file)
@@ -137,6 +137,11 @@ static inline unsigned long hweight64(u64 w)
               __builtin_popcount(w >> 32);
 }
 
+static inline unsigned long hweight32(u32 w)
+{
+       return __builtin_popcount(w);
+}
+
 static inline unsigned long hweight8(unsigned long w)
 {
        return __builtin_popcountl(w);
index be736c8c70edd32decfc5e0222e0448212051dae..22bae25fb3442c544eb56b94fa600c39f6df5373 100644 (file)
@@ -40,6 +40,7 @@ struct block_device {
        struct gendisk          __bd_disk;
        int                     bd_fd;
        int                     bd_sync_fd;
+       int                     bd_buffered_fd;
 };
 
 #define bdev_kobj(_bdev) (&((_bdev)->kobj))
@@ -65,6 +66,8 @@ typedef u8 __bitwise blk_status_t;
 
 #define BLK_STS_AGAIN          ((__force blk_status_t)12)
 
+#define BIO_INLINE_VECS 4
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
index 4ce43b5cc629b6e4150b12eb7b61f3f281834816..01b3d4adda15f92c9b1484b7057ba2f4b9509278 100644 (file)
@@ -69,8 +69,7 @@ static inline void submit_bio(struct bio *bio)
        generic_make_request(bio);
 }
 
-int blkdev_issue_discard(struct block_device *, sector_t,
-                        sector_t, gfp_t, unsigned long);
+int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t);
 
 #define bdev_get_queue(bdev)           (&((bdev)->queue))
 
@@ -85,7 +84,7 @@ int blkdev_issue_discard(struct block_device *, sector_t,
 #define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
 #define SECTOR_MASK            (PAGE_SECTORS - 1)
 
-#define blk_queue_discard(q)           ((void) (q), 0)
+#define bdev_max_discard_sectors(bdev) ((void) (bdev), 0)
 #define blk_queue_nonrot(q)            ((void) (q), 0)
 
 unsigned bdev_logical_block_size(struct block_device *bdev);
index 77260f37cacd70d62899ddf08bc5bae2a8b8cc8f..1a10f7e66144cb28db87a10f17df9dd7ec1ffa12 100644 (file)
@@ -2,6 +2,7 @@
 #define __TOOLS_LINUX_BUG_H
 
 #include <assert.h>
+#include <stdio.h>
 #include <linux/compiler.h>
 
 #ifdef CONFIG_VALGRIND
@@ -17,7 +18,7 @@
 
 #define BUILD_BUG_ON(cond)     ((void)sizeof(char[1 - 2*!!(cond)]))
 
-#define BUG()                  do { assert(0); unreachable(); } while (0)
+#define BUG()                  do { fflush(stdout); assert(0); unreachable(); } while (0)
 #define BUG_ON(cond)           assert(!(cond))
 
 #define WARN(cond, fmt, ...)                                           \
diff --git a/include/linux/errname.h b/include/linux/errname.h
new file mode 100644 (file)
index 0000000..443d504
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _LINUX_ERRNAME_H
+#define _LINUX_ERRNAME_H
+
+#include <string.h>
+
+static inline const char *errname(int err)
+{
+       return strerror(abs(err));
+}
+
+#endif /* _LINUX_ERRNAME_H */
index a29d1565cfc5e2763e98b52e50b536d922300a81..cf485d78ed59eb05aa48a2e42c5fb1c04cb3418a 100644 (file)
@@ -4,6 +4,7 @@
 #define try_to_freeze()
 #define set_freezable()
 #define freezing(task)         false
-#define freezable_schedule_timeout(_t) schedule_timeout(_t);
+#define freezable_schedule()   schedule()
+#define freezable_schedule_timeout(_t) schedule_timeout(_t)
 
 #endif /* __TOOLS_LINUX_FREEZER_H */
index f09689dafb008114f9199167f6d18918d0129a9d..c74b7376990d53301bfbd55acb414dbb491d8184 100644 (file)
@@ -2,7 +2,7 @@
 #define _LINUX_GENERIC_RADIX_TREE_H
 
 /**
- * DOC: Generic radix trees/sparse arrays:
+ * DOC: Generic radix trees/sparse arrays
  *
  * Very simple and minimalistic, supporting arbitrary size entries up to
  * PAGE_SIZE.
 
 #include <asm/page.h>
 #include <linux/bug.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
 #include <linux/log2.h>
+#include <linux/math.h>
+#include <linux/types.h>
 
 struct genradix_root;
 
 struct __genradix {
-       struct genradix_root __rcu      *root;
+       struct genradix_root            *root;
 };
 
 /*
@@ -115,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 
 #define __genradix_cast(_radix)                (typeof((_radix)->type[0]) *)
 #define __genradix_obj_size(_radix)    sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix)                       \
+       (PAGE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix)                      \
+       (PAGE_SIZE % sizeof((_radix)->type[0]))
+
 #define __genradix_idx_to_offset(_radix, _idx)                 \
        __idx_to_offset(_idx, __genradix_obj_size(_radix))
 
@@ -178,14 +185,30 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 #define genradix_iter_peek(_iter, _radix)                      \
        (__genradix_cast(_radix)                                \
         __genradix_iter_peek(_iter, &(_radix)->tree,           \
-                             PAGE_SIZE / __genradix_obj_size(_radix)))
+                       __genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+                               size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ *                          position
+ * @_iter:     a genradix_iter
+ * @_radix:    genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix)                 \
+       (__genradix_cast(_radix)                                \
+        __genradix_iter_peek_prev(_iter, &(_radix)->tree,      \
+                       __genradix_objs_per_page(_radix),       \
+                       __genradix_obj_size(_radix) +           \
+                       __genradix_page_remainder(_radix)))
 
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
                                           size_t obj_size)
 {
-       size_t new_offset = iter->offset + obj_size;
-
-       if (new_offset < iter->offset) {
+       if (iter->offset + obj_size < iter->offset) {
                iter->offset    = SIZE_MAX;
                iter->pos       = SIZE_MAX;
                return;
@@ -203,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_iter_advance(_iter, _radix)                   \
        __genradix_iter_advance(_iter, __genradix_obj_size(_radix))
 
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+                                         size_t obj_size)
+{
+       if (iter->offset == 0 ||
+           iter->offset == SIZE_MAX) {
+               iter->offset = SIZE_MAX;
+               return;
+       }
+
+       if ((iter->offset & (PAGE_SIZE - 1)) == 0)
+               iter->offset -= PAGE_SIZE % obj_size;
+
+       iter->offset -= obj_size;
+       iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix)                    \
+       __genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
 #define genradix_for_each_from(_radix, _iter, _p, _start)      \
        for (_iter = genradix_iter_init(_radix, _start);        \
             (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
@@ -220,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_for_each(_radix, _iter, _p)                   \
        genradix_for_each_from(_radix, _iter, _p, 0)
 
+#define genradix_last_pos(_radix)                              \
+       (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over entries in a genradix, in reverse order
+ * @_radix:    genradix to iterate over
+ * @_iter:     a genradix_iter to track current position
+ * @_p:                pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p)           \
+       for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\
+            (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+            genradix_iter_rewind(&_iter, _radix))
+
 int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
 
 /**
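generic-radix-tree.h gains a reverse iterator built from genradix_iter_peek_prev() and genradix_iter_rewind(). A minimal usage sketch, assuming the GENRADIX()/genradix_iter declarations already provided by this header (not shown in the hunk):

	GENRADIX(u64) values;
	struct genradix_iter iter;
	u64 *p;

	/* Walk from the highest populated index down to index 0. */
	genradix_for_each_reverse(&values, iter, p)
		printf("pos %zu: %llu\n", iter.pos, (unsigned long long) *p);
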
index fe928265ce8b9624caddc8e59457e24d45765946..4fd3b68d4cfe63cca48e8b27c01310c47735157b 100644 (file)
@@ -43,6 +43,8 @@
        (time_after_eq64(a, b) && \
         time_before_eq64(a, c))
 
+#define time_is_before_jiffies(a) time_after(jiffies, a)
+
 #define HZ             1000
 
 static inline u64 jiffies_to_nsecs(const unsigned long j)
@@ -79,6 +81,11 @@ static inline u64 local_clock(void)
        return sched_clock();
 }
 
+static inline u64 ktime_get_ns(void)
+{
+       return sched_clock();
+}
+
 #define jiffies                        nsecs_to_jiffies(sched_clock())
 
 #endif
index 30451cb94073d7fa6e998be44b4864737bce53b3..d31b5f5622c255923635086b477e7c2375ce0927 100644 (file)
@@ -228,6 +228,17 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *
        return kstrtoint(s, base, res);
 }
 
+struct printbuf;
+extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args);
+extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...);
+
+static const char hex_asc[] = "0123456789abcdef";
+#define hex_asc_lo(x)  hex_asc[((x) & 0x0f)]
+#define hex_asc_hi(x)  hex_asc[((x) & 0xf0) >> 4]
+static const char hex_asc_upper[] = "0123456789ABCDEF";
+#define hex_asc_upper_lo(x)    hex_asc_upper[((x) & 0x0f)]
+#define hex_asc_upper_hi(x)    hex_asc_upper[((x) & 0xf0) >> 4]
+
 /* The hash is always the low bits of hash_len */
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  #define HASH_LEN_DECLARE u32 hash; u32 len
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
new file mode 100644 (file)
index 0000000..6a3cd1b
--- /dev/null
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * include/linux/kmemleak.h
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ */
+
+#ifndef __KMEMLEAK_H
+#define __KMEMLEAK_H
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+
+extern void kmemleak_init(void) __init;
+extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+                          gfp_t gfp) __ref;
+extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+                                 gfp_t gfp) __ref;
+extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size,
+                            gfp_t gfp) __ref;
+extern void kmemleak_free(const void *ptr) __ref;
+extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
+extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
+extern void kmemleak_update_trace(const void *ptr) __ref;
+extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_ignore(const void *ptr) __ref;
+extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
+extern void kmemleak_no_scan(const void *ptr) __ref;
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
+                               gfp_t gfp) __ref;
+extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
+extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
+
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+                                           int min_count, slab_flags_t flags,
+                                           gfp_t gfp)
+{
+       if (!(flags & SLAB_NOLEAKTRACE))
+               kmemleak_alloc(ptr, size, min_count, gfp);
+}
+
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
+{
+       if (!(flags & SLAB_NOLEAKTRACE))
+               kmemleak_free(ptr);
+}
+
+static inline void kmemleak_erase(void **ptr)
+{
+       *ptr = NULL;
+}
+
+#else
+
+static inline void kmemleak_init(void)
+{
+}
+static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+                                 gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+                                           int min_count, slab_flags_t flags,
+                                           gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+                                        gfp_t gfp)
+{
+}
+static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size,
+                                   gfp_t gfp)
+{
+}
+static inline void kmemleak_free(const void *ptr)
+{
+}
+static inline void kmemleak_free_part(const void *ptr, size_t size)
+{
+}
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
+{
+}
+static inline void kmemleak_free_percpu(const void __percpu *ptr)
+{
+}
+static inline void kmemleak_update_trace(const void *ptr)
+{
+}
+static inline void kmemleak_not_leak(const void *ptr)
+{
+}
+static inline void kmemleak_ignore(const void *ptr)
+{
+}
+static inline void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+}
+static inline void kmemleak_erase(void **ptr)
+{
+}
+static inline void kmemleak_no_scan(const void *ptr)
+{
+}
+static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
+                                      gfp_t gfp)
+{
+}
+static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
+{
+}
+static inline void kmemleak_ignore_phys(phys_addr_t phys)
+{
+}
+
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#endif /* __KMEMLEAK_H */
index c7362d630de6a1d68a9790734e3d5780414297a6..c33b21267e21f75ad763706ce9782c76c7ae90f9 100644 (file)
@@ -29,7 +29,7 @@ struct kset;
 struct kobj_type {
        void (*release)(struct kobject *kobj);
        const struct sysfs_ops *sysfs_ops;
-       struct attribute **default_attrs;
+       const struct attribute_group **default_groups;
        const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
        const void *(*namespace)(struct kobject *kobj);
 };
@@ -48,7 +48,7 @@ struct kobj_attribute {
 struct kobject {
        struct kobject          *parent;
        struct kset             *kset;
-       struct kobj_type        *ktype;
+       const struct kobj_type  *ktype;
        struct kernfs_node      *sd; /* sysfs directory entry */
        atomic_t                ref;
        unsigned int state_initialized:1;
@@ -64,7 +64,7 @@ struct kset {
 
 #define kobject_add(...)       0
 
-static inline void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
+static inline void kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
 {
        memset(kobj, 0, sizeof(*kobj));
 
@@ -77,7 +77,7 @@ static inline void kobject_del(struct kobject *kobj);
 
 static inline void kobject_cleanup(struct kobject *kobj)
 {
-       struct kobj_type *t = kobj->ktype;
+       const struct kobj_type *t = kobj->ktype;
 
        /* remove from sysfs if the caller did not do it */
        if (kobj->state_in_sysfs)
index 3639dc997ed43de7ca1359ae3dc5dee299937083..dcc4745f8d2a4cca213a947c835899af92727352 100644 (file)
@@ -10,6 +10,7 @@
 #define list_add(n, h)                 cds_list_add(n, h)
 #define list_add_tail(n, h)            cds_list_add_tail(n, h)
 #define __list_del_entry(l)            cds_list_del(l)
+#define __list_del(p, n)               __cds_list_del(p, n)
 #define list_del(l)                    cds_list_del(l)
 #define list_del_init(l)               cds_list_del_init(l)
 #define list_replace(o, n)             cds_list_replace(o, n)
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
new file mode 100644 (file)
index 0000000..3d62abe
--- /dev/null
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math64.h>
+#include <linux/printbuf.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+typedef unsigned __int128 u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+       return (u128)a;
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+       return (u64)a;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+       return (u64)(a >> 64);
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+       return a + b;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+       return a - b;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+       return i << shift;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+       return ((u128)a << 64) + b;
+}
+
+static inline u128 u128_square(u64 i)
+{
+       return i*i;
+}
+
+#else
+
+typedef struct {
+       u64 hi, lo;
+} u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+       return (u128){ .lo = a };
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+       return a.lo;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+       return a.hi;
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+       u128 c;
+
+       c.lo = a.lo + b.lo;
+       c.hi = a.hi + b.hi + (c.lo < a.lo);
+       return c;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+       u128 c;
+
+       c.lo = a.lo - b.lo;
+       c.hi = a.hi - b.hi - (c.lo > a.lo);
+       return c;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+       u128 r;
+
+       r.lo = i.lo << shift;
+       if (shift < 64)
+               r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+       else {
+               r.hi = i.lo << (shift - 64);
+               r.lo = 0;
+       }
+       return r;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+       return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
+}
+
+static inline u128 u128_square(u64 i)
+{
+       u128 r;
+       u64  h = i >> 32, l = i & (u64)U32_MAX;
+
+       r =             u128_shl(u64_to_u128(h*h), 64);
+       r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+       r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+       r = u128_add(r,          u64_to_u128(l*l));
+       return r;
+}
+
+#endif
+
+static inline u128 u128_div(u128 n, u64 d)
+{
+       u128 r;
+       u64 rem;
+       u64 hi = u128_shr64_to_u64(n);
+       u64 lo = u128_to_u64(n);
+       u64  h =  hi & ((u64)U32_MAX  << 32);
+       u64  l = (hi &  (u64)U32_MAX) << 32;
+
+       r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
+       r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
+       r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+       return r;
+}
+
+struct mean_and_variance {
+       s64 n;
+       s64 sum;
+       u128 sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+       bool init;
+       u8 w;
+       s64 mean;
+       u64 variance;
+};
+
+inline s64 fast_divpow2(s64 n, u8 d);
+
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
+       s64              mean_and_variance_get_mean(struct mean_and_variance s);
+       u64              mean_and_variance_get_variance(struct mean_and_variance s1);
+       u32              mean_and_variance_get_stddev(struct mean_and_variance s);
+
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
+       s64                       mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+       u64                       mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+       u32                       mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+
+#endif // MEAN_AND_VARIANCE_H_
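A hedged sketch of how the running-statistics API declared above might be used; the sample framing and helper names are assumptions, not part of this change:

	/* Accumulate samples into a zero-initialized struct mean_and_variance,
	 * then print the mean and standard deviation via a printbuf. */
	static void record_sample(struct mean_and_variance *stats, s64 v)
	{
		*stats = mean_and_variance_update(*stats, v);
	}

	static void stats_to_text(struct printbuf *out, struct mean_and_variance stats)
	{
		prt_printf(out, "mean %lli stddev %u",
			   mean_and_variance_get_mean(stats),
			   mean_and_variance_get_stddev(stats));
	}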
diff --git a/include/linux/mm.h b/include/linux/mm.h
new file mode 100644 (file)
index 0000000..4bf80ba
--- /dev/null
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_MM_H
+#define _TOOLS_LINUX_MM_H
+
+#include <linux/types.h>
+
+struct sysinfo {
+       long uptime;            /* Seconds since boot */
+       unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
+       unsigned long totalram; /* Total usable main memory size */
+       unsigned long freeram;  /* Available memory size */
+       unsigned long sharedram;        /* Amount of shared memory */
+       unsigned long bufferram;        /* Memory used by buffers */
+       unsigned long totalswap;        /* Total swap space size */
+       unsigned long freeswap; /* swap space still available */
+       __u16 procs;                    /* Number of current processes */
+       __u16 pad;                      /* Explicit padding for m68k */
+       unsigned long totalhigh;        /* Total high memory size */
+       unsigned long freehigh; /* Available high memory size */
+       __u32 mem_unit;                 /* Memory unit size in bytes */
+};
+
+extern void si_meminfo(struct sysinfo * val);
+
+#endif /* _TOOLS_LINUX_MM_H */
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
new file mode 100644 (file)
index 0000000..6f177cd
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef _LINUX_PRANDOM_H
+#define _LINUX_PRANDOM_H
+
+#include <linux/random.h>
+
+static inline void prandom_bytes(void *buf, int nbytes)
+{
+       return get_random_bytes(buf, nbytes);
+}
+
+#define prandom_type(type)                             \
+static inline type prandom_##type(void)                        \
+{                                                      \
+       type v;                                         \
+                                                       \
+       prandom_bytes(&v, sizeof(v));                   \
+       return v;                                       \
+}
+
+prandom_type(int);
+prandom_type(long);
+prandom_type(u32);
+prandom_type(u64);
+#undef prandom_type
+
+#endif /* _LINUX_PRANDOM_H */
+
index 13cb826d1fa27ace5bd14e14c016742dd740ef49..b14fbe93664983f632ebe73c469ccda675ea4d42 100644 (file)
@@ -4,4 +4,7 @@
 #define prefetch(p)    \
        ({ __maybe_unused typeof(p) __var = (p); })
 
+#define prefetchw(p)   \
+       ({ __maybe_unused typeof(p) __var = (p); })
+
 #endif /* _LINUX_PREFETCH_H */
diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h
new file mode 100644 (file)
index 0000000..f39d8ed
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _LINUX_PRETTY_PRINTERS_H
+#define _LINUX_PRETTY_PRINTERS_H
+
+void prt_string_option(struct printbuf *, const char * const[], size_t);
+void prt_bitflags(struct printbuf *, const char * const[], u64);
+
+#endif /* _LINUX_PRETTY_PRINTERS_H */
diff --git a/include/linux/printbuf.h b/include/linux/printbuf.h
new file mode 100644 (file)
index 0000000..24e62e5
--- /dev/null
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _LINUX_PRINTBUF_H
+#define _LINUX_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ *   struct printbuf buf = PRINTBUF;
+ *
+ *   prt_printf(&buf, "foo=");
+ *   foo_to_text(&buf, foo);
+ *   printk("%s", buf.buf);
+ *   printbuf_exit(&buf);
+ *
+ * Or
+ *   struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid in writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent
+ * level, respectively.
+ *
+ * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
+ * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
+ * prt_tab_rjust() will also advance the current line of text up to the next
+ * tabstop, but it does so by shifting text since the previous tabstop up to the
+ * next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for indent
+ * level and tabstops to work correctly.
+ *
+ * Output units: printbuf->si_units and ->human_readable_units tell
+ * pretty-printers how to output numbers: a raw value (e.g. directly from a
+ * superblock field), as bytes, or as human readable bytes. prt_units_u64() and
+ * prt_units_s64() obey them.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+       PRINTBUF_UNITS_2,       /* use binary powers of 2^10 */
+       PRINTBUF_UNITS_10,      /* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS       4
+
+struct printbuf {
+       char                    *buf;
+       unsigned                size;
+       unsigned                pos;
+       unsigned                last_newline;
+       unsigned                last_field;
+       unsigned                indent;
+       /*
+        * If nonzero, allocations will be done with GFP_ATOMIC:
+        */
+       u8                      atomic;
+       bool                    allocation_failure:1;
+       bool                    heap_allocated:1;
+       enum printbuf_si        si_units:1;
+       bool                    human_readable_units:1;
+       bool                    has_indent_or_tabstops:1;
+       bool                    suppress_indent_tabstop_handling:1;
+       u8                      nr_tabstops;
+
+       /*
+	 * Do not modify directly: use printbuf_tabstop_push(),
+	 * printbuf_tabstops_reset()
+        */
+       u8                      cur_tabstop;
+       u8                      _tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int printbuf_make_room(struct printbuf *, unsigned);
+const char *printbuf_str(const struct printbuf *);
+void printbuf_exit(struct printbuf *);
+
+void printbuf_tabstops_reset(struct printbuf *);
+void printbuf_tabstop_pop(struct printbuf *);
+int printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void printbuf_indent_add(struct printbuf *, unsigned);
+void printbuf_indent_sub(struct printbuf *, unsigned);
+
+void prt_newline(struct printbuf *);
+void prt_tab(struct printbuf *);
+void prt_tab_rjust(struct printbuf *);
+
+void prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void prt_human_readable_u64(struct printbuf *, u64);
+void prt_human_readable_s64(struct printbuf *, s64);
+void prt_units_u64(struct printbuf *, u64);
+void prt_units_s64(struct printbuf *, s64);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer for a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size)                   \
+((struct printbuf) {                                   \
+       .buf    = _buf,                                 \
+       .size   = _size,                                \
+})
+
+/*
+ * Returns size remaining of output buffer:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+       return out->pos < out->size ? out->size - out->pos : 0;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+       return out->pos < out->size ? out->size - out->pos - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+       return out->size ? min(out->pos, out->size - 1) : 0;
+}
+
+/*
+ * Returns true if output was truncated:
+ */
+static inline bool printbuf_overflowed(struct printbuf *out)
+{
+       return out->pos >= out->size;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+       printbuf_make_room(out, 1);
+
+       if (out->pos < out->size)
+               out->buf[out->pos] = 0;
+       else if (out->size)
+               out->buf[out->size - 1] = 0;
+}
+
+/* Doesn't call printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+       if (printbuf_remaining(out))
+               out->buf[out->pos] = c;
+       out->pos++;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+       printbuf_make_room(out, 1);
+       __prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+       __prt_char(out, c);
+       printbuf_nul_terminate(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+       unsigned i, can_print = min(n, printbuf_remaining(out));
+
+       for (i = 0; i < can_print; i++)
+               out->buf[out->pos++] = c;
+       out->pos += n - can_print;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+       printbuf_make_room(out, n);
+       __prt_chars_reserved(out, c, n);
+       printbuf_nul_terminate(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+       unsigned i, can_print;
+
+       printbuf_make_room(out, n);
+
+       can_print = min(n, printbuf_remaining(out));
+
+       for (i = 0; i < can_print; i++)
+               out->buf[out->pos++] = ((char *) b)[i];
+       out->pos += n - can_print;
+
+       printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+       prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+       prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+       printbuf_make_room(out, 2);
+       __prt_char_reserved(out, hex_asc_hi(byte));
+       __prt_char_reserved(out, hex_asc_lo(byte));
+       printbuf_nul_terminate(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+       printbuf_make_room(out, 2);
+       __prt_char_reserved(out, hex_asc_upper_hi(byte));
+       __prt_char_reserved(out, hex_asc_upper_lo(byte));
+       printbuf_nul_terminate(out);
+}
+
+/**
+ * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+       buf->pos                = 0;
+       buf->allocation_failure = 0;
+       buf->indent             = 0;
+       buf->nr_tabstops        = 0;
+       buf->cur_tabstop        = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+       buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+       buf->atomic--;
+}
+
+/*
+ * This is used for the %pf(%p) sprintf format extension, where we pass a pretty
+ * printer and arguments to the pretty-printer to sprintf
+ *
+ * Instead of passing a pretty-printer function to sprintf directly, we pass it
+ * a pointer to a struct call_pp, so that sprintf can check that the magic
+ * number is present, which in turn ensures that the CALL_PP() macro has been
+ * used in order to typecheck the arguments to the pretty printer function
+ *
+ * Example usage:
+ *   sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev));
+ */
+struct call_pp {
+       unsigned long   magic;
+       void            *fn;
+};
+
+#define PP_TYPECHECK(fn, ...)                                  \
+       ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); })
+
+#define CALL_PP_MAGIC          (unsigned long) 0xce0b92d22f6b6be4
+
+#define CALL_PP(fn, ...)                                       \
+       (PP_TYPECHECK(fn, ##__VA_ARGS__),                       \
+        &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__
+
+#endif /* _LINUX_PRINTBUF_H */
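A brief sketch, outside the diff, of the indent/tabstop workflow described in the header comment above; the two-column layout printed here is an assumption:

	/* Build a small report: left column is labels, right column starts at
	 * the single tabstop pushed below. */
	static void usage_to_text(struct printbuf *out)
	{
		printbuf_tabstop_push(out, 24);
		printbuf_indent_add(out, 2);

		prt_str(out, "capacity:");
		prt_tab(out);
		prt_units_u64(out, 1ULL << 30);
		prt_newline(out);

		printbuf_indent_sub(out, 2);
	}

	/* Caller side, using a heap-allocated printbuf: */
	struct printbuf buf = PRINTBUF;
	usage_to_text(&buf);
	printk("%s", buf.buf);
	printbuf_exit(&buf);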
index bc1619f7cbf765e184d4d80a2cd83a876c4b68e1..df9c1920ddfaf14f47a82a4408a3e589e8bf72ec 100644 (file)
@@ -5,6 +5,7 @@
 #define pr_fmt(fmt) fmt
 #endif
 
+#include <linux/compiler.h>
 #include <stdarg.h>
 #include <stdio.h>
 
@@ -169,7 +170,6 @@ static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
  * ratelimited messages with local ratelimit_state,
  * no local ratelimit_state used in the !PRINTK case
  */
-#ifdef CONFIG_PRINTK
 #define printk_ratelimited(fmt, ...)                                   \
 ({                                                                     \
        static DEFINE_RATELIMIT_STATE(_rs,                              \
@@ -179,10 +179,6 @@ static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
        if (__ratelimit(&_rs))                                          \
                printk(fmt, ##__VA_ARGS__);                             \
 })
-#else
-#define printk_ratelimited(fmt, ...)                                   \
-       no_printk(fmt, ##__VA_ARGS__)
-#endif
 
 #define pr_emerg_ratelimited(fmt, ...)                                 \
        printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
index 28c595a0c0cfe3429ff69c19fa83edb424d7d38c..ea101d53acd6003572ebcddf88db95a1deb1ba7c 100644 (file)
@@ -29,11 +29,6 @@ static inline void get_random_bytes(void *buf, int nbytes)
        BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
 }
 
-static inline void prandom_bytes(void *buf, int nbytes)
-{
-       return get_random_bytes(buf, nbytes);
-}
-
 #define get_random_type(type)                          \
 static inline type get_random_##type(void)             \
 {                                                      \
index 9d70e6e226ffbee5719d142a8e22d1c765412007..f851d6a2f2b7306df3d9d28ae5d0c524089d42e0 100644 (file)
@@ -19,6 +19,7 @@ static inline void init_rwsem(struct rw_semaphore *lock)
 }
 
 #define down_read(l)           pthread_rwlock_rdlock(&(l)->lock)
+#define down_read_killable(l)  (pthread_rwlock_rdlock(&(l)->lock), 0)
 #define down_read_trylock(l)   (!pthread_rwlock_tryrdlock(&(l)->lock))
 #define up_read(l)             pthread_rwlock_unlock(&(l)->lock)
 
index 48d20e29a1f334606c433f5220c051dd27444a31..ac6d27bb6b3bb3079c399330d7ec965408482676 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/bug.h>
 #include <linux/completion.h>
 #include <linux/jiffies.h>
+#include <linux/rwsem.h>
 #include <linux/time64.h>
 
 #define TASK_RUNNING           0
@@ -88,6 +89,10 @@ struct task_struct {
        pid_t                   pid;
 
        struct bio_list         *bio_list;
+
+       struct signal_struct    {
+               struct rw_semaphore exec_update_lock;
+       }                       *signal, _signal;
 };
 
 extern __thread struct task_struct *current;
@@ -157,4 +162,11 @@ static inline void ktime_get_coarse_real_ts64(struct timespec64 *ts)
 #define current_kernel_time64()        current_kernel_time()
 #define CURRENT_TIME           (current_kernel_time())
 
+static inline unsigned int stack_trace_save_tsk(struct task_struct *task,
+                                 unsigned long *store, unsigned int size,
+                                 unsigned int skipnr)
+{
+       return 0;
+}
+
 #endif /* __TOOLS_LINUX_SCHED_H */
index 626b768cda969e248202965f8638b1a10ac363e8..ebbab7a68c925a444c910922a97b338492b23cb0 100644 (file)
@@ -11,20 +11,22 @@ struct shrink_control {
 
 #define SHRINK_STOP (~0UL)
 
+struct printbuf;
 struct shrinker {
        unsigned long (*count_objects)(struct shrinker *,
                                       struct shrink_control *sc);
        unsigned long (*scan_objects)(struct shrinker *,
                                      struct shrink_control *sc);
+       void (*to_text)(struct printbuf *, struct shrinker *);
 
        int seeks;      /* seeks to recreate an obj */
        long batch;     /* reclaim batch size, 0 = default */
        struct list_head list;
 };
 
-int register_shrinker(struct shrinker *);
+int register_shrinker(struct shrinker *, const char *, ...);
 void unregister_shrinker(struct shrinker *);
 
-void run_shrinkers(void);
+void run_shrinkers(gfp_t gfp_mask, bool);
 
 #endif /* __TOOLS_LINUX_SHRINKER_H */
index 477c33eb00d7dca36ad07cde09cd5681455cf6b2..362a577b968e9da0eb5a54578934ed0dd5963fc0 100644 (file)
@@ -59,7 +59,6 @@
  */
 
 #include <linux/lockdep.h>
-#include <linux/osq_lock.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 
@@ -105,18 +104,25 @@ enum six_lock_type {
 
 struct six_lock {
        union six_lock_state    state;
-       unsigned                intent_lock_recurse;
        struct task_struct      *owner;
-       struct optimistic_spin_queue osq;
        unsigned __percpu       *readers;
-
+       unsigned                intent_lock_recurse;
+       unsigned long           ip;
        raw_spinlock_t          wait_lock;
-       struct list_head        wait_list[2];
+       struct list_head        wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map      dep_map;
 #endif
 };
 
+struct six_lock_waiter {
+       struct list_head        list;
+       struct task_struct      *task;
+       enum six_lock_type      lock_want;
+       bool                    lock_acquired;
+       u64                     start_time;
+};
+
 typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
 
 static __always_inline void __six_lock_init(struct six_lock *lock,
@@ -125,8 +131,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock,
 {
        atomic64_set(&lock->state.counter, 0);
        raw_spin_lock_init(&lock->wait_lock);
-       INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
-       INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+       INIT_LIST_HEAD(&lock->wait_list);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *) lock, sizeof(*lock));
        lockdep_init_map(&lock->dep_map, name, key, 0);
@@ -146,6 +151,8 @@ do {                                                                        \
 bool six_trylock_##type(struct six_lock *);                            \
 bool six_relock_##type(struct six_lock *, u32);                                \
 int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
+int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *,        \
+                          six_lock_should_sleep_fn, void *);           \
 void six_unlock_##type(struct six_lock *);
 
 __SIX_LOCK(read)
@@ -182,6 +189,13 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
        SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
 }
 
+static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+                               struct six_lock_waiter *wait,
+                               six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
+}
+
 static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 {
        SIX_LOCK_DISPATCH(type, six_unlock, lock);
@@ -196,8 +210,13 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
 
 void six_lock_wakeup_all(struct six_lock *);
 
-void six_lock_pcpu_free_rcu(struct six_lock *);
 void six_lock_pcpu_free(struct six_lock *);
 void six_lock_pcpu_alloc(struct six_lock *);
 
+struct six_lock_count {
+       unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+
 #endif /* _LINUX_SIX_H */
index bc99973fccd22059805f43d99fbda4f9e7daefaf..17fe235eef1786c603cba131ebd9e83480db4b27 100644 (file)
@@ -7,10 +7,14 @@
 
 #include <linux/kernel.h>
 #include <linux/log2.h>
+#include <linux/overflow.h>
 #include <linux/page.h>
 #include <linux/shrinker.h>
 #include <linux/types.h>
 
+#include <stdlib.h>
+#include <sys/mman.h>
+
 #define ARCH_KMALLOC_MINALIGN          16
 #define KMALLOC_MAX_SIZE               SIZE_MAX
 
@@ -20,7 +24,7 @@ static inline void *kmalloc(size_t size, gfp_t flags)
        void *p;
 
        do {
-               run_shrinkers();
+               run_shrinkers(flags, i != 0);
 
                if (size) {
                        size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
@@ -58,6 +62,16 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
        return new;
 }
 
+static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
+{
+       size_t bytes;
+
+       if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
+               return NULL;
+
+       return krealloc(p, bytes, flags);
+}
+
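A small hedged sketch of the overflow-checked growth pattern krealloc_array() enables; the doubling policy and helper name are illustrative assumptions:

	/* Append a value, growing the array safely: krealloc_array() returns
	 * NULL if new_capacity * sizeof(u64) would overflow. */
	static int push_entry(u64 **entries, size_t *nr, size_t *capacity, u64 v)
	{
		if (*nr == *capacity) {
			size_t new_capacity = *capacity ? *capacity * 2 : 8;
			u64 *n = krealloc_array(*entries, new_capacity,
						sizeof(u64), GFP_KERNEL);

			if (!n)
				return -ENOMEM;
			*entries  = n;
			*capacity = new_capacity;
		}

		(*entries)[(*nr)++] = v;
		return 0;
	}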
 #define kzalloc(size, flags)           kmalloc(size, flags|__GFP_ZERO)
 #define kmalloc_array(n, size, flags)                                  \
        ((size) != 0 && (n) > SIZE_MAX / (size)                         \
@@ -83,7 +97,7 @@ static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
        void *p;
 
        do {
-               run_shrinkers();
+               run_shrinkers(flags, i != 0);
 
                p = aligned_alloc(PAGE_SIZE, size);
                if (p && (flags & __GFP_ZERO))
@@ -174,4 +188,53 @@ static inline struct kmem_cache *kmem_cache_create(size_t obj_size)
 
 #define KMEM_CACHE(_struct, _flags)    kmem_cache_create(sizeof(struct _struct))
 
+#define PAGE_KERNEL            0
+#define PAGE_KERNEL_EXEC       1
+
+#define vfree(p)               free(p)
+
+static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+{
+       unsigned i = 0;
+       void *p;
+
+       size = round_up(size, PAGE_SIZE);
+
+       do {
+               run_shrinkers(gfp_mask, i != 0);
+
+               p = aligned_alloc(PAGE_SIZE, size);
+               if (p && gfp_mask & __GFP_ZERO)
+                       memset(p, 0, size);
+       } while (!p && i++ < 10);
+
+       return p;
+}
+
+static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
+{
+       void *p;
+
+       p = __vmalloc(size, gfp_mask);
+       if (!p)
+               return NULL;
+
+       if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+               vfree(p);
+               return NULL;
+       }
+
+       return p;
+}
+
+static inline void *vmalloc(unsigned long size)
+{
+       return __vmalloc(size, GFP_KERNEL);
+}
+
+static inline void *vzalloc(unsigned long size)
+{
+       return __vmalloc(size, GFP_KERNEL|__GFP_ZERO);
+}
+
 #endif /* __TOOLS_LINUX_SLAB_H */
index c9be6b61028a7d6a915dfa6a8639e6146273fcee..6c4a623c267182d6362ce6a9a1843c230a67fdad 100644 (file)
@@ -2,27 +2,32 @@
 #define __TOOLS_LINUX_SPINLOCK_H
 
 #include <linux/atomic.h>
+#include <pthread.h>
 
 typedef struct {
-       int             count;
+       pthread_mutex_t lock;
 } raw_spinlock_t;
 
-#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .count = 0 }
+#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER  }
 
 static inline void raw_spin_lock_init(raw_spinlock_t *lock)
 {
-       smp_store_release(&lock->count, 0);
+       pthread_mutex_init(&lock->lock, NULL);
+}
+
+static inline bool raw_spin_trylock(raw_spinlock_t *lock)
+{
+       return !pthread_mutex_trylock(&lock->lock);
 }
 
 static inline void raw_spin_lock(raw_spinlock_t *lock)
 {
-       while (xchg_acquire(&lock->count, 1))
-               ;
+       pthread_mutex_lock(&lock->lock);
 }
 
 static inline void raw_spin_unlock(raw_spinlock_t *lock)
 {
-       smp_store_release(&lock->count, 0);
+       pthread_mutex_unlock(&lock->lock);
 }
 
 #define raw_spin_lock_irq(lock)                raw_spin_lock(lock)
index b5e00a092f420f89f1237fa3b908ad8f8b82bbc1..3ceda3a3a6698884137a48bdeef4acafb3ac565a 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/types.h>       /* for size_t */
 
 extern size_t strlcpy(char *dest, const char *src, size_t size);
+extern ssize_t strscpy(char *dest, const char *src, size_t count);
 extern char *strim(char *);
 extern void memzero_explicit(void *, size_t);
 int match_string(const char * const *, size_t, const char *);
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
new file mode 100644 (file)
index 0000000..af58770
--- /dev/null
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STRING_HELPERS_H_
+#define _LINUX_STRING_HELPERS_H_
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+
+/* Descriptions of the types of units to print in */
+enum string_size_units {
+       STRING_UNITS_10,        /* use powers of 10^3 (standard SI) */
+       STRING_UNITS_2,         /* use binary powers of 2^10 */
+};
+
+int string_get_size(u64 size, u64 blk_size, enum string_size_units units,
+                   char *buf, int len);
+
+#endif
index 3ba2f48a016eea6025234e202250a2409c0d0916..cb75d88bd3ac355b3b8fcaabc73596ec65786d1b 100644 (file)
@@ -10,6 +10,10 @@ struct attribute {
        umode_t                 mode;
 };
 
+struct attribute_group {
+       struct attribute        **attrs;
+};
+
 struct sysfs_ops {
        ssize_t (*show)(struct kobject *, struct attribute *, char *);
        ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);
index 77f967377757530f64238075483a4e6e6a3df365..fc05e23afe379218b123b9e28102db8b7f7ce4b7 100644 (file)
@@ -6,6 +6,7 @@
 #include <stdint.h>
 
 #include <fcntl.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 
 #define __SANE_USERSPACE_TYPES__       /* For PPC64, to get LL64 types */
@@ -24,7 +25,6 @@ typedef unsigned short                umode_t;
 
 typedef unsigned gfp_t;
 
-#define GFP_KERNEL     0
 #define GFP_ATOMIC     0
 #define GFP_NOFS       0
 #define GFP_NOIO       0
@@ -35,6 +35,7 @@ typedef unsigned gfp_t;
 #define __GFP_NORETRY  0
 #define __GFP_NOFAIL   0
 #define __GFP_ZERO     1
+#define GFP_KERNEL     2
 
 #define PAGE_ALLOC_COSTLY_ORDER        6
 
@@ -78,4 +79,8 @@ typedef u64 sector_t;
 
 typedef int (*cmp_func_t)(const void *a, const void *b);
 
+typedef unsigned int __bitwise slab_flags_t;
+typedef u64 phys_addr_t;
+struct vm_struct;
+
 #endif /* _TOOLS_LINUX_TYPES_H_ */
index ccb319eb52a4a444db0f5981f23a7acfa92bc2f7..55fffb5991e0e402a2665a6888018fb10f9bae0d 100644 (file)
@@ -1,59 +1,6 @@
 #ifndef __TOOLS_LINUX_VMALLOC_H
 #define __TOOLS_LINUX_VMALLOC_H
 
-#include <stdlib.h>
-#include <sys/mman.h>
-
 #include "linux/slab.h"
-#include "tools-util.h"
-
-#define PAGE_KERNEL            0
-#define PAGE_KERNEL_EXEC       1
-
-#define vfree(p)               free(p)
-
-static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
-{
-       unsigned i = 0;
-       void *p;
-
-       size = round_up(size, PAGE_SIZE);
-
-       do {
-               run_shrinkers();
-
-               p = aligned_alloc(PAGE_SIZE, size);
-               if (p && gfp_mask & __GFP_ZERO)
-                       memset(p, 0, size);
-       } while (!p && i++ < 10);
-
-       return p;
-}
-
-static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
-{
-       void *p;
-
-       p = __vmalloc(size, gfp_mask);
-       if (!p)
-               return NULL;
-
-       if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
-               vfree(p);
-               return NULL;
-       }
-
-       return p;
-}
-
-static inline void *vmalloc(unsigned long size)
-{
-       return __vmalloc(size, GFP_KERNEL);
-}
-
-static inline void *vzalloc(unsigned long size)
-{
-       return __vmalloc(size, GFP_KERNEL|__GFP_ZERO);
-}
 
 #endif /* __TOOLS_LINUX_VMALLOC_H */
index 0dd1b0230293847cec379b60e8afe0e6a656d037..b0fa1eda5acb7c66fe2e83407b0fa0bf6c8865fd 100644 (file)
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd) and
+ * the GPLv2 (found in the COPYING file in the root directory of
+ * https://github.com/facebook/zstd). You may select, at your option, one of the
+ * above-listed licenses.
+ */
+
+#ifndef LINUX_ZSTD_H
+#define LINUX_ZSTD_H
+
+/**
+ * This is a kernel-style API that wraps the upstream zstd API, which cannot be
+ * used directly because the symbols aren't exported. It exposes the minimal
+ * functionality which is currently required by users of zstd in the kernel.
+ * Expose extra functions from lib/zstd/zstd.h as needed.
+ */
+
+/* ======   Dependency   ====== */
+#include <linux/types.h>
 #include <zstd.h>
+#include <linux/zstd_errors.h>
+
+/* ======   Helper Functions   ====== */
+/**
+ * zstd_compress_bound() - maximum compressed size in worst case scenario
+ * @src_size: The size of the data to compress.
+ *
+ * Return:    The maximum compressed size in the worst case scenario.
+ */
+size_t zstd_compress_bound(size_t src_size);
+
+/**
+ * zstd_is_error() - tells if a size_t function result is an error code
+ * @code:  The function result to check for error.
+ *
+ * Return: Non-zero iff the code is an error.
+ */
+unsigned int zstd_is_error(size_t code);
+
+/**
+ * enum zstd_error_code - zstd error codes
+ */
+typedef ZSTD_ErrorCode zstd_error_code;
+
+/**
+ * zstd_get_error_code() - translates an error function result to an error code
+ * @code:  The function result for which zstd_is_error(code) is true.
+ *
+ * Return: A unique error code for this error.
+ */
+zstd_error_code zstd_get_error_code(size_t code);
+
+/**
+ * zstd_get_error_name() - translates an error function result to a string
+ * @code:  The function result for which zstd_is_error(code) is true.
+ *
+ * Return: An error string corresponding to the error code.
+ */
+const char *zstd_get_error_name(size_t code);
+
+/**
+ * zstd_min_clevel() - minimum allowed compression level
+ *
+ * Return: The minimum allowed compression level.
+ */
+int zstd_min_clevel(void);
+
+/**
+ * zstd_max_clevel() - maximum allowed compression level
+ *
+ * Return: The maximum allowed compression level.
+ */
+int zstd_max_clevel(void);
+
+/* ======   Parameter Selection   ====== */
+
+/**
+ * enum zstd_strategy - zstd compression search strategy
+ *
+ * From faster to stronger. See zstd_lib.h.
+ */
+typedef ZSTD_strategy zstd_strategy;
+
+/**
+ * struct zstd_compression_parameters - zstd compression parameters
+ * @windowLog:    Log of the largest match distance. Larger means more
+ *                compression, and more memory needed during decompression.
+ * @chainLog:     Fully searched segment. Larger means more compression,
+ *                slower, and more memory (useless for fast).
+ * @hashLog:      Dispatch table. Larger means more compression,
+ *                slower, and more memory.
+ * @searchLog:    Number of searches. Larger means more compression and slower.
+ * @searchLength: Match length searched. Larger means faster decompression,
+ *                sometimes less compression.
+ * @targetLength: Acceptable match size for optimal parser (only). Larger means
+ *                more compression, and slower.
+ * @strategy:     The zstd compression strategy.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_compressionParameters zstd_compression_parameters;
+
+/**
+ * struct zstd_frame_parameters - zstd frame parameters
+ * @contentSizeFlag: Controls whether content size will be present in the
+ *                   frame header (when known).
+ * @checksumFlag:    Controls whether a 32-bit checksum is generated at the
+ *                   end of the frame for error detection.
+ * @noDictIDFlag:    Controls whether dictID will be saved into the frame
+ *                   header when using dictionary compression.
+ *
+ * The default value is all fields set to 0. See zstd_lib.h.
+ */
+typedef ZSTD_frameParameters zstd_frame_parameters;
+
+/**
+ * struct zstd_parameters - zstd parameters
+ * @cParams: The compression parameters.
+ * @fParams: The frame parameters.
+ */
+typedef ZSTD_parameters zstd_parameters;
+
+/**
+ * zstd_get_params() - returns zstd_parameters for selected level
+ * @level:              The compression level
+ * @estimated_src_size: The estimated source size to compress or 0
+ *                      if unknown.
+ *
+ * Return:              The selected zstd_parameters.
+ */
+zstd_parameters zstd_get_params(int level,
+       unsigned long long estimated_src_size);
+
+/* ======   Single-pass Compression   ====== */
+
+typedef ZSTD_CCtx zstd_cctx;
+
+/**
+ * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx
+ * @parameters: The compression parameters to be used.
+ *
+ * If multiple compression parameters might be used, the caller must call
+ * zstd_cctx_workspace_bound() for each set of parameters and use the maximum
+ * size.
+ *
+ * Return:      A lower bound on the size of the workspace that is passed to
+ *              zstd_init_cctx().
+ */
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters);
+
+/**
+ * zstd_init_cctx() - initialize a zstd compression context
+ * @workspace:      The workspace to emplace the context into. It must outlive
+ *                  the returned context.
+ * @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to
+ *                  determine how large the workspace must be.
+ *
+ * Return:          A zstd compression context or NULL on error.
+ */
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
+
+/**
+ * zstd_compress_cctx() - compress src into dst with the initialized parameters
+ * @cctx:         The context. Must have been initialized with zstd_init_cctx().
+ * @dst:          The buffer to compress src into.
+ * @dst_capacity: The size of the destination buffer. May be any size, but
+ *                ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:          The data to compress.
+ * @src_size:     The size of the data to compress.
+ * @parameters:   The compression parameters to be used.
+ *
+ * Return:        The compressed size or an error, which can be checked using
+ *                zstd_is_error().
+ */
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+       const void *src, size_t src_size, const zstd_parameters *parameters);
+
+/* ======   Single-pass Decompression   ====== */
+
+typedef ZSTD_DCtx zstd_dctx;
+
+/**
+ * zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ *         zstd_init_dctx().
+ */
+size_t zstd_dctx_workspace_bound(void);
+
+/**
+ * zstd_init_dctx() - initialize a zstd decompression context
+ * @workspace:      The workspace to emplace the context into. It must outlive
+ *                  the returned context.
+ * @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to
+ *                  determine how large the workspace must be.
+ *
+ * Return:          A zstd decompression context or NULL on error.
+ */
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
+
+/**
+ * zstd_decompress_dctx() - decompress zstd compressed src into dst
+ * @dctx:         The decompression context.
+ * @dst:          The buffer to decompress src into.
+ * @dst_capacity: The size of the destination buffer. Must be at least as large
+ *                as the decompressed size. If the caller cannot upper bound the
+ *                decompressed size, then it's better to use the streaming API.
+ * @src:          The zstd compressed data to decompress. Multiple concatenated
+ *                frames and skippable frames are allowed.
+ * @src_size:     The exact size of the data to decompress.
+ *
+ * Return:        The decompressed size or an error, which can be checked using
+ *                zstd_is_error().
+ */
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+       const void *src, size_t src_size);
+
+/* ======   Streaming Buffers   ====== */
+
+/**
+ * struct zstd_in_buffer - input buffer for streaming
+ * @src:  Start of the input buffer.
+ * @size: Size of the input buffer.
+ * @pos:  Position where reading stopped. Will be updated.
+ *        Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_inBuffer zstd_in_buffer;
+
+/**
+ * struct zstd_out_buffer - output buffer for streaming
+ * @dst:  Start of the output buffer.
+ * @size: Size of the output buffer.
+ * @pos:  Position where writing stopped. Will be updated.
+ *        Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_outBuffer zstd_out_buffer;
+
+/* ======   Streaming Compression   ====== */
+
+typedef ZSTD_CStream zstd_cstream;
+
+/**
+ * zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream
+ * @cparams: The compression parameters to be used for compression.
+ *
+ * Return:   A lower bound on the size of the workspace that is passed to
+ *           zstd_init_cstream().
+ */
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams);
+
+/**
+ * zstd_init_cstream() - initialize a zstd streaming compression context
+ * @parameters:       The zstd parameters to use for compression.
+ * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller
+ *                    must pass the source size (zero means empty source).
+ *                    Otherwise, the caller may optionally pass the source
+ *                    size, or zero if unknown.
+ * @workspace:        The workspace to emplace the context into. It must outlive
+ *                    the returned context.
+ * @workspace_size:   The size of workspace.
+ *                    Use zstd_cstream_workspace_bound(params->cparams) to
+ *                    determine how large the workspace must be.
+ *
+ * Return:            The zstd streaming compression context or NULL on error.
+ */
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+       unsigned long long pledged_src_size, void *workspace, size_t workspace_size);
+
+/**
+ * zstd_reset_cstream() - reset the context using parameters from creation
+ * @cstream:          The zstd streaming compression context to reset.
+ * @pledged_src_size: Optionally the source size, or zero if unknown.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused. If `pledged_src_size` is non-zero the frame
+ * content size is always written into the frame header.
+ *
+ * Return:            Zero or an error, which can be checked using
+ *                    zstd_is_error().
+ */
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+       unsigned long long pledged_src_size);
+
+/**
+ * zstd_compress_stream() - streaming compress some of input into output
+ * @cstream: The zstd streaming compression context.
+ * @output:  Destination buffer. `output->pos` is updated to indicate how much
+ *           compressed data was written.
+ * @input:   Source buffer. `input->pos` is updated to indicate how much data
+ *           was read. Note that it may not consume the entire input, in which
+ *           case `input->pos < input->size`, and it's up to the caller to
+ *           present remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ *
+ * Return:   A hint for the number of bytes to use as the input for the next
+ *           function call or an error, which can be checked using
+ *           zstd_is_error().
+ */
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+       zstd_in_buffer *input);
+
+/**
+ * zstd_flush_stream() - flush internal buffers into output
+ * @cstream: The zstd streaming compression context.
+ * @output:  Destination buffer. `output->pos` is updated to indicate how much
+ *           compressed data was written.
+ *
+ * zstd_flush_stream() must be called until it returns 0, meaning all the data
+ * has been flushed. Since zstd_flush_stream() causes a block to be ended,
+ * calling it too often will degrade the compression ratio.
+ *
+ * Return:   The number of bytes still present within internal buffers or an
+ *           error, which can be checked using zstd_is_error().
+ */
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output);
+
+/**
+ * zstd_end_stream() - flush internal buffers into output and end the frame
+ * @cstream: The zstd streaming compression context.
+ * @output:  Destination buffer. `output->pos` is updated to indicate how much
+ *           compressed data was written.
+ *
+ * zstd_end_stream() must be called until it returns 0, meaning all the data has
+ * been flushed and the frame epilogue has been written.
+ *
+ * Return:   The number of bytes still present within internal buffers or an
+ *           error, which can be checked using zstd_is_error().
+ */
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output);
+
+/* ======   Streaming Decompression   ====== */
+
+typedef ZSTD_DStream zstd_dstream;
+
+/**
+ * zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ *
+ * Return:           A lower bound on the size of the workspace that is passed
+ *                   to zstd_init_dstream().
+ */
+size_t zstd_dstream_workspace_bound(size_t max_window_size);
+
+/**
+ * zstd_init_dstream() - initialize a zstd streaming decompression context
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ * @workspace:       The workspace to emplace the context into. It must outlive
+ *                   the returned context.
+ * @workspace_size:  The size of workspace.
+ *                   Use zstd_dstream_workspace_bound(max_window_size) to
+ *                   determine how large the workspace must be.
+ *
+ * Return:           The zstd streaming decompression context.
+ */
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+       size_t workspace_size);
+
+/**
+ * zstd_reset_dstream() - reset the context using parameters from creation
+ * @dstream: The zstd streaming decompression context to reset.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused.
+ *
+ * Return:   Zero or an error, which can be checked using zstd_is_error().
+ */
+size_t zstd_reset_dstream(zstd_dstream *dstream);
+
+/**
+ * zstd_decompress_stream() - streaming decompress some of input into output
+ * @dstream: The zstd streaming decompression context.
+ * @output:  Destination buffer. `output.pos` is updated to indicate how much
+ *           decompressed data was written.
+ * @input:   Source buffer. `input.pos` is updated to indicate how much data was
+ *           read. Note that it may not consume the entire input, in which case
+ *           `input.pos < input.size`, and it's up to the caller to present
+ *           remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ * zstd_decompress_stream() will not consume the last byte of the frame until
+ * the entire frame is flushed.
+ *
+ * Return:   Returns 0 iff a frame is completely decoded and fully flushed.
+ *           Otherwise returns a hint for the number of bytes to use as the
+ *           input for the next function call or an error, which can be checked
+ *           using zstd_is_error(). The size hint will never load more than the
+ *           frame.
+ */
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+       zstd_in_buffer *input);
+
+/* ======   Frame Inspection Functions ====== */
+
+/**
+ * zstd_find_frame_compressed_size() - returns the size of a compressed frame
+ * @src:      Source buffer. It should point to the start of a zstd encoded
+ *            frame or a skippable frame.
+ * @src_size: The size of the source buffer. It must be at least as large as the
+ *            size of the frame.
+ *
+ * Return:    The compressed size of the frame pointed to by `src` or an error,
+ *            which can be checked with zstd_is_error().
+ *            Suitable to pass to ZSTD_decompress() or similar functions.
+ */
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);
 
-#define ZSTD_initDCtx(w, s)    ZSTD_initStaticDCtx(w, s)
-#define ZSTD_initCCtx(w, s)    ZSTD_initStaticCCtx(w, s)
+/**
+ * struct zstd_frame_params - zstd frame parameters stored in the frame header
+ * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not
+ *                    present.
+ * @windowSize:       The window size, or 0 if the frame is a skippable frame.
+ * @blockSizeMax:     The maximum block size.
+ * @frameType:        The frame type (zstd or skippable)
+ * @headerSize:       The size of the frame header.
+ * @dictID:           The dictionary id, or 0 if not present.
+ * @checksumFlag:     Whether a checksum was used.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_frameHeader zstd_frame_header;
 
-#define ZSTD_compressCCtx(w, dst, d_len, src, src_len, params) \
-       ZSTD_compressCCtx(w, dst, d_len, src, src_len, 0)
+/**
+ * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
+ * @params:   On success the frame parameters are written here.
+ * @src:      The source buffer. It must point to a zstd or skippable frame.
+ * @src_size: The size of the source buffer.
+ *
+ * Return:    0 on success. If more data is required it returns how many bytes
+ *            must be provided to make forward progress. Otherwise it returns
+ *            an error, which can be checked using zstd_is_error().
+ */
+size_t zstd_get_frame_header(zstd_frame_header *params, const void *src,
+       size_t src_size);
 
-#define ZSTD_CCtxWorkspaceBound(p)     ZSTD_estimateCCtxSize(0)
-#define ZSTD_DCtxWorkspaceBound()      ZSTD_estimateDCtxSize()
+#endif  /* LINUX_ZSTD_H */
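A hedged end-to-end sketch of the single-pass compression path documented above; the compression level and the assumption that the caller sized the workspace with zstd_cctx_workspace_bound() for the same parameters are illustrative, not taken from this change:

	/* Compress src into dst; returns the compressed size, or 0 on error. */
	static size_t compress_buf(void *dst, size_t dst_capacity,
				   const void *src, size_t src_size,
				   void *workspace, size_t workspace_size)
	{
		zstd_parameters params = zstd_get_params(3, src_size);
		zstd_cctx *cctx = zstd_init_cctx(workspace, workspace_size);
		size_t ret;

		if (!cctx)
			return 0;

		ret = zstd_compress_cctx(cctx, dst, dst_capacity,
					 src, src_size, &params);
		return zstd_is_error(ret) ? 0 : ret;
	}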
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
new file mode 100644 (file)
index 0000000..58b6dd4
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+
+/*===== dependency =====*/
+#include <linux/types.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#define ZSTDERRORLIB_VISIBILITY 
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall= 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  ZSTD_error_dstBuffer_null   = 74,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_dstBuffer_wrong     = 104,
+  ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+
+#endif /* ZSTD_ERRORS_H_398273423 */
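
ZSTD_getErrorCode() is what lets callers act on a specific failure instead of string-matching error names. A short sketch under the declarations above; dst_buffer_too_small() is a hypothetical helper, not part of this header:

#include <linux/printk.h>
#include <linux/zstd_errors.h>

/* Returns true when a zstd call failed specifically because the output
 * buffer was too small; any other error is just logged. */
static bool dst_buffer_too_small(size_t zstd_ret)
{
	ZSTD_ErrorCode err = ZSTD_getErrorCode(zstd_ret);

	if (err != ZSTD_error_no_error)
		pr_warn("zstd error: %s\n", ZSTD_getErrorString(err));

	/* Error values below 100 are stable, so comparing the enum is safe */
	return err == ZSTD_error_dstSize_tooSmall;
}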
index a21a39230a091b6076052007c14cdf0c872b87c1..d3d9e965e7020efb96b4eff9a03f89a19d3a88f6 100644 (file)
@@ -7,21 +7,29 @@
 
 #include <linux/tracepoint.h>
 
+#define TRACE_BPOS_entries(name)                               \
+       __field(u64,                    name##_inode    )       \
+       __field(u64,                    name##_offset   )       \
+       __field(u32,                    name##_snapshot )
+
+#define TRACE_BPOS_assign(dst, src)                            \
+       __entry->dst##_inode            = (src).inode;          \
+       __entry->dst##_offset           = (src).offset;         \
+       __entry->dst##_snapshot         = (src).snapshot
+
 DECLARE_EVENT_CLASS(bpos,
-       TP_PROTO(struct bpos *p),
+       TP_PROTO(const struct bpos *p),
        TP_ARGS(p),
 
        TP_STRUCT__entry(
-               __field(u64,    inode                           )
-               __field(u64,    offset                          )
+               TRACE_BPOS_entries(p)
        ),
 
        TP_fast_assign(
-               __entry->inode  = p->inode;
-               __entry->offset = p->offset;
+               TRACE_BPOS_assign(p, *p);
        ),
 
-       TP_printk("%llu:%llu", __entry->inode, __entry->offset)
+       TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
 );
 
 DECLARE_EVENT_CLASS(bkey,
@@ -44,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey,
                  __entry->offset, __entry->size)
 );
 
+DECLARE_EVENT_CLASS(btree_node,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(u8,             level                   )
+               __field(u8,             btree_id                )
+               TRACE_BPOS_entries(pos)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = c->dev;
+               __entry->level          = b->c.level;
+               __entry->btree_id       = b->c.btree_id;
+               TRACE_BPOS_assign(pos, b->key.k.p);
+       ),
+
+       TP_printk("%d,%d %u %s %llu:%llu:%u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->level,
+                 bch2_btree_ids[__entry->btree_id],
+                 __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
 DECLARE_EVENT_CLASS(bch_fs,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c),
@@ -82,9 +115,29 @@ DECLARE_EVENT_CLASS(bio,
                  (unsigned long long)__entry->sector, __entry->nr_sector)
 );
 
+/* super-io.c: */
+TRACE_EVENT(write_super,
+       TP_PROTO(struct bch_fs *c, unsigned long ip),
+       TP_ARGS(c, ip),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev     )
+               __field(unsigned long,  ip      )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = c->dev;
+               __entry->ip             = ip;
+       ),
+
+       TP_printk("%d,%d for %pS",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (void *) __entry->ip)
+);
+
 /* io.c: */
 
-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, read_promote,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
 );
@@ -94,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce,
        TP_ARGS(bio)
 );
 
+DEFINE_EVENT(bio, read_split,
+       TP_PROTO(struct bio *bio),
+       TP_ARGS(bio)
+);
+
 DEFINE_EVENT(bio, read_retry,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
 );
 
-DEFINE_EVENT(bio, promote,
+DEFINE_EVENT(bio, read_reuse_race,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
 );
@@ -122,17 +180,21 @@ DEFINE_EVENT(bio, journal_write,
 );
 
 TRACE_EVENT(journal_reclaim_start,
-       TP_PROTO(struct bch_fs *c, u64 min_nr,
+       TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+                u64 min_nr, u64 min_key_cache,
                 u64 prereserved, u64 prereserved_total,
                 u64 btree_cache_dirty, u64 btree_cache_total,
                 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-       TP_ARGS(c, min_nr, prereserved, prereserved_total,
+       TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
                btree_cache_dirty, btree_cache_total,
                btree_key_cache_dirty, btree_key_cache_total),
 
        TP_STRUCT__entry(
                __field(dev_t,          dev                     )
+               __field(bool,           direct                  )
+               __field(bool,           kicked                  )
                __field(u64,            min_nr                  )
+               __field(u64,            min_key_cache           )
                __field(u64,            prereserved             )
                __field(u64,            prereserved_total       )
                __field(u64,            btree_cache_dirty       )
@@ -143,7 +205,10 @@ TRACE_EVENT(journal_reclaim_start,
 
        TP_fast_assign(
                __entry->dev                    = c->dev;
+               __entry->direct                 = direct;
+               __entry->kicked                 = kicked;
                __entry->min_nr                 = min_nr;
+               __entry->min_key_cache          = min_key_cache;
                __entry->prereserved            = prereserved;
                __entry->prereserved_total      = prereserved_total;
                __entry->btree_cache_dirty      = btree_cache_dirty;
@@ -152,9 +217,12 @@ TRACE_EVENT(journal_reclaim_start,
                __entry->btree_key_cache_total  = btree_key_cache_total;
        ),
 
-       TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->direct,
+                 __entry->kicked,
                  __entry->min_nr,
+                 __entry->min_key_cache,
                  __entry->prereserved,
                  __entry->prereserved_total,
                  __entry->btree_cache_dirty,
@@ -177,7 +245,7 @@ TRACE_EVENT(journal_reclaim_finish,
                __entry->nr_flushed     = nr_flushed;
        ),
 
-       TP_printk("%d%d flushed %llu",
+       TP_printk("%d,%d flushed %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_flushed)
 );
@@ -185,44 +253,65 @@ TRACE_EVENT(journal_reclaim_finish,
 /* bset.c: */
 
 DEFINE_EVENT(bpos, bkey_pack_pos_fail,
-       TP_PROTO(struct bpos *p),
+       TP_PROTO(const struct bpos *p),
        TP_ARGS(p)
 );
 
-/* Btree */
+/* Btree cache: */
 
-DECLARE_EVENT_CLASS(btree_node,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b),
+TRACE_EVENT(btree_cache_scan,
+       TP_PROTO(long nr_to_scan, long can_free, long ret),
+       TP_ARGS(nr_to_scan, can_free, ret),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u8,             level                   )
-               __field(u8,             id                      )
-               __field(u64,            inode                   )
-               __field(u64,            offset                  )
+               __field(long,   nr_to_scan              )
+               __field(long,   can_free                )
+               __field(long,   ret                     )
        ),
 
        TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->level          = b->c.level;
-               __entry->id             = b->c.btree_id;
-               __entry->inode          = b->key.k.p.inode;
-               __entry->offset         = b->key.k.p.offset;
+               __entry->nr_to_scan     = nr_to_scan;
+               __entry->can_free       = can_free;
+               __entry->ret            = ret;
        ),
 
-       TP_printk("%d,%d  %u id %u %llu:%llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->level, __entry->id,
-                 __entry->inode, __entry->offset)
+       TP_printk("scanned for %li nodes, can free %li, ret %li",
+                 __entry->nr_to_scan, __entry->can_free, __entry->ret)
 );
 
-DEFINE_EVENT(btree_node, btree_read,
+DEFINE_EVENT(btree_node, btree_cache_reap,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
 );
 
-TRACE_EVENT(btree_write,
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+       TP_PROTO(struct bch_fs *c),
+       TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
        TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
        TP_ARGS(b, bytes, sectors),
 
@@ -252,268 +341,340 @@ DEFINE_EVENT(btree_node, btree_node_free,
        TP_ARGS(c, b)
 );
 
-DEFINE_EVENT(btree_node, btree_node_reap,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
 TRACE_EVENT(btree_reserve_get_fail,
-       TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl),
-       TP_ARGS(c, required, cl),
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                size_t required),
+       TP_ARGS(trans_fn, caller_ip, required),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
                __field(size_t,                 required        )
-               __field(struct closure *,       cl              )
        ),
 
        TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->required = required;
-               __entry->cl = cl;
+               strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip      = caller_ip;
+               __entry->required       = required;
        ),
 
-       TP_printk("%d,%d required %zu by %p",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->required, __entry->cl)
+       TP_printk("%s %pS required %zu",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 __entry->required)
 );
 
-DEFINE_EVENT(btree_node, btree_split,
+DEFINE_EVENT(btree_node, btree_node_compact,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
 );
 
-DEFINE_EVENT(btree_node, btree_compact,
+DEFINE_EVENT(btree_node, btree_node_merge,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
 );
 
-DEFINE_EVENT(btree_node, btree_merge,
+DEFINE_EVENT(btree_node, btree_node_split,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
 );
 
-DEFINE_EVENT(btree_node, btree_set_root,
+DEFINE_EVENT(btree_node, btree_node_rewrite,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
 );
 
-TRACE_EVENT(btree_cache_scan,
-       TP_PROTO(unsigned long nr_to_scan_pages,
-                unsigned long nr_to_scan_nodes,
-                unsigned long can_free_nodes,
-                long ret),
-       TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+DEFINE_EVENT(btree_node, btree_node_set_root,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip,
+                struct btree_path *path,
+                unsigned level),
+       TP_ARGS(trans, caller_ip, path, level),
 
        TP_STRUCT__entry(
-               __field(unsigned long,  nr_to_scan_pages        )
-               __field(unsigned long,  nr_to_scan_nodes        )
-               __field(unsigned long,  can_free_nodes          )
-               __field(long,           ret                     )
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     btree_id        )
+               __field(u8,                     level           )
+               TRACE_BPOS_entries(pos)
+               __array(char,                   node, 24        )
+               __field(u32,                    iter_lock_seq   )
+               __field(u32,                    node_lock_seq   )
        ),
 
        TP_fast_assign(
-               __entry->nr_to_scan_pages       = nr_to_scan_pages;
-               __entry->nr_to_scan_nodes       = nr_to_scan_nodes;
-               __entry->can_free_nodes         = can_free_nodes;
-               __entry->ret                    = ret;
+               struct btree *b = btree_path_node(path, level);
+
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = path->btree_id;
+               __entry->level                  = path->level;
+               TRACE_BPOS_assign(pos, path->pos);
+               if (IS_ERR(b))
+                       strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+               else
+                       scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+               __entry->iter_lock_seq          = path->l[level].lock_seq;
+               __entry->node_lock_seq          = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
        ),
 
-       TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
-                 __entry->nr_to_scan_pages,
-                 __entry->nr_to_scan_nodes,
-                 __entry->can_free_nodes,
-                 __entry->ret)
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 bch2_btree_ids[__entry->btree_id],
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
+                 __entry->level,
+                 __entry->node,
+                 __entry->iter_lock_seq,
+                 __entry->node_lock_seq)
 );
 
-TRACE_EVENT(btree_node_relock_fail,
-       TP_PROTO(const char *trans_fn,
+TRACE_EVENT(btree_path_upgrade_fail,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned long node,
-                u32 iter_lock_seq,
-                u32 node_lock_seq),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+                struct btree_path *path,
+                unsigned level),
+       TP_ARGS(trans, caller_ip, path, level),
 
        TP_STRUCT__entry(
-               __array(char,                   trans_fn, 24    )
+               __array(char,                   trans_fn, 32    )
                __field(unsigned long,          caller_ip       )
                __field(u8,                     btree_id        )
-               __field(u64,                    pos_inode       )
-               __field(u64,                    pos_offset      )
-               __field(u32,                    pos_snapshot    )
-               __field(unsigned long,          node            )
+               __field(u8,                     level           )
+               TRACE_BPOS_entries(pos)
+               __field(u8,                     locked          )
+               __field(u8,                     self_read_count )
+               __field(u8,                     self_intent_count)
+               __field(u8,                     read_count      )
+               __field(u8,                     intent_count    )
                __field(u32,                    iter_lock_seq   )
                __field(u32,                    node_lock_seq   )
        ),
 
        TP_fast_assign(
-               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               struct six_lock_count c;
+
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = btree_id;
-               __entry->pos_inode              = pos->inode;
-               __entry->pos_offset             = pos->offset;
-               __entry->pos_snapshot           = pos->snapshot;
-               __entry->node                   = node;
-               __entry->iter_lock_seq          = iter_lock_seq;
-               __entry->node_lock_seq          = node_lock_seq;
+               __entry->btree_id               = path->btree_id;
+               __entry->level                  = level;
+               TRACE_BPOS_assign(pos, path->pos);
+               __entry->locked                 = btree_node_locked(path, level);
+
+               c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+               __entry->self_read_count        = c.n[SIX_LOCK_read];
+               __entry->self_intent_count      = c.n[SIX_LOCK_intent];
+               c = six_lock_counts(&path->l[level].b->c.lock);
+               __entry->read_count             = c.n[SIX_LOCK_read];
+               __entry->intent_count           = c.n[SIX_LOCK_intent];
+               __entry->iter_lock_seq          = path->l[level].lock_seq;
+               __entry->node_lock_seq          = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
        ),
 
-       TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 __entry->btree_id,
+                 bch2_btree_ids[__entry->btree_id],
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot,
-                 __entry->node,
+                 __entry->level,
+                 __entry->locked,
+                 __entry->self_read_count,
+                 __entry->self_intent_count,
+                 __entry->read_count,
+                 __entry->intent_count,
                  __entry->iter_lock_seq,
                  __entry->node_lock_seq)
 );
 
 /* Garbage collection */
 
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, gc_start,
+DEFINE_EVENT(bch_fs, gc_gens_start,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
 
-DEFINE_EVENT(bch_fs, gc_end,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
+DEFINE_EVENT(bch_fs, gc_gens_end,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
 
 /* Allocator */
 
-TRACE_EVENT(alloc_scan,
-       TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
-       TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
+TRACE_EVENT(bucket_alloc,
+       TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+                bool user, u64 bucket),
+       TP_ARGS(ca, alloc_reserve, user, bucket),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev             )
-               __field(u64,            found           )
-               __field(u64,            inc_gen         )
-               __field(u64,            inc_gen_skipped )
+               __field(dev_t,                  dev     )
+               __array(char,   reserve,        16      )
+               __field(bool,                   user    )
+               __field(u64,                    bucket  )
        ),
 
        TP_fast_assign(
                __entry->dev            = ca->dev;
-               __entry->found          = found;
-               __entry->inc_gen        = inc_gen;
-               __entry->inc_gen_skipped = inc_gen_skipped;
+               strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+               __entry->user           = user;
+               __entry->bucket         = bucket;
        ),
 
-       TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
+       TP_printk("%d,%d reserve %s user %u bucket %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
-);
+                 __entry->reserve,
+                 __entry->user,
+                 __entry->bucket)
+);
+
+TRACE_EVENT(bucket_alloc_fail,
+       TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+                u64 free,
+                u64 avail,
+                u64 copygc_wait_amount,
+                s64 copygc_waiting_for,
+                u64 seen,
+                u64 open,
+                u64 need_journal_commit,
+                u64 nouse,
+                bool nonblocking,
+                const char *err),
+       TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for,
+               seen, open, need_journal_commit, nouse, nonblocking, err),
 
-TRACE_EVENT(invalidate,
-       TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
-       TP_ARGS(ca, offset, sectors),
+       TP_STRUCT__entry(
+               __field(dev_t,                  dev                     )
+               __array(char,   reserve,        16                      )
+               __field(u64,                    free                    )
+               __field(u64,                    avail                   )
+               __field(u64,                    copygc_wait_amount      )
+               __field(s64,                    copygc_waiting_for      )
+               __field(u64,                    seen                    )
+               __field(u64,                    open                    )
+               __field(u64,                    need_journal_commit     )
+               __field(u64,                    nouse                   )
+               __field(bool,                   nonblocking             )
+               __array(char,                   err,    32              )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = ca->dev;
+               strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+               __entry->free           = free;
+               __entry->avail          = avail;
+               __entry->copygc_wait_amount     = copygc_wait_amount;
+               __entry->copygc_waiting_for     = copygc_waiting_for;
+               __entry->seen           = seen;
+               __entry->open           = open;
+               __entry->need_journal_commit = need_journal_commit;
+               __entry->nouse          = nouse;
+               __entry->nonblocking    = nonblocking;
+               strlcpy(__entry->err, err, sizeof(__entry->err));
+       ),
+
+       TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->reserve,
+                 __entry->free,
+                 __entry->avail,
+                 __entry->copygc_wait_amount,
+                 __entry->copygc_waiting_for,
+                 __entry->seen,
+                 __entry->open,
+                 __entry->need_journal_commit,
+                 __entry->nouse,
+                 __entry->nonblocking,
+                 __entry->err)
+);
+
+TRACE_EVENT(discard_buckets,
+       TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+                u64 need_journal_commit, u64 discarded, const char *err),
+       TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
 
        TP_STRUCT__entry(
-               __field(unsigned,       sectors                 )
                __field(dev_t,          dev                     )
-               __field(__u64,          offset                  )
+               __field(u64,            seen                    )
+               __field(u64,            open                    )
+               __field(u64,            need_journal_commit     )
+               __field(u64,            discarded               )
+               __array(char,           err,    16              )
        ),
 
        TP_fast_assign(
-               __entry->dev            = ca->dev;
-               __entry->offset         = offset,
-               __entry->sectors        = sectors;
+               __entry->dev                    = c->dev;
+               __entry->seen                   = seen;
+               __entry->open                   = open;
+               __entry->need_journal_commit    = need_journal_commit;
+               __entry->discarded              = discarded;
+               strlcpy(__entry->err, err, sizeof(__entry->err));
        ),
 
-       TP_printk("invalidated %u sectors at %d,%d sector=%llu",
-                 __entry->sectors,
-                 MAJOR(__entry->dev),
-                 MINOR(__entry->dev),
-                 __entry->offset)
+       TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->seen,
+                 __entry->open,
+                 __entry->need_journal_commit,
+                 __entry->discarded,
+                 __entry->err)
 );
 
-DECLARE_EVENT_CLASS(bucket_alloc,
-       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
-       TP_ARGS(ca, reserve),
+TRACE_EVENT(bucket_invalidate,
+       TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+       TP_ARGS(c, dev, bucket, sectors),
 
        TP_STRUCT__entry(
-               __field(dev_t,                  dev     )
-               __field(enum alloc_reserve,     reserve )
+               __field(dev_t,          dev                     )
+               __field(u32,            dev_idx                 )
+               __field(u32,            sectors                 )
+               __field(u64,            bucket                  )
        ),
 
        TP_fast_assign(
-               __entry->dev            = ca->dev;
-               __entry->reserve        = reserve;
+               __entry->dev            = c->dev;
+               __entry->dev_idx        = dev;
+               __entry->sectors        = sectors;
+               __entry->bucket         = bucket;
        ),
 
-       TP_printk("%d,%d reserve %d",
+       TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->reserve)
+                 __entry->dev_idx, __entry->bucket,
+                 __entry->sectors)
 );
 
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
-       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
-       TP_ARGS(ca, reserve)
-);
+/* Moving IO */
 
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
-       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
-       TP_ARGS(ca, reserve)
+DEFINE_EVENT(bkey, move_extent_read,
+       TP_PROTO(const struct bkey *k),
+       TP_ARGS(k)
 );
 
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
-       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
-       TP_ARGS(ca, reserve)
+DEFINE_EVENT(bkey, move_extent_write,
+       TP_PROTO(const struct bkey *k),
+       TP_ARGS(k)
 );
 
-/* Moving IO */
-
-DEFINE_EVENT(bkey, move_extent,
+DEFINE_EVENT(bkey, move_extent_finish,
        TP_PROTO(const struct bkey *k),
        TP_ARGS(k)
 );
 
-DEFINE_EVENT(bkey, move_alloc_fail,
+DEFINE_EVENT(bkey, move_extent_race,
        TP_PROTO(const struct bkey *k),
        TP_ARGS(k)
 );
 
-DEFINE_EVENT(bkey, move_race,
+DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
        TP_PROTO(const struct bkey *k),
        TP_ARGS(k)
 );
@@ -592,314 +753,300 @@ TRACE_EVENT(copygc_wait,
                  __entry->wait_amount, __entry->until)
 );
 
-DECLARE_EVENT_CLASS(transaction_restart,
-       TP_PROTO(const char *trans_fn,
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip),
+       TP_ARGS(trans, caller_ip),
 
        TP_STRUCT__entry(
-               __array(char,                   trans_fn, 24    )
+               __array(char,                   trans_fn, 32    )
                __field(unsigned long,          caller_ip       )
        ),
 
        TP_fast_assign(
-               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
        ),
 
        TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      transaction_restart_ip,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        transaction_commit,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_blocked_journal_reclaim,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_restart_injected,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_blocked_journal_reclaim,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_preres_get,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_restart_journal_res_get,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
-       TP_PROTO(const char *trans_fn,
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip,
+                unsigned flags),
+       TP_ARGS(trans, caller_ip, flags),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+               __field(unsigned,               flags           )
+       ),
+
+       TP_fast_assign(
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+               __entry->flags                  = flags;
+       ),
+
+       TP_printk("%s %pS %x", __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event,        trans_restart_journal_reclaim,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_fault_inject,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_restart_fault_inject,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_traverse_all,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_traverse_all,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_restart_mark_replicas,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_key_cache_raced,
-       TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event,        trans_restart_key_cache_raced,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip),
+       TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,        trans_restart_too_many_iters,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
-       TP_ARGS(trans_fn, caller_ip)
+       TP_ARGS(trans, caller_ip)
 );
 
 DECLARE_EVENT_CLASS(transaction_restart_iter,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos),
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path),
 
        TP_STRUCT__entry(
-               __array(char,                   trans_fn, 24    )
+               __array(char,                   trans_fn, 32    )
                __field(unsigned long,          caller_ip       )
                __field(u8,                     btree_id        )
-               __field(u64,                    pos_inode       )
-               __field(u64,                    pos_offset      )
-               __field(u32,                    pos_snapshot    )
+               TRACE_BPOS_entries(pos)
        ),
 
        TP_fast_assign(
-               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = btree_id;
-               __entry->pos_inode              = pos->inode;
-               __entry->pos_offset             = pos->offset;
-               __entry->pos_snapshot           = pos->snapshot;
+               __entry->btree_id               = path->btree_id;
+               TRACE_BPOS_assign(pos, path->pos)
        ),
 
-       TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 __entry->btree_id,
+                 bch2_btree_ids[__entry->btree_id],
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
-DEFINE_EVENT(transaction_restart_iter, trans_restart_mark,
-       TP_PROTO(const char *trans_fn,
+TRACE_EVENT(trans_restart_upgrade,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
-);
+                struct btree_path *path,
+                unsigned old_locks_want,
+                unsigned new_locks_want),
+       TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
 
-DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade,
-       TP_PROTO(const char *trans_fn,
-                unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
-);
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     btree_id        )
+               __field(u8,                     old_locks_want  )
+               __field(u8,                     new_locks_want  )
+               TRACE_BPOS_entries(pos)
+       ),
 
-DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade,
-       TP_PROTO(const char *trans_fn,
-                unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+       TP_fast_assign(
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = path->btree_id;
+               __entry->old_locks_want         = old_locks_want;
+               __entry->new_locks_want         = new_locks_want;
+               TRACE_BPOS_assign(pos, path->pos)
+       ),
+
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 bch2_btree_ids[__entry->btree_id],
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
+                 __entry->old_locks_want,
+                 __entry->new_locks_want)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event,        trans_restart_key_cache_upgrade,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip),
+       TP_ARGS(trans, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *pos),
-       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
 );
 
-TRACE_EVENT(trans_restart_would_deadlock,
-       TP_PROTO(const char *trans_fn,
-                unsigned long  caller_ip,
-                bool           in_traverse_all,
-                unsigned       reason,
-                enum btree_id  have_btree_id,
-                unsigned       have_iter_type,
-                struct bpos    *have_pos,
-                enum btree_id  want_btree_id,
-                unsigned       want_iter_type,
-                struct bpos    *want_pos),
-       TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason,
-               have_btree_id, have_iter_type, have_pos,
-               want_btree_id, want_iter_type, want_pos),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 24    )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     in_traverse_all )
-               __field(u8,                     reason          )
-               __field(u8,                     have_btree_id   )
-               __field(u8,                     have_iter_type  )
-               __field(u8,                     want_btree_id   )
-               __field(u8,                     want_iter_type  )
-
-               __field(u64,                    have_pos_inode  )
-               __field(u64,                    have_pos_offset )
-               __field(u32,                    have_pos_snapshot)
-               __field(u32,                    want_pos_snapshot)
-               __field(u64,                    want_pos_inode  )
-               __field(u64,                    want_pos_offset )
-       ),
-
-       TP_fast_assign(
-               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->in_traverse_all        = in_traverse_all;
-               __entry->reason                 = reason;
-               __entry->have_btree_id          = have_btree_id;
-               __entry->have_iter_type         = have_iter_type;
-               __entry->want_btree_id          = want_btree_id;
-               __entry->want_iter_type         = want_iter_type;
-
-               __entry->have_pos_inode         = have_pos->inode;
-               __entry->have_pos_offset        = have_pos->offset;
-               __entry->have_pos_snapshot      = have_pos->snapshot;
+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip,
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path)
+);
 
-               __entry->want_pos_inode         = want_pos->inode;
-               __entry->want_pos_offset        = want_pos->offset;
-               __entry->want_pos_snapshot      = want_pos->snapshot;
-       ),
+DEFINE_EVENT(transaction_event,        trans_restart_would_deadlock,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip),
+       TP_ARGS(trans, caller_ip)
+);
 
-       TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 __entry->in_traverse_all,
-                 __entry->reason,
-                 __entry->have_btree_id,
-                 __entry->have_iter_type,
-                 __entry->have_pos_inode,
-                 __entry->have_pos_offset,
-                 __entry->have_pos_snapshot,
-                 __entry->want_btree_id,
-                 __entry->want_iter_type,
-                 __entry->want_pos_inode,
-                 __entry->want_pos_offset,
-                 __entry->want_pos_snapshot)
+DEFINE_EVENT(transaction_event,        trans_restart_would_deadlock_recursion_limit,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip),
+       TP_ARGS(trans, caller_ip)
 );
 
 TRACE_EVENT(trans_restart_would_deadlock_write,
-       TP_PROTO(const char *trans_fn),
-       TP_ARGS(trans_fn),
+       TP_PROTO(struct btree_trans *trans),
+       TP_ARGS(trans),
 
        TP_STRUCT__entry(
-               __array(char,                   trans_fn, 24    )
+               __array(char,                   trans_fn, 32    )
        ),
 
        TP_fast_assign(
-               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
        ),
 
        TP_printk("%s", __entry->trans_fn)
 );
 
 TRACE_EVENT(trans_restart_mem_realloced,
-       TP_PROTO(const char *trans_fn,
+       TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
                 unsigned long bytes),
-       TP_ARGS(trans_fn, caller_ip, bytes),
+       TP_ARGS(trans, caller_ip, bytes),
 
        TP_STRUCT__entry(
-               __array(char,                   trans_fn, 24    )
+               __array(char,                   trans_fn, 32    )
                __field(unsigned long,          caller_ip       )
                __field(unsigned long,          bytes           )
        ),
 
        TP_fast_assign(
-               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
                __entry->caller_ip      = caller_ip;
                __entry->bytes          = bytes;
        ),
@@ -910,6 +1057,44 @@ TRACE_EVENT(trans_restart_mem_realloced,
                  __entry->bytes)
 );
 
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip,
+                struct btree_path *path,
+                unsigned old_u64s,
+                unsigned new_u64s),
+       TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+               __field(enum btree_id,          btree_id        )
+               TRACE_BPOS_entries(pos)
+               __field(u32,                    old_u64s        )
+               __field(u32,                    new_u64s        )
+       ),
+
+       TP_fast_assign(
+               strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+
+               __entry->btree_id       = path->btree_id;
+               TRACE_BPOS_assign(pos, path->pos);
+               __entry->old_u64s       = old_u64s;
+               __entry->new_u64s       = new_u64s;
+       ),
+
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 bch2_btree_ids[__entry->btree_id],
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
+                 __entry->old_u64s,
+                 __entry->new_u64s)
+);
+
 #endif /* _TRACE_BCACHE_H */
 
 /* This part must be outside protection */
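
For reference, the TRACE_BPOS_entries()/TRACE_BPOS_assign() helpers introduced at the top of this file are what let every event above record a full inode:offset:snapshot position with one line each. A hedged sketch of how a new event could reuse them, mirroring the bpos event class; "example_bpos" is a made-up event name, not one added by this patch:

TRACE_EVENT(example_bpos,
	TP_PROTO(const struct bpos *p),
	TP_ARGS(p),

	TP_STRUCT__entry(
		/* Expands to p_inode, p_offset and p_snapshot fields */
		TRACE_BPOS_entries(p)
	),

	TP_fast_assign(
		/* Copies inode, offset and snapshot from *p */
		TRACE_BPOS_assign(p, *p);
	),

	TP_printk("%llu:%llu:%u",
		  __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);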
index f78ebf04f4a8616969f7908b13b16c8132e644c1..4fe2c3db401a41afb2f79319986bb4819e31d310 100644 (file)
@@ -365,503 +365,6 @@ struct bch_sb *__bch2_super_read(int fd, u64 sector)
        return ret;
 }
 
-static unsigned get_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
-       struct bch_sb_field_replicas *replicas;
-       struct bch_replicas_entry *r;
-       unsigned i, data_has = 0;
-
-       replicas = bch2_sb_get_replicas(sb);
-
-       if (replicas)
-               for_each_replicas_entry(replicas, r)
-                       for (i = 0; i < r->nr_devs; i++)
-                               if (r->devs[i] == dev)
-                                       data_has |= 1 << r->data_type;
-
-       return data_has;
-}
-
-static int bch2_sb_get_target(struct bch_sb *sb, char *buf, size_t len, u64 v)
-{
-       struct target t = target_decode(v);
-       int ret;
-
-       switch (t.type) {
-       case TARGET_NULL:
-               return scnprintf(buf, len, "none");
-       case TARGET_DEV: {
-               struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-               struct bch_member *m = mi->members + t.dev;
-
-               if (bch2_dev_exists(sb, mi, t.dev)) {
-                       char uuid_str[40];
-
-                       uuid_unparse(m->uuid.b, uuid_str);
-
-                       ret = scnprintf(buf, len, "Device %u (%s)", t.dev,
-                               uuid_str);
-               } else {
-                       ret = scnprintf(buf, len, "Bad device %u", t.dev);
-               }
-
-               break;
-       }
-       case TARGET_GROUP: {
-               struct bch_sb_field_disk_groups *gi;
-               gi = bch2_sb_get_disk_groups(sb);
-
-               struct bch_disk_group *g = gi->entries + t.group;
-
-               if (t.group < disk_groups_nr(gi) && !BCH_GROUP_DELETED(g)) {
-                       ret = scnprintf(buf, len, "Label %u (%.*s)", t.group,
-                               BCH_SB_LABEL_SIZE, g->label);
-               } else {
-                       ret = scnprintf(buf, len, "Bad label %u", t.group);
-               }
-               break;
-       }
-       default:
-               BUG();
-       }
-
-       return ret;
-}
-
-/* superblock printing: */
-
-static void bch2_sb_print_layout(struct bch_sb *sb, enum units units)
-{
-       struct bch_sb_layout *l = &sb->layout;
-       unsigned i;
-
-       printf("  type:                         %u\n"
-              "  superblock max size:          %s\n"
-              "  nr superblocks:               %u\n"
-              "  Offsets:                      ",
-              l->layout_type,
-              pr_units(1 << l->sb_max_size_bits, units),
-              l->nr_superblocks);
-
-       for (i = 0; i < l->nr_superblocks; i++) {
-               if (i)
-                       printf(", ");
-               printf("%llu", le64_to_cpu(l->sb_offset[i]));
-       }
-       putchar('\n');
-}
-
-static void bch2_sb_print_journal(struct bch_sb *sb, struct bch_sb_field *f,
-                                 enum units units)
-{
-       struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       unsigned i, nr = bch2_nr_journal_buckets(journal);
-
-       printf("  Buckets:                      ");
-       for (i = 0; i < nr; i++) {
-               if (i)
-                       putchar(' ');
-               printf("%llu", le64_to_cpu(journal->buckets[i]));
-       }
-       putchar('\n');
-}
-
-static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
-                                 enum units units)
-{
-       struct bch_sb_field_members *mi = field_to_type(f, members);
-       struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
-       unsigned i;
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-               time_t last_mount = le64_to_cpu(m->last_mount);
-               char member_uuid_str[40];
-               char data_allowed_str[100];
-               char data_has_str[100];
-               char label [BCH_SB_LABEL_SIZE+10];
-               char time_str[64];
-
-               if (!bch2_member_exists(m))
-                       continue;
-
-               uuid_unparse(m->uuid.b, member_uuid_str);
-
-               if (BCH_MEMBER_GROUP(m)) {
-                       unsigned idx = BCH_MEMBER_GROUP(m) - 1;
-
-                       if (idx < disk_groups_nr(gi)) {
-                               scnprintf(label, sizeof(label), "%.*s (%u)",
-                                       BCH_SB_LABEL_SIZE,
-                                       gi->entries[idx].label, idx);
-                       } else {
-                               strcpy(label, "(bad disk labels section)");
-                       }
-               } else {
-                       strcpy(label, "(none)");
-               }
-
-               bch2_flags_to_text(&PBUF(data_allowed_str),
-                                  bch2_data_types,
-                                  BCH_MEMBER_DATA_ALLOWED(m));
-               if (!data_allowed_str[0])
-                       strcpy(data_allowed_str, "(none)");
-
-               bch2_flags_to_text(&PBUF(data_has_str),
-                                  bch2_data_types,
-                                  get_dev_has_data(sb, i));
-               if (!data_has_str[0])
-                       strcpy(data_has_str, "(none)");
-
-               if (last_mount) {
-                       struct tm *tm = localtime(&last_mount);
-                       size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
-                       if (!err)
-                               strcpy(time_str, "(formatting error)");
-               } else {
-                       strcpy(time_str, "(never)");
-               }
-
-               printf("  Device %u:\n"
-                      "    UUID:                       %s\n"
-                      "    Size:                       %s\n"
-                      "    Bucket size:                %s\n"
-                      "    First bucket:               %u\n"
-                      "    Buckets:                    %llu\n"
-                      "    Last mount:                 %s\n"
-                      "    State:                      %s\n"
-                      "    Group:                      %s\n"
-                      "    Data allowed:               %s\n"
-
-                      "    Has data:                   %s\n"
-
-                      "    Discard:                    %llu\n",
-                      i, member_uuid_str,
-                      pr_units(le16_to_cpu(m->bucket_size) *
-                               le64_to_cpu(m->nbuckets), units),
-                      pr_units(le16_to_cpu(m->bucket_size), units),
-                      le16_to_cpu(m->first_bucket),
-                      le64_to_cpu(m->nbuckets),
-                      time_str,
-
-                      BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
-                      ? bch2_member_states[BCH_MEMBER_STATE(m)]
-                      : "unknown",
-
-                      label,
-                      data_allowed_str,
-                      data_has_str,
-
-                      BCH_MEMBER_DISCARD(m));
-       }
-}
-
-static void bch2_sb_print_crypt(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum units units)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       printf("  KDF:                  %llu\n"
-              "  scrypt n:             %llu\n"
-              "  scrypt r:             %llu\n"
-              "  scrypt p:             %llu\n",
-              BCH_CRYPT_KDF_TYPE(crypt),
-              BCH_KDF_SCRYPT_N(crypt),
-              BCH_KDF_SCRYPT_R(crypt),
-              BCH_KDF_SCRYPT_P(crypt));
-}
-
-static void bch2_sb_print_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
-                                     enum units units)
-{
-       struct bch_sb_field_replicas_v0 *replicas = field_to_type(f, replicas_v0);
-       struct bch_replicas_entry_v0 *e;
-       unsigned i;
-
-       for_each_replicas_entry(replicas, e) {
-               printf_pad(32, "  %s:", bch2_data_types[e->data_type]);
-
-               putchar('[');
-               for (i = 0; i < e->nr_devs; i++) {
-                       if (i)
-                               putchar(' ');
-                       printf("%u", e->devs[i]);
-               }
-               printf("]\n");
-       }
-}
-
-static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f,
-                                  enum units units)
-{
-       struct bch_sb_field_replicas *replicas = field_to_type(f, replicas);
-       struct bch_replicas_entry *e;
-       unsigned i;
-
-       for_each_replicas_entry(replicas, e) {
-               printf_pad(32, "  %s: %u/%u",
-                          bch2_data_types[e->data_type],
-                          e->nr_required,
-                          e->nr_devs);
-
-               putchar('[');
-               for (i = 0; i < e->nr_devs; i++) {
-                       if (i)
-                               putchar(' ');
-                       printf("%u", e->devs[i]);
-               }
-               printf("]\n");
-       }
-}
-
-static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum units units)
-{
-}
-
-static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
-                                     enum units units)
-{
-}
-
-static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum units units)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
-
-       printf("  flags:       %x", le32_to_cpu(clean->flags));
-       printf("  journal seq: %llx", le64_to_cpu(clean->journal_seq));
-}
-
-static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
-                                               enum units units)
-{
-       struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist);
-       unsigned i, nr = blacklist_nr_entries(bl);
-
-       for (i = 0; i < nr; i++) {
-               struct journal_seq_blacklist_entry *e =
-                       bl->start + i;
-
-               printf("  %llu-%llu\n",
-                      le64_to_cpu(e->start),
-                      le64_to_cpu(e->end));
-       }
-}
-
-typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
-
-struct bch_sb_field_toolops {
-       sb_field_print_fn       print;
-};
-
-static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
-#define x(f, nr)                                       \
-       [BCH_SB_FIELD_##f] = {                          \
-               .print  = bch2_sb_print_##f,            \
-       },
-       BCH_SB_FIELDS()
-#undef x
-};
-
-static inline void bch2_sb_field_print(struct bch_sb *sb,
-                                      struct bch_sb_field *f,
-                                      enum units units)
-{
-       unsigned type = le32_to_cpu(f->type);
-
-       if (type < BCH_SB_FIELD_NR)
-               bch2_sb_field_ops[type].print(sb, f, units);
-       else
-               printf("(unknown field %u)\n", type);
-}
-
-void bch2_sb_print(struct bch_sb *sb, bool print_layout,
-                  unsigned fields, enum units units)
-{
-       struct bch_sb_field_members *mi;
-       char user_uuid_str[40], internal_uuid_str[40];
-       char features_str[500];
-       char compat_features_str[500];
-       char fields_have_str[200];
-       char label[BCH_SB_LABEL_SIZE + 1];
-       char time_str[64];
-       char foreground_str[64];
-       char background_str[64];
-       char promote_str[64];
-       char metadata_str[64];
-       struct bch_sb_field *f;
-       u64 fields_have = 0;
-       unsigned nr_devices = 0;
-       time_t time_base = le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC;
-
-       memcpy(label, sb->label, BCH_SB_LABEL_SIZE);
-       label[BCH_SB_LABEL_SIZE] = '\0';
-
-       uuid_unparse(sb->user_uuid.b, user_uuid_str);
-       uuid_unparse(sb->uuid.b, internal_uuid_str);
-
-       if (time_base) {
-               struct tm *tm = localtime(&time_base);
-               size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
-               if (!err)
-                       strcpy(time_str, "(formatting error)");
-       } else {
-               strcpy(time_str, "(not set)");
-       }
-
-       mi = bch2_sb_get_members(sb);
-       if (mi) {
-               struct bch_member *m;
-
-               for (m = mi->members;
-                    m < mi->members + sb->nr_devices;
-                    m++)
-                       nr_devices += bch2_member_exists(m);
-       }
-
-       bch2_sb_get_target(sb, foreground_str, sizeof(foreground_str),
-               BCH_SB_FOREGROUND_TARGET(sb));
-
-       bch2_sb_get_target(sb, background_str, sizeof(background_str),
-               BCH_SB_BACKGROUND_TARGET(sb));
-
-       bch2_sb_get_target(sb, promote_str, sizeof(promote_str),
-               BCH_SB_PROMOTE_TARGET(sb));
-
-       bch2_sb_get_target(sb, metadata_str, sizeof(metadata_str),
-               BCH_SB_METADATA_TARGET(sb));
-
-       bch2_flags_to_text(&PBUF(features_str),
-                          bch2_sb_features,
-                          le64_to_cpu(sb->features[0]));
-
-       bch2_flags_to_text(&PBUF(compat_features_str),
-                          bch2_sb_compat,
-                          le64_to_cpu(sb->compat[0]));
-
-       vstruct_for_each(sb, f)
-               fields_have |= 1 << le32_to_cpu(f->type);
-       bch2_flags_to_text(&PBUF(fields_have_str),
-                          bch2_sb_fields, fields_have);
-
-       printf("External UUID:                  %s\n"
-              "Internal UUID:                  %s\n"
-              "Device index:                   %u\n"
-              "Label:                          %s\n"
-              "Version:                        %u\n"
-              "Oldest version on disk:         %u\n"
-              "Created:                        %s\n"
-              "Sequence number:                %llu\n"
-              "Block_size:                     %s\n"
-              "Btree node size:                %s\n"
-              "Error action:                   %s\n"
-              "Clean:                          %llu\n"
-              "Features:                       %s\n"
-              "Compat features:                %s\n"
-
-              "Metadata replicas:              %llu\n"
-              "Data replicas:                  %llu\n"
-
-              "Metadata checksum type:         %s (%llu)\n"
-              "Data checksum type:             %s (%llu)\n"
-              "Compression type:               %s (%llu)\n"
-
-              "Foreground write target:        %s\n"
-              "Background write target:        %s\n"
-              "Promote target:                 %s\n"
-              "Metadata target:                %s\n"
-
-              "String hash type:               %s (%llu)\n"
-              "32 bit inodes:                  %llu\n"
-              "GC reserve percentage:          %llu%%\n"
-              "Root reserve percentage:        %llu%%\n"
-
-              "Devices:                        %u live, %u total\n"
-              "Sections:                       %s\n"
-              "Superblock size:                %llu\n",
-              user_uuid_str,
-              internal_uuid_str,
-              sb->dev_idx,
-              label,
-              le16_to_cpu(sb->version),
-              le16_to_cpu(sb->version_min),
-              time_str,
-              le64_to_cpu(sb->seq),
-              pr_units(le16_to_cpu(sb->block_size), units),
-              pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
-
-              BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR
-              ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
-              : "unknown",
-
-              BCH_SB_CLEAN(sb),
-              features_str,
-              compat_features_str,
-
-              BCH_SB_META_REPLICAS_WANT(sb),
-              BCH_SB_DATA_REPLICAS_WANT(sb),
-
-              BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR
-              ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)]
-              : "unknown",
-              BCH_SB_META_CSUM_TYPE(sb),
-
-              BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR
-              ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)]
-              : "unknown",
-              BCH_SB_DATA_CSUM_TYPE(sb),
-
-              BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR
-              ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)]
-              : "unknown",
-              BCH_SB_COMPRESSION_TYPE(sb),
-
-              foreground_str,
-              background_str,
-              promote_str,
-              metadata_str,
-
-              BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
-              ? bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)]
-              : "unknown",
-              BCH_SB_STR_HASH_TYPE(sb),
-
-              BCH_SB_INODE_32BIT(sb),
-              BCH_SB_GC_RESERVE(sb),
-              BCH_SB_ROOT_RESERVE(sb),
-
-              nr_devices, sb->nr_devices,
-              fields_have_str,
-              vstruct_bytes(sb));
-
-       if (print_layout) {
-               printf("\n"
-                      "Layout:\n");
-               bch2_sb_print_layout(sb, units);
-       }
-
-       vstruct_for_each(sb, f) {
-               unsigned type = le32_to_cpu(f->type);
-               char name[60];
-
-               if (!(fields & (1 << type)))
-                       continue;
-
-               if (type < BCH_SB_FIELD_NR) {
-                       scnprintf(name, sizeof(name), "%s", bch2_sb_fields[type]);
-                       name[0] = toupper(name[0]);
-               } else {
-                       scnprintf(name, sizeof(name), "(unknown field %u)", type);
-               }
-
-               printf("\n%s (size %llu):\n", name, vstruct_bytes(f));
-               if (type < BCH_SB_FIELD_NR)
-                       bch2_sb_field_print(sb, f, units);
-       }
-}
-
 /* ioctl interface: */
 
 /* Global control device: */
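The superblock-printing code removed above dispatched per-field printers through an x-macro-generated ops table keyed by the field type. A minimal self-contained sketch of that pattern, with a hypothetical two-entry field list standing in for the real BCH_SB_FIELDS() list:

	#include <stdio.h>

	#define MY_FIELDS()	x(journal) x(members)

	enum my_field {
	#define x(f)	MY_FIELD_##f,
		MY_FIELDS()
	#undef x
		MY_FIELD_NR
	};

	static void print_journal(void) { printf("journal field\n"); }
	static void print_members(void) { printf("members field\n"); }

	/* one print hook per field, generated from the same x-macro list */
	static void (* const my_field_print[])(void) = {
	#define x(f)	[MY_FIELD_##f] = print_##f,
		MY_FIELDS()
	#undef x
	};

	/* dispatch, guarding against field types newer than this tool knows */
	static void my_field_dispatch(unsigned type)
	{
		if (type < MY_FIELD_NR)
			my_field_print[type]();
		else
			printf("(unknown field %u)\n", type);
	}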
@@ -1094,6 +597,7 @@ next:
 struct bch_opts bch2_parse_opts(struct bch_opt_strs strs)
 {
        struct bch_opts opts = bch2_opts_empty();
+       struct printbuf err = PRINTBUF;
        unsigned i;
        int ret;
        u64 v;
@@ -1103,17 +607,16 @@ struct bch_opts bch2_parse_opts(struct bch_opt_strs strs)
                    bch2_opt_table[i].type == BCH_OPT_FN)
                        continue;
 
-               ret = bch2_opt_parse(NULL, "option",
+               ret = bch2_opt_parse(NULL,
                                     &bch2_opt_table[i],
-                                    strs.by_id[i], &v);
+                                    strs.by_id[i], &v, &err);
                if (ret < 0)
-                       die("Invalid %s: %s",
-                           bch2_opt_table[i].attr.name,
-                           strerror(-ret));
+                       die("Invalid option %s", err.buf);
 
                bch2_opt_set_by_id(&opts, i, v);
        }
 
+       printbuf_exit(&err);
        return opts;
 }
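In the hunk above, option-parse failures are now reported through a caller-owned struct printbuf instead of an errno string. A minimal sketch of that convention, using only calls visible in this diff (bch2_opt_parse(), bch2_opt_set_by_id(), PRINTBUF, printbuf_exit(), die()); the helper name and the example value are hypothetical:

	static void set_one_opt_or_die(struct bch_opts *opts, unsigned id,
				       const char *val)
	{
		struct printbuf err = PRINTBUF;
		u64 v;

		/* on failure, a human-readable reason is left in err.buf */
		if (bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v, &err) < 0)
			die("Invalid option %s", err.buf);

		bch2_opt_set_by_id(opts, id, v);
		printbuf_exit(&err);
	}

	/* e.g. set_one_opt_or_die(&opts, id, "lz4") for a compression option */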
 
@@ -1186,7 +689,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs)
        struct dirent *d;
        dev_names devs;
 
-       darray_init(devs);
+       darray_init(&devs);
 
        while ((errno = 0), (d = readdir(dir))) {
                struct dev_name n = { 0, NULL, NULL };
@@ -1210,7 +713,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs)
                n.label = read_file_str(fs.sysfs_fd, label_attr);
                free(label_attr);
 
-               darray_append(devs, n);
+               darray_push(&devs, n);
        }
 
        closedir(dir);
index ab4f0cd67fa079306a89c71cc34f059d55c8308c..17e8eef3d9998d037c2787e2c431647e5e17e933 100644 (file)
@@ -79,8 +79,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs,
 void bch2_super_write(int, struct bch_sb *);
 struct bch_sb *__bch2_super_read(int, u64);
 
-void bch2_sb_print(struct bch_sb *, bool, unsigned, enum units);
-
 /* ioctl interface: */
 
 int bcachectl_open(void);
@@ -239,7 +237,7 @@ struct dev_name {
        char            *label;
        uuid_le         uuid;
 };
-typedef darray(struct dev_name) dev_names;
+typedef DARRAY(struct dev_name) dev_names;
 
 dev_names bchu_fs_get_devices(struct bchfs_handle);
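These hunks switch from the old value-style darray macros (darray_init(devs), darray_append(devs, n)) to the new DARRAY()/darray_push() interface, which takes a pointer to the array and may reallocate the backing store. A minimal usage sketch, assuming the darray_for_each() and darray_exit() helpers that accompany the new darray.h (they do not appear in this hunk):

	dev_names devs;
	struct dev_name *dev, n = { 0, NULL, NULL };

	darray_init(&devs);
	darray_push(&devs, n);		/* appends n, growing the allocation as needed */

	darray_for_each(devs, dev)
		printf("%s\n", dev->label ?: "(no label)");

	darray_exit(&devs);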
 
index 5070caf8f349adbec532ae966346fe22e6fda142..9592541f7b5ce8159f77f70052a905e330125e78 100644 (file)
@@ -173,7 +173,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,
        bkey_xattr_init(&xattr->k_i);
        xattr->k.u64s           = u64s;
        xattr->v.x_type         = acl_to_xattr_type(type);
-       xattr->v.x_name_len     = 0,
+       xattr->v.x_name_len     = 0;
        xattr->v.x_val_len      = cpu_to_le16(acl_len);
 
        acl_header = xattr_val(&xattr->v);
@@ -236,7 +236,7 @@ retry:
                        &X_SEARCH(acl_to_xattr_type(type), "", 0),
                        0);
        if (ret) {
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
                if (ret != -ENOENT)
                        acl = ERR_PTR(ret);
@@ -335,7 +335,7 @@ retry:
 btree_err:
        bch2_trans_iter_exit(&trans, &inode_iter);
 
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err;
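The acl hunks above replace tests for a literal -EINTR with bch2_err_matches(ret, BCH_ERR_transaction_restart), so any error in the transaction-restart class causes a retry rather than being treated as a hard failure. A minimal sketch of the retry idiom, assuming bch2_trans_begin() (not shown in these hunks) and a hypothetical transactional body do_op():

	int ret;
	retry:
		bch2_trans_begin(&trans);

		ret = do_op(&trans);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;	/* lock-restart class: retry, not an error */
		if (ret)
			goto err;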
index 023db6219ad878f8106b439895adf7d8f362a358..796b9f5afe8c66c3067888f3f4426246b5a6e94d 100644 (file)
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "backpointers.h"
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_key_cache.h"
@@ -14,6 +15,7 @@
 #include "debug.h"
 #include "ec.h"
 #include "error.h"
+#include "lru.h"
 #include "recovery.h"
 #include "varint.h"
 
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-const char * const bch2_allocator_states[] = {
-#define x(n)   #n,
-       ALLOC_THREAD_STATES()
-#undef x
-       NULL
-};
+/* Persistent alloc info: */
 
 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
@@ -39,7 +36,17 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #undef x
 };
 
-/* Persistent alloc info: */
+struct bkey_alloc_unpacked {
+       u64             journal_seq;
+       u8              gen;
+       u8              oldest_gen;
+       u8              data_type;
+       bool            need_discard:1;
+       bool            need_inc_gen:1;
+#define x(_name, _bits)        u##_bits _name;
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
+};
 
 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
                                     const void **p, unsigned field)
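struct bkey_alloc_unpacked is now private to this file; its variable-width members are generated from BCH_ALLOC_FIELDS_V2() by the x() macro. As an illustration only (the real field list and bit widths are defined elsewhere), entries such as x(read_time, 64) and x(dirty_sectors, 32) would expand, via "#define x(_name, _bits) u##_bits _name;", to:

	u64	read_time;
	u32	dirty_sectors;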
@@ -161,6 +168,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;
+       out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+       out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
        out->journal_seq = le64_to_cpu(a.v->journal_seq);
 
 #define x(_name, _bits)                                                        \
@@ -182,53 +191,9 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
        return 0;
 }
 
-static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
-                              const struct bkey_alloc_unpacked src)
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
 {
-       struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
-       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-       u8 *out = a->v.data;
-       u8 *end = (void *) &dst[1];
-       u8 *last_nonzero_field = out;
-       unsigned bytes;
-
-       a->k.p          = POS(src.dev, src.bucket);
-       a->v.gen        = src.gen;
-       a->v.oldest_gen = src.oldest_gen;
-       a->v.data_type  = src.data_type;
-       a->v.journal_seq = cpu_to_le64(src.journal_seq);
-
-#define x(_name, _bits)                                                        \
-       nr_fields++;                                                    \
-                                                                       \
-       if (src._name) {                                                \
-               out += bch2_varint_encode_fast(out, src._name);         \
-                                                                       \
-               last_nonzero_field = out;                               \
-               last_nonzero_fieldnr = nr_fields;                       \
-       } else {                                                        \
-               *out++ = 0;                                             \
-       }
-
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-       BUG_ON(out > end);
-
-       out = last_nonzero_field;
-       a->v.nr_fields = last_nonzero_fieldnr;
-
-       bytes = (u8 *) out - (u8 *) &a->v;
-       set_bkey_val_bytes(&a->k, bytes);
-       memset_u64s_tail(&a->v, 0, bytes);
-}
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
-       struct bkey_alloc_unpacked ret = {
-               .dev    = k.k->p.inode,
-               .bucket = k.k->p.offset,
-               .gen    = 0,
-       };
+       struct bkey_alloc_unpacked ret = { .gen = 0 };
 
        switch (k.k->type) {
        case KEY_TYPE_alloc:
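With .dev and .bucket dropped from the unpacked struct, the bucket's identity comes solely from the alloc key's position, as the removed pack helper's POS(src.dev, src.bucket) suggests. A minimal sketch of reading it back from a key, using only bpos fields that appear throughout this diff:

	struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
	unsigned dev = k.k->p.inode;	/* device index */
	u64 bucket   = k.k->p.offset;	/* bucket within that device */

	printf("dev %u bucket %llu gen %u\n", dev, bucket, u.gen);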
@@ -245,653 +210,1189 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
        return ret;
 }
 
-struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
-                                      const struct bkey_alloc_unpacked src)
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
 {
-       struct bkey_alloc_buf *dst;
+       unsigned i, bytes = offsetof(struct bch_alloc, data);
 
-       dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
-       if (!IS_ERR(dst))
-               bch2_alloc_pack_v3(dst, src);
+       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
+               if (a->fields & (1 << i))
+                       bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
 
-       return dst;
+       return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
-                    struct bkey_alloc_unpacked *u, unsigned trigger_flags)
+int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
 {
-       struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
+       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+       /* allow for unknown fields */
+       if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
+               prt_printf(err, "incorrect value size (%zu < %u)",
+                      bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+               return -EINVAL;
+       }
 
-       return PTR_ERR_OR_ZERO(a) ?:
-               bch2_trans_update(trans, iter, &a->k, trigger_flags);
+       return 0;
 }
 
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
 {
-       unsigned i, bytes = offsetof(struct bch_alloc, data);
+       struct bkey_alloc_unpacked u;
 
-       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
-               if (a->fields & (1 << i))
-                       bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
+       if (bch2_alloc_unpack_v2(&u, k)) {
+               prt_printf(err, "unpack error");
+               return -EINVAL;
+       }
 
-       return DIV_ROUND_UP(bytes, sizeof(u64));
+       return 0;
 }
 
-const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
 {
-       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+       struct bkey_alloc_unpacked u;
 
-       if (k.k->p.inode >= c->sb.nr_devices ||
-           !c->devs[k.k->p.inode])
-               return "invalid device";
+       if (bch2_alloc_unpack_v3(&u, k)) {
+               prt_printf(err, "unpack error");
+               return -EINVAL;
+       }
 
-       /* allow for unknown fields */
-       if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
-               return "incorrect value size";
+       return 0;
+}
+
+int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
+{
+       struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+
+       if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
+               prt_printf(err, "bad val size (%lu != %u)",
+                      bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
+               return -EINVAL;
+       }
+
+       if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+           BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
+               prt_printf(err, "invalid backpointers_start");
+               return -EINVAL;
+       }
+
+       if (rw == WRITE) {
+               if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
+                       prt_printf(err, "invalid data type (got %u should be %u)",
+                              a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+                       return -EINVAL;
+               }
 
-       return NULL;
+               switch (a.v->data_type) {
+               case BCH_DATA_free:
+               case BCH_DATA_need_gc_gens:
+               case BCH_DATA_need_discard:
+                       if (a.v->dirty_sectors ||
+                           a.v->cached_sectors ||
+                           a.v->stripe) {
+                               prt_printf(err, "empty data type free but have data");
+                               return -EINVAL;
+                       }
+                       break;
+               case BCH_DATA_sb:
+               case BCH_DATA_journal:
+               case BCH_DATA_btree:
+               case BCH_DATA_user:
+               case BCH_DATA_parity:
+                       if (!a.v->dirty_sectors) {
+                               prt_printf(err, "data_type %s but dirty_sectors==0",
+                                      bch2_data_types[a.v->data_type]);
+                               return -EINVAL;
+                       }
+                       break;
+               case BCH_DATA_cached:
+                       if (!a.v->cached_sectors ||
+                           a.v->dirty_sectors ||
+                           a.v->stripe) {
+                               prt_printf(err, "data type inconsistency");
+                               return -EINVAL;
+                       }
+
+                       if (!a.v->io_time[READ] &&
+                           test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) {
+                               prt_printf(err, "cached bucket with read_time == 0");
+                               return -EINVAL;
+                       }
+                       break;
+               case BCH_DATA_stripe:
+                       if (!a.v->stripe) {
+                               prt_printf(err, "data_type %s but stripe==0",
+                                      bch2_data_types[a.v->data_type]);
+                               return -EINVAL;
+                       }
+                       break;
+               }
+       }
+
+       return 0;
 }
 
-const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static inline u64 swab40(u64 x)
 {
-       struct bkey_alloc_unpacked u;
+       return (((x & 0x00000000ffULL) << 32)|
+               ((x & 0x000000ff00ULL) << 16)|
+               ((x & 0x0000ff0000ULL) >>  0)|
+               ((x & 0x00ff000000ULL) >> 16)|
+               ((x & 0xff00000000ULL) >> 32));
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+       struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+       struct bch_backpointer *bp, *bps;
+
+       a->journal_seq          = swab64(a->journal_seq);
+       a->flags                = swab32(a->flags);
+       a->dirty_sectors        = swab32(a->dirty_sectors);
+       a->cached_sectors       = swab32(a->cached_sectors);
+       a->io_time[0]           = swab64(a->io_time[0]);
+       a->io_time[1]           = swab64(a->io_time[1]);
+       a->stripe               = swab32(a->stripe);
+       a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+
+       bps = alloc_v4_backpointers(a);
+       for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+               bp->bucket_offset       = swab40(bp->bucket_offset);
+               bp->bucket_len          = swab32(bp->bucket_len);
+               bch2_bpos_swab(&bp->pos);
+       }
+}
 
-       if (k.k->p.inode >= c->sb.nr_devices ||
-           !c->devs[k.k->p.inode])
-               return "invalid device";
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bch_alloc_v4 _a;
+       const struct bch_alloc_v4 *a = &_a;
+       const struct bch_backpointer *bps;
+       unsigned i;
 
-       if (bch2_alloc_unpack_v2(&u, k))
-               return "unpack error";
+       if (k.k->type == KEY_TYPE_alloc_v4)
+               a = bkey_s_c_to_alloc_v4(k).v;
+       else
+               bch2_alloc_to_v4(k, &_a);
+
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_printf(out, "gen %u oldest_gen %u data_type %s",
+              a->gen, a->oldest_gen, bch2_data_types[a->data_type]);
+       prt_newline(out);
+       prt_printf(out, "journal_seq       %llu",       a->journal_seq);
+       prt_newline(out);
+       prt_printf(out, "need_discard      %llu",       BCH_ALLOC_V4_NEED_DISCARD(a));
+       prt_newline(out);
+       prt_printf(out, "need_inc_gen      %llu",       BCH_ALLOC_V4_NEED_INC_GEN(a));
+       prt_newline(out);
+       prt_printf(out, "dirty_sectors     %u", a->dirty_sectors);
+       prt_newline(out);
+       prt_printf(out, "cached_sectors    %u", a->cached_sectors);
+       prt_newline(out);
+       prt_printf(out, "stripe            %u", a->stripe);
+       prt_newline(out);
+       prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
+       prt_newline(out);
+       prt_printf(out, "io_time[READ]     %llu",       a->io_time[READ]);
+       prt_newline(out);
+       prt_printf(out, "io_time[WRITE]    %llu",       a->io_time[WRITE]);
+       prt_newline(out);
+       prt_printf(out, "backpointers:     %llu",       BCH_ALLOC_V4_NR_BACKPOINTERS(a));
+       printbuf_indent_add(out, 2);
+
+       bps = alloc_v4_backpointers_c(a);
+       for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) {
+               prt_newline(out);
+               bch2_backpointer_to_text(out, &bps[i]);
+       }
 
-       return NULL;
+       printbuf_indent_sub(out, 4);
 }
 
-const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
+void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
 {
-       struct bkey_alloc_unpacked u;
+       if (k.k->type == KEY_TYPE_alloc_v4) {
+               int d;
+
+               *out = *bkey_s_c_to_alloc_v4(k).v;
+
+               d = (int) BCH_ALLOC_V4_U64s -
+                       (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0);
+               if (unlikely(d > 0)) {
+                       memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out),
+                              0,
+                              d * sizeof(u64));
+                       SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+               }
+       } else {
+               struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+               *out = (struct bch_alloc_v4) {
+                       .journal_seq            = u.journal_seq,
+                       .flags                  = u.need_discard,
+                       .gen                    = u.gen,
+                       .oldest_gen             = u.oldest_gen,
+                       .data_type              = u.data_type,
+                       .stripe_redundancy      = u.stripe_redundancy,
+                       .dirty_sectors          = u.dirty_sectors,
+                       .cached_sectors         = u.cached_sectors,
+                       .io_time[READ]          = u.read_time,
+                       .io_time[WRITE]         = u.write_time,
+                       .stripe                 = u.stripe,
+               };
+
+               SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+       }
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+       struct bkey_i_alloc_v4 *ret;
+       unsigned bytes = k.k->type == KEY_TYPE_alloc_v4
+               ? bkey_bytes(k.k)
+               : sizeof(struct bkey_i_alloc_v4);
+
+       /*
+        * Reserve space for one more backpointer here:
+        * Not sketchy at doing it this way, nope...
+        */
+       ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer));
+       if (IS_ERR(ret))
+               return ret;
 
-       if (k.k->p.inode >= c->sb.nr_devices ||
-           !c->devs[k.k->p.inode])
-               return "invalid device";
+       if (k.k->type == KEY_TYPE_alloc_v4) {
+               struct bch_backpointer *src, *dst;
 
-       if (bch2_alloc_unpack_v3(&u, k))
-               return "unpack error";
+               bkey_reassemble(&ret->k_i, k);
 
-       return NULL;
+               src = alloc_v4_backpointers(&ret->v);
+               SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+               dst = alloc_v4_backpointers(&ret->v);
+
+               memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) *
+                       sizeof(struct bch_backpointer));
+               memset(src, 0, dst - src);
+               set_alloc_v4_u64s(ret);
+       } else {
+               bkey_alloc_v4_init(&ret->k_i);
+               ret->k.p = k.k->p;
+               bch2_alloc_to_v4(k, &ret->v);
+       }
+       return ret;
 }
 
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
-                          struct bkey_s_c k)
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
 {
-       struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+       if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+           BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) {
+               /*
+                * Reserve space for one more backpointer here:
+                * Not sketchy at doing it this way, nope...
+                */
+               struct bkey_i_alloc_v4 *ret =
+                       bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer));
+               if (!IS_ERR(ret))
+                       bkey_reassemble(&ret->k_i, k);
+               return ret;
+       }
 
-       pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
-              u.gen, u.oldest_gen, bch2_data_types[u.data_type],
-              u.journal_seq);
-#define x(_name, ...)  pr_buf(out, " " #_name " %llu", (u64) u._name);
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
+       return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+       return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+                             struct bpos pos)
+{
+       struct bkey_s_c k;
+       struct bkey_i_alloc_v4 *a;
+       int ret;
+
+       bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+                            BTREE_ITER_WITH_UPDATES|
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret) {
+               bch2_trans_iter_exit(trans, iter);
+               return ERR_PTR(ret);
+       }
+
+       a = bch2_alloc_to_v4_mut_inlined(trans, k);
+       if (IS_ERR(a))
+               bch2_trans_iter_exit(trans, iter);
+       return a;
 }
 
-int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
+int bch2_alloc_read(struct bch_fs *c)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bch_alloc_v4 a;
        struct bch_dev *ca;
-       struct bucket *g;
-       struct bkey_alloc_unpacked u;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
+               /*
+                * Not a fsck error because this is checked/repaired by
+                * bch2_check_alloc_key() which runs later:
+                */
+               if (!bch2_dev_bucket_exists(c, k.k->p))
+                       continue;
+
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
-               g = __bucket(ca, k.k->p.offset, gc);
-               u = bch2_alloc_unpack(k);
-
-               if (!gc)
-                       *bucket_gen(ca, k.k->p.offset) = u.gen;
-
-               g->_mark.gen            = u.gen;
-               g->io_time[READ]        = u.read_time;
-               g->io_time[WRITE]       = u.write_time;
-               g->oldest_gen           = !gc ? u.oldest_gen : u.gen;
-               g->gen_valid            = 1;
-
-               if (!gc ||
-                   (metadata_only &&
-                    (u.data_type == BCH_DATA_user ||
-                     u.data_type == BCH_DATA_cached ||
-                     u.data_type == BCH_DATA_parity))) {
-                       g->_mark.data_type      = u.data_type;
-                       g->_mark.dirty_sectors  = u.dirty_sectors;
-                       g->_mark.cached_sectors = u.cached_sectors;
-                       g->_mark.stripe         = u.stripe != 0;
-                       g->stripe               = u.stripe;
-                       g->stripe_redundancy    = u.stripe_redundancy;
-               }
+               bch2_alloc_to_v4(k, &a);
 
+               *bucket_gen(ca, k.k->p.offset) = a.gen;
        }
        bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
        if (ret)
-               bch_err(c, "error reading alloc info: %i", ret);
+               bch_err(c, "error reading alloc info: %s", bch2_err_str(ret));
 
        return ret;
 }
 
-/* Bucket IO clocks: */
+/* Free space/discard btree: */
 
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-                             size_t bucket_nr, int rw)
+static int bch2_bucket_do_index(struct btree_trans *trans,
+                               struct bkey_s_c alloc_k,
+                               const struct bch_alloc_v4 *a,
+                               bool set)
 {
        struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_alloc_unpacked u;
-       u64 *time, now;
-       int ret = 0;
+       struct bkey_s_c old;
+       struct bkey_i *k;
+       enum btree_id btree;
+       enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+       enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
+       struct printbuf buf = PRINTBUF;
+       int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto out;
+       if (a->data_type != BCH_DATA_free &&
+           a->data_type != BCH_DATA_need_discard)
+               return 0;
 
-       u = bch2_alloc_unpack(k);
+       k = bch2_trans_kmalloc(trans, sizeof(*k));
+       if (IS_ERR(k))
+               return PTR_ERR(k);
 
-       time = rw == READ ? &u.read_time : &u.write_time;
-       now = atomic64_read(&c->io_clock[rw].now);
-       if (*time == now)
-               goto out;
+       bkey_init(&k->k);
+       k->k.type = new_type;
 
-       *time = now;
+       switch (a->data_type) {
+       case BCH_DATA_free:
+               btree = BTREE_ID_freespace;
+               k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
+               bch2_key_resize(&k->k, 1);
+               break;
+       case BCH_DATA_need_discard:
+               btree = BTREE_ID_need_discard;
+               k->k.p = alloc_k.k->p;
+               break;
+       default:
+               return 0;
+       }
 
-       ret   = bch2_alloc_write(trans, &iter, &u, 0) ?:
-               bch2_trans_commit(trans, NULL, NULL, 0);
-out:
+       bch2_trans_iter_init(trans, &iter, btree,
+                            bkey_start_pos(&k->k),
+                            BTREE_ITER_INTENT);
+       old = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(old);
+       if (ret)
+               goto err;
+
+       if (ca->mi.freespace_initialized &&
+           bch2_trans_inconsistent_on(old.k->type != old_type, trans,
+                       "incorrect key when %s %s btree (got %s should be %s)\n"
+                       "  for %s",
+                       set ? "setting" : "clearing",
+                       bch2_btree_ids[btree],
+                       bch2_bkey_types[old.k->type],
+                       bch2_bkey_types[old_type],
+                       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+               ret = -EIO;
+               goto err;
+       }
+
+       ret = bch2_trans_update(trans, &iter, k, 0);
+err:
        bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
        return ret;
 }
 
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
-                                      struct bucket_mark m)
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+                         enum btree_id btree_id, unsigned level,
+                         struct bkey_s_c old, struct bkey_i *new,
+                         unsigned flags)
 {
-       u8 gc_gen;
-
-       if (!is_available_bucket(m))
-               return false;
+       struct bch_fs *c = trans->c;
+       struct bch_alloc_v4 old_a, *new_a;
+       u64 old_lru, new_lru;
+       int ret = 0;
 
-       if (m.owned_by_allocator)
-               return false;
+       /*
+        * Deletion only happens in the device removal path, with
+        * BTREE_TRIGGER_NORUN:
+        */
+       BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
 
-       if (ca->buckets_nouse &&
-           test_bit(b, ca->buckets_nouse))
-               return false;
+       bch2_alloc_to_v4(old, &old_a);
+       new_a = &bkey_i_to_alloc_v4(new)->v;
 
-       if (ca->new_fs_bucket_idx) {
-               /*
-                * Device or filesystem is still being initialized, and we
-                * haven't fully marked superblocks & journal:
-                */
-               if (is_superblock_bucket(ca, b))
-                       return false;
+       new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
 
-               if (b < ca->new_fs_bucket_idx)
-                       return false;
+       if (new_a->dirty_sectors > old_a.dirty_sectors ||
+           new_a->cached_sectors > old_a.cached_sectors) {
+               new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+               new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+               SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+               SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
        }
 
-       gc_gen = bucket_gc_gen(bucket(ca, b));
-
-       ca->inc_gen_needs_gc            += gc_gen >= BUCKET_GC_GEN_MAX / 2;
-       ca->inc_gen_really_needs_gc     += gc_gen >= BUCKET_GC_GEN_MAX;
+       if (data_type_is_empty(new_a->data_type) &&
+           BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+           !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+               new_a->gen++;
+               SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+       }
 
-       return gc_gen < BUCKET_GC_GEN_MAX;
-}
+       if (old_a.data_type != new_a->data_type ||
+           (new_a->data_type == BCH_DATA_free &&
+            alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
+               ret =   bch2_bucket_do_index(trans, old, &old_a, false) ?:
+                       bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
+               if (ret)
+                       return ret;
+       }
 
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
+       if (new_a->data_type == BCH_DATA_cached &&
+           !new_a->io_time[READ])
+               new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
-                               u64 now, u64 last_seq_ondisk)
-{
-       unsigned used = m.cached_sectors;
+       old_lru = alloc_lru_idx(old_a);
+       new_lru = alloc_lru_idx(*new_a);
 
-       if (used) {
-               /*
-                * Prefer to keep buckets that have been read more recently, and
-                * buckets that have more data in them:
-                */
-               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
-               u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
+       if (old_lru != new_lru) {
+               ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
+                                     old_lru, &new_lru, old);
+               if (ret)
+                       return ret;
 
-               return -last_read_scaled;
-       } else {
-               /*
-                * Prefer to use buckets with smaller gc_gen so that we don't
-                * have to walk the btree and recalculate oldest_gen - but shift
-                * off the low bits so that buckets will still have equal sort
-                * keys when there's only a small difference, so that we can
-                * keep sequential buckets together:
-                */
-               return bucket_gc_gen(g) >> 4;
+               if (new_a->data_type == BCH_DATA_cached)
+                       new_a->io_time[READ] = new_lru;
        }
-}
 
-static inline int bucket_alloc_cmp(alloc_heap *h,
-                                  struct alloc_heap_entry l,
-                                  struct alloc_heap_entry r)
-{
-       return  cmp_int(l.key, r.key) ?:
-               cmp_int(r.nr, l.nr) ?:
-               cmp_int(l.bucket, r.bucket);
+       return 0;
 }
 
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
+static int bch2_check_alloc_key(struct btree_trans *trans,
+                               struct btree_iter *alloc_iter,
+                               struct btree_iter *discard_iter,
+                               struct btree_iter *freespace_iter)
 {
-       const struct alloc_heap_entry *l = _l, *r = _r;
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca;
+       struct bch_alloc_v4 a;
+       unsigned discard_key_type, freespace_key_type;
+       struct bkey_s_c alloc_k, k;
+       struct printbuf buf = PRINTBUF;
+       int ret;
 
-       return cmp_int(l->bucket, r->bucket);
-}
+       alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos)
+               ? bch2_btree_iter_peek_slot(alloc_iter)
+               : bch2_btree_iter_peek(alloc_iter);
+       if (!alloc_k.k)
+               return 1;
 
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct bucket_array *buckets;
-       struct alloc_heap_entry e = { 0 };
-       u64 now, last_seq_ondisk;
-       size_t b, i, nr = 0;
+       ret = bkey_err(alloc_k);
+       if (ret)
+               return ret;
 
-       down_read(&ca->bucket_lock);
+       if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+                       "alloc key for invalid device:bucket %llu:%llu",
+                       alloc_k.k->p.inode, alloc_k.k->p.offset))
+               return bch2_btree_delete_at(trans, alloc_iter, 0);
 
-       buckets = bucket_array(ca);
-       ca->alloc_heap.used = 0;
-       now = atomic64_read(&c->io_clock[READ].now);
-       last_seq_ondisk = c->journal.flushed_seq_ondisk;
+       ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+       if (!ca->mi.freespace_initialized)
+               return 0;
 
-       /*
-        * Find buckets with lowest read priority, by building a maxheap sorted
-        * by read priority and repeatedly replacing the maximum element until
-        * all buckets have been visited.
-        */
-       for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket *g = &buckets->b[b];
-               struct bucket_mark m = READ_ONCE(g->mark);
-               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+       bch2_alloc_to_v4(alloc_k, &a);
 
-               cond_resched();
+       discard_key_type = a.data_type == BCH_DATA_need_discard
+               ? KEY_TYPE_set : 0;
+       freespace_key_type = a.data_type == BCH_DATA_free
+               ? KEY_TYPE_set : 0;
 
-               if (!bch2_can_invalidate_bucket(ca, b, m))
-                       continue;
+       bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+       bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a));
 
-               if (!m.data_type &&
-                   bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-                                                    last_seq_ondisk,
-                                                    ca->dev_idx, b)) {
-                       ca->buckets_waiting_on_journal++;
-                       continue;
-               }
+       k = bch2_btree_iter_peek_slot(discard_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
 
-               if (e.nr && e.bucket + e.nr == b && e.key == key) {
-                       e.nr++;
-               } else {
-                       if (e.nr)
-                               heap_add_or_replace(&ca->alloc_heap, e,
-                                       -bucket_alloc_cmp, NULL);
-
-                       e = (struct alloc_heap_entry) {
-                               .bucket = b,
-                               .nr     = 1,
-                               .key    = key,
-                       };
-               }
+       if (k.k->type != discard_key_type &&
+           (c->opts.reconstruct_alloc ||
+            fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+                     "  %s",
+                     bch2_bkey_types[k.k->type],
+                     bch2_bkey_types[discard_key_type],
+                     (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+               struct bkey_i *update =
+                       bch2_trans_kmalloc(trans, sizeof(*update));
+
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                       goto err;
+
+               bkey_init(&update->k);
+               update->k.type  = discard_key_type;
+               update->k.p     = discard_iter->pos;
+
+               ret = bch2_trans_update(trans, discard_iter, update, 0);
+               if (ret)
+                       goto err;
        }
 
-       if (e.nr)
-               heap_add_or_replace(&ca->alloc_heap, e,
-                               -bucket_alloc_cmp, NULL);
+       k = bch2_btree_iter_peek_slot(freespace_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != freespace_key_type &&
+           (c->opts.reconstruct_alloc ||
+            fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+                     "  %s",
+                     bch2_bkey_types[k.k->type],
+                     bch2_bkey_types[freespace_key_type],
+                     (printbuf_reset(&buf),
+                      bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+               struct bkey_i *update =
+                       bch2_trans_kmalloc(trans, sizeof(*update));
+
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                       goto err;
 
-       for (i = 0; i < ca->alloc_heap.used; i++)
-               nr += ca->alloc_heap.data[i].nr;
+               bkey_init(&update->k);
+               update->k.type  = freespace_key_type;
+               update->k.p     = freespace_iter->pos;
+               bch2_key_resize(&update->k, 1);
 
-       while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
-               nr -= ca->alloc_heap.data[0].nr;
-               heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
+               ret = bch2_trans_update(trans, freespace_iter, update, 0);
+               if (ret)
+                       goto err;
        }
+err:
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+static int bch2_check_discard_freespace_key(struct btree_trans *trans,
+                                           struct btree_iter *iter)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter;
+       struct bkey_s_c alloc_k;
+       struct bch_alloc_v4 a;
+       u64 genbits;
+       struct bpos pos;
+       enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
+               ? BCH_DATA_need_discard
+               : BCH_DATA_free;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       pos = iter->pos;
+       pos.offset &= ~(~0ULL << 56);
+       genbits = iter->pos.offset & (~0ULL << 56);
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
 
-       up_read(&ca->bucket_lock);
+       if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+                       "entry in %s btree for nonexistent dev:bucket %llu:%llu",
+                       bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+               goto delete;
+
+       alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(alloc_k);
+       if (ret)
+               goto err;
+
+       bch2_alloc_to_v4(alloc_k, &a);
+
+       if (fsck_err_on(a.data_type != state ||
+                       (state == BCH_DATA_free &&
+                        genbits != alloc_freespace_genbits(a)), c,
+                       "%s\n  incorrectly set in %s index (free %u, genbits %llu should be %llu)",
+                       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+                       bch2_btree_ids[iter->btree_id],
+                       a.data_type == state,
+                       genbits >> 56, alloc_freespace_genbits(a) >> 56))
+               goto delete;
+out:
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
+delete:
+       ret = bch2_btree_delete_extent_at(trans, iter,
+                       iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0);
+       goto out;
 }
 
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+int bch2_check_alloc_info(struct bch_fs *c)
 {
-       size_t i, nr = 0;
+       struct btree_trans trans;
+       struct btree_iter iter, discard_iter, freespace_iter;
+       struct bkey_s_c k;
+       int ret = 0;
 
-       ca->inc_gen_needs_gc                    = 0;
-       ca->inc_gen_really_needs_gc             = 0;
-       ca->buckets_waiting_on_journal          = 0;
+       bch2_trans_init(&trans, c, 0, 0);
 
-       find_reclaimable_buckets_lru(c, ca);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+                            BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+                            BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+                            BTREE_ITER_PREFETCH);
+       while (1) {
+               ret = commit_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
+                       bch2_check_alloc_key(&trans, &iter,
+                                            &discard_iter,
+                                            &freespace_iter));
+               if (ret)
+                       break;
 
-       heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+               bch2_btree_iter_advance(&iter);
+       }
+       bch2_trans_iter_exit(&trans, &freespace_iter);
+       bch2_trans_iter_exit(&trans, &discard_iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
-       for (i = 0; i < ca->alloc_heap.used; i++)
-               nr += ca->alloc_heap.data[i].nr;
+       if (ret < 0)
+               goto err;
 
-       return nr;
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_need_discard, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+               bch2_check_discard_freespace_key(&trans, &iter)) ?:
+             for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_freespace, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+               bch2_check_discard_freespace_key(&trans, &iter));
+err:
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
 }
 
-static int bucket_invalidate_btree(struct btree_trans *trans,
-                                  struct bch_dev *ca, u64 b,
-                                  struct bkey_alloc_unpacked *u)
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+                                      struct btree_iter *alloc_iter)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct btree_iter lru_iter;
+       struct bch_alloc_v4 a;
+       struct bkey_s_c alloc_k, k;
+       struct printbuf buf = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-                            POS(ca->dev_idx, b),
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
+       alloc_k = bch2_btree_iter_peek(alloc_iter);
+       if (!alloc_k.k)
+               return 0;
 
-       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(alloc_k);
+       if (ret)
+               return ret;
+
+       bch2_alloc_to_v4(alloc_k, &a);
+
+       if (a.data_type != BCH_DATA_cached)
+               return 0;
+
+       bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+                            POS(alloc_k.k->p.inode, a.io_time[READ]), 0);
+
+       k = bch2_btree_iter_peek_slot(&lru_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       *u = bch2_alloc_unpack(k);
-       u->gen++;
-       u->data_type            = 0;
-       u->dirty_sectors        = 0;
-       u->cached_sectors       = 0;
-       u->read_time            = atomic64_read(&c->io_clock[READ].now);
-       u->write_time           = atomic64_read(&c->io_clock[WRITE].now);
+       if (fsck_err_on(!a.io_time[READ], c,
+                       "cached bucket with read_time 0\n"
+                       "  %s",
+               (printbuf_reset(&buf),
+                bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+           fsck_err_on(k.k->type != KEY_TYPE_lru ||
+                       le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
+                       "incorrect/missing lru entry\n"
+                       "  %s\n"
+                       "  %s",
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+                       (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+               u64 read_time = a.io_time[READ];
+
+               if (!a.io_time[READ])
+                       a.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+
+               ret = bch2_lru_set(trans,
+                                  alloc_k.k->p.inode,
+                                  alloc_k.k->p.offset,
+                                  &a.io_time[READ]);
+               if (ret)
+                       goto err;
+
+               if (a.io_time[READ] != read_time) {
+                       struct bkey_i_alloc_v4 *a_mut =
+                               bch2_alloc_to_v4_mut(trans, alloc_k);
+                       ret = PTR_ERR_OR_ZERO(a_mut);
+                       if (ret)
+                               goto err;
 
-       ret = bch2_alloc_write(trans, &iter, u,
-                              BTREE_TRIGGER_BUCKET_INVALIDATE);
+                       a_mut->v.io_time[READ] = a.io_time[READ];
+                       ret = bch2_trans_update(trans, alloc_iter,
+                                               &a_mut->k_i, BTREE_TRIGGER_NORUN);
+                       if (ret)
+                               goto err;
+               }
+       }
 err:
-       bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+       bch2_trans_iter_exit(trans, &lru_iter);
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf);
        return ret;
 }
 
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                     u64 *journal_seq, unsigned flags)
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 {
-       struct bkey_alloc_unpacked u;
-       size_t b;
-       u64 commit_seq = 0;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        int ret = 0;
 
-       /*
-        * If the read-only path is trying to shut down, we can't be generating
-        * new btree updates:
-        */
-       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
-               return 1;
+       bch2_trans_init(&trans, c, 0, 0);
 
-       BUG_ON(!ca->alloc_heap.used ||
-              !ca->alloc_heap.data[0].nr);
-       b = ca->alloc_heap.data[0].bucket;
+       for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+                       POS_MIN, BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+               bch2_check_alloc_to_lru_ref(&trans, &iter));
 
-       /* first, put on free_inc and mark as owned by allocator: */
-       percpu_down_read(&c->mark_lock);
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
+}
+
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+                                  struct btree_iter *need_discard_iter,
+                                  struct bpos *discard_pos_done,
+                                  u64 *seen,
+                                  u64 *open,
+                                  u64 *need_journal_commit,
+                                  u64 *discarded)
+{
+       struct bch_fs *c = trans->c;
+       struct bpos pos = need_discard_iter->pos;
+       struct btree_iter iter = { NULL };
+       struct bkey_s_c k;
+       struct bch_dev *ca;
+       struct bkey_i_alloc_v4 *a;
+       struct printbuf buf = PRINTBUF;
+       bool did_discard = false;
+       int ret = 0;
 
-       bch2_mark_alloc_bucket(c, ca, b, true);
+       ca = bch_dev_bkey_exists(c, pos.inode);
+       if (!percpu_ref_tryget(&ca->io_ref)) {
+               bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
+               return 0;
+       }
 
-       spin_lock(&c->freelist_lock);
-       verify_not_on_freelist(c, ca, b);
-       BUG_ON(!fifo_push(&ca->free_inc, b));
-       spin_unlock(&c->freelist_lock);
+       if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+               (*open)++;
+               goto out;
+       }
 
-       percpu_up_read(&c->mark_lock);
+       if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+                       c->journal.flushed_seq_ondisk,
+                       pos.inode, pos.offset)) {
+               (*need_journal_commit)++;
+               goto out;
+       }
 
-       ret = bch2_trans_do(c, NULL, &commit_seq,
-                           BTREE_INSERT_NOCHECK_RW|
-                           BTREE_INSERT_NOFAIL|
-                           BTREE_INSERT_JOURNAL_RESERVED|
-                           flags,
-                           bucket_invalidate_btree(&trans, ca, b, &u));
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            need_discard_iter->pos,
+                            BTREE_ITER_CACHED);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
 
-       if (!ret) {
-               /* remove from alloc_heap: */
-               struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+       a = bch2_alloc_to_v4_mut(trans, k);
+       ret = PTR_ERR_OR_ZERO(a);
+       if (ret)
+               goto out;
 
-               top->bucket++;
-               top->nr--;
+       if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+               a->v.gen++;
+               SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+               goto write;
+       }
 
-               if (!top->nr)
-                       heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+       if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans,
+                       "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
+                       "%s",
+                       a->v.journal_seq,
+                       c->journal.flushed_seq_ondisk,
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = -EIO;
+               goto out;
+       }
 
-               /*
-                * If we invalidating cached data then we need to wait on the
-                * journal commit:
-                */
-               if (u.data_type)
-                       *journal_seq = max(*journal_seq, commit_seq);
+       if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans,
+                       "bucket incorrectly set in need_discard btree\n"
+                       "%s",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = -EIO;
+               goto out;
+       }
 
+       if (bkey_cmp(*discard_pos_done, iter.pos) &&
+           ca->mi.discard && !c->opts.nochanges) {
                /*
-                * We already waiting on u.alloc_seq when we filtered out
-                * buckets that need journal commit:
+                * This works without any other locks because this is the only
+                * thread that removes items from the need_discard tree
                 */
-               BUG_ON(*journal_seq > u.journal_seq);
-       } else {
-               size_t b2;
+               bch2_trans_unlock(trans);
+               blkdev_issue_discard(ca->disk_sb.bdev,
+                                    k.k->p.offset * ca->mi.bucket_size,
+                                    ca->mi.bucket_size,
+                                    GFP_KERNEL);
 
-               /* remove from free_inc: */
-               percpu_down_read(&c->mark_lock);
-               spin_lock(&c->freelist_lock);
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       goto out;
+       }
 
-               bch2_mark_alloc_bucket(c, ca, b, false);
+       *discard_pos_done = iter.pos;
+       did_discard = true;
 
-               BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
-               BUG_ON(b != b2);
+       SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+       a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+write:
+       ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                                 BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL);
+       if (ret)
+               goto out;
 
-               spin_unlock(&c->freelist_lock);
-               percpu_up_read(&c->mark_lock);
+       if (did_discard) {
+               this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+               (*discarded)++;
        }
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       percpu_ref_put(&ca->io_ref);
+       printbuf_exit(&buf);
+       return ret;
+}
 
-       return ret < 0 ? ret : 0;
+static void bch2_do_discards_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+       struct bpos discard_pos_done = POS_MAX;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       /*
+        * We're doing the commit in bch2_discard_one_bucket instead of using
+        * for_each_btree_key_commit() so that we can increment counters after
+        * successful commit:
+        */
+       ret = for_each_btree_key2(&trans, iter,
+                       BTREE_ID_need_discard, POS_MIN, 0, k,
+               bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
+                                       &seen,
+                                       &open,
+                                       &need_journal_commit,
+                                       &discarded));
+
+       bch2_trans_exit(&trans);
+
+       if (need_journal_commit * 2 > seen)
+               bch2_journal_flush_async(&c->journal, NULL);
+
+       percpu_ref_put(&c->writes);
+
+       trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+                             bch2_err_str(ret));
+}
+
+void bch2_do_discards(struct bch_fs *c)
+{
+       if (percpu_ref_tryget_live(&c->writes) &&
+           !queue_work(system_long_wq, &c->discard_work))
+               percpu_ref_put(&c->writes);
 }
 
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
+static int invalidate_one_bucket(struct btree_trans *trans,
+                                struct btree_iter *lru_iter, struct bkey_s_c k,
+                                unsigned dev_idx, s64 *nr_to_invalidate)
 {
-       u64 journal_seq = 0;
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter = { NULL };
+       struct bkey_i_alloc_v4 *a;
+       struct bpos bucket;
+       struct printbuf buf = PRINTBUF;
+       unsigned cached_sectors;
        int ret = 0;
 
-       /* Only use nowait if we've already invalidated at least one bucket: */
-       while (!ret &&
-              !fifo_full(&ca->free_inc) &&
-              ca->alloc_heap.used) {
-               if (kthread_should_stop()) {
-                       ret = 1;
-                       break;
+       if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx)
+               return 1;
+
+       if (k.k->type != KEY_TYPE_lru) {
+               prt_printf(&buf, "non lru key in lru btree:\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+
+               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+               } else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       ret = -EINVAL;
                }
 
-               ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
-                               (!fifo_empty(&ca->free_inc)
-                                ? BTREE_INSERT_NOWAIT : 0));
-               /*
-                * We only want to batch up invalidates when they're going to
-                * require flushing the journal:
-                */
-               if (!journal_seq)
-                       break;
+               goto out;
        }
 
-       /* If we used NOWAIT, don't return the error: */
-       if (!fifo_empty(&ca->free_inc))
-               ret = 0;
-       if (ret < 0)
-               bch_err(ca, "error invalidating buckets: %i", ret);
+       bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx));
+
+       a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+       ret = PTR_ERR_OR_ZERO(a);
        if (ret)
-               return ret;
+               goto out;
 
-       if (journal_seq)
-               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
-       if (ret) {
-               bch_err(ca, "journal error: %i", ret);
-               return ret;
-       }
+       if (k.k->p.offset != alloc_lru_idx(a->v)) {
+               prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n  ");
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+               prt_printf(&buf, "\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
 
-       return 0;
-}
+               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+               } else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       ret = -EINVAL;
+               }
 
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
-       if (ca->allocator_state != new_state) {
-               ca->allocator_state = new_state;
-               closure_wake_up(&ca->fs->freelist_wait);
+               goto out;
        }
+
+       if (!a->v.cached_sectors)
+               bch_err(c, "invalidating empty bucket, confused");
+
+       cached_sectors = a->v.cached_sectors;
+
+       SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+       a->v.gen++;
+       a->v.data_type          = 0;
+       a->v.dirty_sectors      = 0;
+       a->v.cached_sectors     = 0;
+       a->v.io_time[READ]      = atomic64_read(&c->io_clock[READ].now);
+       a->v.io_time[WRITE]     = atomic64_read(&c->io_clock[WRITE].now);
+
+       ret =   bch2_trans_update(trans, &alloc_iter, &a->k_i,
+                               BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                                 BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL);
+       if (ret)
+               goto out;
+
+       trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+       --*nr_to_invalidate;
+out:
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
 }
 
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+static void bch2_do_invalidates_work(struct work_struct *work)
 {
+       struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+       struct bch_dev *ca;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        unsigned i;
        int ret = 0;
 
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++) {
-               /*
-                * Don't strand buckets on the copygc freelist until
-                * after recovery is finished:
-                */
-               if (i == RESERVE_MOVINGGC &&
-                   !test_bit(BCH_FS_STARTED, &c->flags))
-                       continue;
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_member_device(ca, c, i) {
+               s64 nr_to_invalidate =
+                       should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+
+               ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru,
+                               POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k,
+                       invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate));
 
-               if (fifo_push(&ca->free[i], b)) {
-                       fifo_pop(&ca->free_inc, b);
-                       ret = 1;
+               if (ret < 0) {
+                       percpu_ref_put(&ca->ref);
                        break;
                }
        }
-       spin_unlock(&c->freelist_lock);
 
-       ca->allocator_state = ret
-               ? ALLOCATOR_running
-               : ALLOCATOR_blocked_full;
-       closure_wake_up(&c->freelist_wait);
-       return ret;
+       bch2_trans_exit(&trans);
+       percpu_ref_put(&c->writes);
 }
 
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+void bch2_do_invalidates(struct bch_fs *c)
 {
-       if (ca->mi.discard &&
-           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
-               blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
-                                    ca->mi.bucket_size, GFP_NOFS, 0);
+       if (percpu_ref_tryget_live(&c->writes) &&
+           !queue_work(system_long_wq, &c->invalidate_work))
+               percpu_ref_put(&c->writes);
 }
 
-static bool allocator_thread_running(struct bch_dev *ca)
+static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter,
+                                struct bkey_s_c k, struct bch_dev *ca)
 {
-       unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
-               test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
-               ? ALLOCATOR_running
-               : ALLOCATOR_stopped;
-       alloc_thread_set_state(ca, state);
-       return state == ALLOCATOR_running;
-}
+       struct bch_alloc_v4 a;
 
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
-       s64 available = dev_buckets_reclaimable(ca) -
-               (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
-       bool ret = available > 0;
+       if (iter->pos.offset >= ca->mi.nbuckets)
+               return 1;
 
-       alloc_thread_set_state(ca, ret
-                              ? ALLOCATOR_running
-                              : ALLOCATOR_blocked);
-       return ret;
+       bch2_alloc_to_v4(k, &a);
+       return bch2_bucket_do_index(trans, k, &a, true);
 }
 
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct bch_dev *ca = arg;
-       struct bch_fs *c = ca->fs;
-       unsigned long gc_count = c->gc_count;
-       size_t nr;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_member *m;
        int ret;
 
-       set_freezable();
+       bch2_trans_init(&trans, c, 0, 0);
 
-       while (1) {
-               ret = kthread_wait_freezable(allocator_thread_running(ca));
-               if (ret)
-                       goto stop;
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+                       POS(ca->dev_idx, ca->mi.first_bucket),
+                       BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW,
+               bucket_freespace_init(&trans, &iter, k, ca));
 
-               while (!ca->alloc_heap.used) {
-                       cond_resched();
+       bch2_trans_exit(&trans);
 
-                       ret = kthread_wait_freezable(buckets_available(ca, gc_count));
-                       if (ret)
-                               goto stop;
-
-                       gc_count = c->gc_count;
-                       nr = find_reclaimable_buckets(c, ca);
-
-                       if (!nr && ca->buckets_waiting_on_journal) {
-                               ret = bch2_journal_flush(&c->journal);
-                               if (ret)
-                                       goto stop;
-                       } else if (nr < (ca->mi.nbuckets >> 6) &&
-                                  ca->buckets_waiting_on_journal >= nr / 2) {
-                               bch2_journal_flush_async(&c->journal, NULL);
-                       }
+       if (ret < 0) {
+               bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+               return ret;
+       }
 
-                       if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
-                            ca->inc_gen_really_needs_gc) &&
-                           c->gc_thread) {
-                               atomic_inc(&c->kick_gc);
-                               wake_up_process(c->gc_thread);
-                       }
+       mutex_lock(&c->sb_lock);
+       m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+       SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+       mutex_unlock(&c->sb_lock);
 
-                       trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-                                        ca->inc_gen_really_needs_gc);
-               }
+       return 0;
+}
 
-               ret = bch2_invalidate_buckets(c, ca);
-               if (ret)
-                       goto stop;
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int ret = 0;
+       bool doing_init = false;
 
-               while (!fifo_empty(&ca->free_inc)) {
-                       u64 b = fifo_peek(&ca->free_inc);
+       /*
+        * We can crash during the device add path, so we need to check this on
+        * every mount:
+        */
 
-                       discard_one_bucket(c, ca, b);
+       for_each_member_device(ca, c, i) {
+               if (ca->mi.freespace_initialized)
+                       continue;
 
-                       ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
-                       if (ret)
-                               goto stop;
+               if (!doing_init) {
+                       bch_info(c, "initializing freespace");
+                       doing_init = true;
+               }
+
+               ret = bch2_dev_freespace_init(c, ca);
+               if (ret) {
+                       percpu_ref_put(&ca->ref);
+                       return ret;
                }
        }
-stop:
-       alloc_thread_set_state(ca, ALLOCATOR_stopped);
-       return 0;
+
+       if (doing_init) {
+               mutex_lock(&c->sb_lock);
+               bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+
+               bch_verbose(c, "done initializing freespace");
+       }
+
+       return ret;
+}
+
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+                             size_t bucket_nr, int rw)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_i_alloc_v4 *a;
+       u64 now;
+       int ret = 0;
+
+       a = bch2_trans_start_alloc_update(trans, &iter,  POS(dev, bucket_nr));
+       ret = PTR_ERR_OR_ZERO(a);
+       if (ret)
+               return ret;
+
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (a->v.io_time[rw] == now)
+               goto out;
+
+       a->v.io_time[rw] = now;
+
+       ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
 }
 
 /* Startup/shutdown (ro/rw): */
@@ -902,7 +1403,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
        u64 capacity = 0, reserved_sectors = 0, gc_reserve;
        unsigned bucket_size_max = 0;
        unsigned long ra_pages = 0;
-       unsigned i, j;
+       unsigned i;
 
        lockdep_assert_held(&c->state_lock);
 
@@ -933,8 +1434,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
                 * allocations for foreground writes must wait -
                 * not -ENOSPC calculations.
                 */
-               for (j = 0; j < RESERVE_NONE; j++)
-                       dev_reserve += ca->free[j].size;
+
+               dev_reserve += ca->nr_btree_reserve * 2;
+               dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
 
                dev_reserve += 1;       /* btree write point */
                dev_reserve += 1;       /* copygc write point */
@@ -990,8 +1492,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
        unsigned i;
 
-       BUG_ON(ca->alloc_thread);
-
        /* First, remove device from allocation groups: */
 
        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1065,62 +1565,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
                        set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
-       if (ca->alloc_thread)
-               closure_wait_event(&c->freelist_wait,
-                                  ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
-       struct task_struct *p;
-
-       p = rcu_dereference_protected(ca->alloc_thread, 1);
-       ca->alloc_thread = NULL;
-
-       /*
-        * We need an rcu barrier between setting ca->alloc_thread = NULL and
-        * the thread shutting down to avoid bch2_wake_allocator() racing:
-        *
-        * XXX: it would be better to have the rcu barrier be asynchronous
-        * instead of blocking us here
-        */
-       synchronize_rcu();
-
-       if (p) {
-               kthread_stop(p);
-               put_task_struct(p);
-       }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
-       struct task_struct *p;
-
-       /*
-        * allocator thread already started?
-        */
-       if (ca->alloc_thread)
-               return 0;
-
-       p = kthread_create(bch2_allocator_thread, ca,
-                          "bch-alloc/%s", ca->name);
-       if (IS_ERR(p)) {
-               bch_err(ca->fs, "error creating allocator thread: %li",
-                       PTR_ERR(p));
-               return PTR_ERR(p);
-       }
-
-       get_task_struct(p);
-       rcu_assign_pointer(ca->alloc_thread, p);
-       wake_up_process(p);
-       return 0;
-}
-
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
+       INIT_WORK(&c->discard_work, bch2_do_discards_work);
+       INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
 }
index 98c7866e20b57ded9f8d629d8427d5966f97bfb5..044bc72992d4186d551da21ba63defa09334da5e 100644 (file)
 #include "debug.h"
 #include "super.h"
 
-extern const char * const bch2_allocator_states[];
-
-struct bkey_alloc_unpacked {
-       u64             journal_seq;
-       u64             bucket;
-       u8              dev;
-       u8              gen;
-       u8              oldest_gen;
-       u8              data_type;
-#define x(_name, _bits)        u##_bits _name;
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-};
-
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX      96U
 
-/* returns true if not equal */
-static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
-                                          struct bkey_alloc_unpacked r)
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
+{
+       struct bch_dev *ca;
+
+       if (!bch2_dev_exists2(c, pos.inode))
+               return false;
+
+       ca = bch_dev_bkey_exists(c, pos.inode);
+       return pos.offset >= ca->mi.first_bucket &&
+               pos.offset < ca->mi.nbuckets;
+}
+
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
+{
+       return a.gen - a.oldest_gen;
+}
+
+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
+                                                  u32 cached_sectors,
+                                                  u32 stripe,
+                                                  struct bch_alloc_v4 a,
+                                                  enum bch_data_type data_type)
+{
+       if (dirty_sectors)
+               return data_type;
+       if (stripe)
+               return BCH_DATA_stripe;
+       if (cached_sectors)
+               return BCH_DATA_cached;
+       if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+               return BCH_DATA_need_discard;
+       if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+               return BCH_DATA_need_gc_gens;
+       return BCH_DATA_free;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+                                                enum bch_data_type data_type)
+{
+       return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
+                                a.stripe, a, data_type);
+}
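+
+/*
+ * For example, a bucket that still holds both dirty and cached sectors
+ * reports the dirty data type; only when the dirty, stripe and cached sector
+ * counts are all zero does __alloc_data_type() fall through to need_discard
+ * (if the discard flag is still set), need_gc_gens (once gen - oldest_gen
+ * reaches BUCKET_GC_GEN_MAX), or free.
+ */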
+
+static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
+{
+       return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+}
+
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+       return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{
+       pos.offset |= alloc_freespace_genbits(a);
+       return pos;
+}
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+       unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+                       BCH_ALLOC_V4_U64s_V0) +
+               BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+               (sizeof(struct bch_backpointer) / sizeof(u64));
+
+       BUG_ON(ret > U8_MAX - BKEY_U64s);
+       return ret;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
 {
-       return  l.gen != r.gen                  ||
-               l.oldest_gen != r.oldest_gen    ||
-               l.data_type != r.data_type
-#define x(_name, ...)  || l._name != r._name
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-       ;
-}
-
-struct bkey_alloc_buf {
-       struct bkey_i   k;
-       struct bch_alloc_v3 v;
-
-#define x(_name,  _bits)               + _bits / 8
-       u8              _pad[0 + BCH_ALLOC_FIELDS_V2()];
-#undef  x
-} __attribute__((packed, aligned(8)));
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
-                                      const struct bkey_alloc_unpacked);
-int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
-                    struct bkey_alloc_unpacked *, unsigned);
+       set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
-const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+void bch2_alloc_v4_swab(struct bkey_s);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc (struct bkey_ops) {                \
        .key_invalid    = bch2_alloc_v1_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
+       .atomic_trigger = bch2_mark_alloc,              \
 }
 
 #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) {     \
        .key_invalid    = bch2_alloc_v2_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
+       .atomic_trigger = bch2_mark_alloc,              \
 }
 
 #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) {     \
        .key_invalid    = bch2_alloc_v3_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
+       .atomic_trigger = bch2_mark_alloc,              \
+}
+
+#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) {     \
+       .key_invalid    = bch2_alloc_v4_invalid,        \
+       .val_to_text    = bch2_alloc_to_text,           \
+       .swab           = bch2_alloc_v4_swab,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
+       .atomic_trigger = bch2_mark_alloc,              \
 }
 
 static inline bool bkey_is_alloc(const struct bkey *k)
@@ -84,44 +139,45 @@ static inline bool bkey_is_alloc(const struct bkey *k)
                k->type == KEY_TYPE_alloc_v3;
 }
 
-int bch2_alloc_read(struct bch_fs *, bool, bool);
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+                         struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_do_discards(struct bch_fs *);
 
-static inline void bch2_wake_allocator(struct bch_dev *ca)
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+                                           struct bch_dev_usage u)
 {
-       struct task_struct *p;
+       u64 want_free = ca->mi.nbuckets >> 7;
+       u64 free = max_t(s64, 0,
+                          u.d[BCH_DATA_free].buckets
+                        + u.d[BCH_DATA_need_discard].buckets
+                        - bch2_dev_buckets_reserved(ca, RESERVE_none));
 
-       rcu_read_lock();
-       p = rcu_dereference(ca->alloc_thread);
-       if (p)
-               wake_up_process(p);
-       rcu_read_unlock();
+       return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
 }
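
As a rough worked example (numbers chosen for illustration only): on a device with 2^20 buckets, want_free is 2^20 >> 7 = 8192; if the free plus need_discard buckets, minus the RESERVE_none reservation, come to 3000, should_invalidate_buckets() asks for up to 8192 - 3000 = 5192 invalidations, clamped to the number of cached buckets actually present.
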
 
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
-                                         size_t bucket)
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
 {
-       if (bch2_expensive_debug_checks) {
-               size_t iter;
-               long i;
-               unsigned j;
+       return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
 
-               for (j = 0; j < RESERVE_NR; j++)
-                       fifo_for_each_entry(i, &ca->free[j], iter)
-                               BUG_ON(i == bucket);
-               fifo_for_each_entry(i, &ca->free_inc, iter)
-                       BUG_ON(i == bucket);
-       }
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+       return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
 }
 
+int bch2_fs_freespace_init(struct bch_fs *);
+
 void bch2_recalc_capacity(struct bch_fs *);
 
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
index 9b81ed2665c8d93324d19dc1e5f8e5f0e4930eae..a9e0c7397292fcab2e6dbcac08b7dd335c6c0147 100644 (file)
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "error.h"
 #include "io.h"
+#include "journal.h"
+#include "movinggc.h"
 
 #include <linux/math64.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <trace/events/bcachefs.h>
 
+const char * const bch2_alloc_reserves[] = {
+#define x(t) #t,
+       BCH_ALLOC_RESERVES()
+#undef x
+       NULL
+};
+
 /*
  * Open buckets represent a bucket that's currently being allocated from.  They
  * serve two purposes:
@@ -78,7 +91,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
        percpu_down_read(&c->mark_lock);
        spin_lock(&ob->lock);
 
-       bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
        ob->valid = false;
        ob->data_type = 0;
 
@@ -168,49 +180,45 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
 {
        switch (reserve) {
-       case RESERVE_BTREE:
-       case RESERVE_BTREE_MOVINGGC:
+       case RESERVE_btree:
+       case RESERVE_btree_movinggc:
                return 0;
-       case RESERVE_MOVINGGC:
+       case RESERVE_movinggc:
                return OPEN_BUCKETS_COUNT / 4;
        default:
                return OPEN_BUCKETS_COUNT / 2;
        }
 }
 
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-                                     enum alloc_reserve reserve,
-                                     bool may_alloc_partial,
-                                     struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                             u64 bucket,
+                                             enum alloc_reserve reserve,
+                                             struct bch_alloc_v4 *a,
+                                             u64 *skipped_open,
+                                             u64 *skipped_need_journal_commit,
+                                             u64 *skipped_nouse,
+                                             struct closure *cl)
 {
        struct open_bucket *ob;
-       long b = 0;
 
-       spin_lock(&c->freelist_lock);
+       if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+               (*skipped_nouse)++;
+               return NULL;
+       }
 
-       if (may_alloc_partial) {
-               int i;
-
-               for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
-                       ob = c->open_buckets + ca->open_buckets_partial[i];
-
-                       if (reserve <= ob->alloc_reserve) {
-                               array_remove_item(ca->open_buckets_partial,
-                                                 ca->open_buckets_partial_nr,
-                                                 i);
-                               ob->on_partial_list = false;
-                               ob->alloc_reserve = reserve;
-                               spin_unlock(&c->freelist_lock);
-                               return ob;
-                       }
-               }
+       if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+               (*skipped_open)++;
+               return NULL;
+       }
+
+       if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+                       c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+               (*skipped_need_journal_commit)++;
+               return NULL;
        }
 
+       spin_lock(&c->freelist_lock);
+
        if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
                if (cl)
                        closure_wait(&c->open_buckets_wait, cl);
@@ -219,36 +227,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                        c->blocked_allocate_open_bucket = local_clock();
 
                spin_unlock(&c->freelist_lock);
-               trace_open_bucket_alloc_fail(ca, reserve);
-               return ERR_PTR(-OPEN_BUCKETS_EMPTY);
+               return ERR_PTR(-BCH_ERR_open_buckets_empty);
        }
 
-       if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
-               goto out;
-
-       switch (reserve) {
-       case RESERVE_BTREE_MOVINGGC:
-       case RESERVE_MOVINGGC:
-               if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
-                       goto out;
-               break;
-       default:
-               break;
+       /* Recheck under lock: */
+       if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+               spin_unlock(&c->freelist_lock);
+               (*skipped_open)++;
+               return NULL;
        }
 
-       if (cl)
-               closure_wait(&c->freelist_wait, cl);
-
-       if (!c->blocked_allocate)
-               c->blocked_allocate = local_clock();
-
-       spin_unlock(&c->freelist_lock);
-
-       trace_bucket_alloc_fail(ca, reserve);
-       return ERR_PTR(-FREELIST_EMPTY);
-out:
-       verify_not_on_freelist(c, ca, b);
-
        ob = bch2_open_bucket_alloc(c);
 
        spin_lock(&ob->lock);
@@ -257,8 +245,8 @@ out:
        ob->sectors_free = ca->mi.bucket_size;
        ob->alloc_reserve = reserve;
        ob->dev         = ca->dev_idx;
-       ob->gen         = *bucket_gen(ca, b);
-       ob->bucket      = b;
+       ob->gen         = a->gen;
+       ob->bucket      = bucket;
        spin_unlock(&ob->lock);
 
        ca->nr_open_buckets++;
@@ -280,9 +268,343 @@ out:
 
        spin_unlock(&c->freelist_lock);
 
-       bch2_wake_allocator(ca);
+       return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+                                           enum alloc_reserve reserve, u64 free_entry,
+                                           u64 *skipped_open,
+                                           u64 *skipped_need_journal_commit,
+                                           u64 *skipped_nouse,
+                                           struct bkey_s_c freespace_k,
+                                           struct closure *cl)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter = { NULL };
+       struct bkey_s_c k;
+       struct open_bucket *ob;
+       struct bch_alloc_v4 a;
+       u64 b = free_entry & ~(~0ULL << 56);
+       unsigned genbits = free_entry >> 56;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+               prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+                      "  freespace key ",
+                       ca->mi.first_bucket, ca->mi.nbuckets);
+               bch2_bkey_val_to_text(&buf, c, freespace_k);
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
+               ob = ERR_PTR(-EIO);
+               goto err;
+       }
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret) {
+               ob = ERR_PTR(ret);
+               goto err;
+       }
+
+       bch2_alloc_to_v4(k, &a);
+
+       if (genbits != (alloc_freespace_genbits(a) >> 56)) {
+               prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+                      "  freespace key ",
+                      genbits, alloc_freespace_genbits(a) >> 56);
+               bch2_bkey_val_to_text(&buf, c, freespace_k);
+               prt_printf(&buf, "\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
+               ob = ERR_PTR(-EIO);
+               goto err;
+
+       }
+
+       if (a.data_type != BCH_DATA_free) {
+               prt_printf(&buf, "non free bucket in freespace btree\n"
+                      "  freespace key ");
+               bch2_bkey_val_to_text(&buf, c, freespace_k);
+               prt_printf(&buf, "\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
+               ob = ERR_PTR(-EIO);
+               goto err;
+       }
+
+       ob = __try_alloc_bucket(c, ca, b, reserve, &a,
+                               skipped_open,
+                               skipped_need_journal_commit,
+                               skipped_nouse,
+                               cl);
+       if (!ob)
+               iter.path->preserve = false;
+err:
+       set_btree_iter_dontneed(&iter);
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
+       return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                                   enum alloc_reserve reserve)
+{
+       struct open_bucket *ob;
+       int i;
+
+       spin_lock(&c->freelist_lock);
+
+       for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+               ob = c->open_buckets + ca->open_buckets_partial[i];
+
+               if (reserve <= ob->alloc_reserve) {
+                       array_remove_item(ca->open_buckets_partial,
+                                         ca->open_buckets_partial_nr,
+                                         i);
+                       ob->on_partial_list = false;
+                       ob->alloc_reserve = reserve;
+                       spin_unlock(&c->freelist_lock);
+                       return ob;
+               }
+       }
+
+       spin_unlock(&c->freelist_lock);
+       return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+                       struct bch_dev *ca,
+                       enum alloc_reserve reserve,
+                       u64 *cur_bucket,
+                       u64 *buckets_seen,
+                       u64 *skipped_open,
+                       u64 *skipped_need_journal_commit,
+                       u64 *skipped_nouse,
+                       struct closure *cl)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct open_bucket *ob = NULL;
+       int ret;
+
+       *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket);
+       *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx);
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket),
+                          BTREE_ITER_SLOTS, k, ret) {
+               struct bch_alloc_v4 a;
+
+               if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+                       break;
+
+               if (ca->new_fs_bucket_idx &&
+                   is_superblock_bucket(ca, k.k->p.offset))
+                       continue;
+
+               bch2_alloc_to_v4(k, &a);
+
+               if (a.data_type != BCH_DATA_free)
+                       continue;
+
+               (*buckets_seen)++;
+
+               ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
+                                       skipped_open,
+                                       skipped_need_journal_commit,
+                                       skipped_nouse,
+                                       cl);
+               if (ob)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       *cur_bucket = iter.pos.offset;
+
+       return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found);
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+                                                  struct bch_dev *ca,
+                                                  enum alloc_reserve reserve,
+                                                  u64 *cur_bucket,
+                                                  u64 *buckets_seen,
+                                                  u64 *skipped_open,
+                                                  u64 *skipped_need_journal_commit,
+                                                  u64 *skipped_nouse,
+                                                  struct closure *cl)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct open_bucket *ob = NULL;
+       int ret;
+
+       BUG_ON(ca->new_fs_bucket_idx);
+
+       /*
+        * XXX:
+        * On transaction restart, we'd like to restart from the bucket we were
+        * at previously
+        */
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+                                    POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
+               if (k.k->p.inode != ca->dev_idx)
+                       break;
+
+               for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
+                    *cur_bucket < k.k->p.offset;
+                    (*cur_bucket)++) {
+                       ret = btree_trans_too_many_iters(trans);
+                       if (ret)
+                               break;
+
+                       (*buckets_seen)++;
+
+                       ob = try_alloc_bucket(trans, ca, reserve,
+                                             *cur_bucket,
+                                             skipped_open,
+                                             skipped_need_journal_commit,
+                                             skipped_nouse,
+                                             k, cl);
+                       if (ob)
+                               break;
+               }
+
+               if (ob || ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ob ?: ERR_PTR(ret);
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ *
+ * Returns an open_bucket on success, or an ERR_PTR() on failure
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+                                     struct bch_dev *ca,
+                                     enum alloc_reserve reserve,
+                                     bool may_alloc_partial,
+                                     struct closure *cl,
+                                     struct bch_dev_usage *usage)
+{
+       struct bch_fs *c = trans->c;
+       struct open_bucket *ob = NULL;
+       bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized);
+       u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor;
+       u64 avail;
+       u64 cur_bucket = start;
+       u64 buckets_seen = 0;
+       u64 skipped_open = 0;
+       u64 skipped_need_journal_commit = 0;
+       u64 skipped_nouse = 0;
+       bool waiting = false;
+again:
+       bch2_dev_usage_read_fast(ca, usage);
+       avail = dev_buckets_free(ca, *usage, reserve);
+
+       if (usage->d[BCH_DATA_need_discard].buckets > avail)
+               bch2_do_discards(c);
+
+       if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+               bch2_do_gc_gens(c);
+
+       if (should_invalidate_buckets(ca, *usage))
+               bch2_do_invalidates(c);
+
+       if (!avail) {
+               if (cl && !waiting) {
+                       closure_wait(&c->freelist_wait, cl);
+                       waiting = true;
+                       goto again;
+               }
+
+               if (!c->blocked_allocate)
+                       c->blocked_allocate = local_clock();
+
+               ob = ERR_PTR(-BCH_ERR_freelist_empty);
+               goto err;
+       }
+
+       if (waiting)
+               closure_wake_up(&c->freelist_wait);
+
+       if (may_alloc_partial) {
+               ob = try_alloc_partial_bucket(c, ca, reserve);
+               if (ob)
+                       return ob;
+       }
+
+       ob = likely(ca->mi.freespace_initialized)
+               ? bch2_bucket_alloc_freelist(trans, ca, reserve,
+                                       &cur_bucket,
+                                       &buckets_seen,
+                                       &skipped_open,
+                                       &skipped_need_journal_commit,
+                                       &skipped_nouse,
+                                       cl)
+               : bch2_bucket_alloc_early(trans, ca, reserve,
+                                       &cur_bucket,
+                                       &buckets_seen,
+                                       &skipped_open,
+                                       &skipped_need_journal_commit,
+                                       &skipped_nouse,
+                                       cl);
+
+       if (skipped_need_journal_commit * 2 > avail)
+               bch2_journal_flush_async(&c->journal, NULL);
+
+       if (!ob && !freespace_initialized && start) {
+               start = cur_bucket = 0;
+               goto again;
+       }
+
+       if (!freespace_initialized)
+               ca->bucket_alloc_trans_early_cursor = cur_bucket;
+err:
+       if (!ob)
+               ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+       if (!IS_ERR(ob))
+               trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve],
+                               may_alloc_partial, ob->bucket);
+       else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
+               trace_and_count(c, bucket_alloc_fail,
+                               ca, bch2_alloc_reserves[reserve],
+                               usage->d[BCH_DATA_free].buckets,
+                               avail,
+                               bch2_copygc_wait_amount(c),
+                               c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+                               buckets_seen,
+                               skipped_open,
+                               skipped_need_journal_commit,
+                               skipped_nouse,
+                               cl == NULL,
+                               bch2_err_str(PTR_ERR(ob)));
 
-       trace_bucket_alloc(ca, reserve);
+       return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+                                     enum alloc_reserve reserve,
+                                     bool may_alloc_partial,
+                                     struct closure *cl)
+{
+       struct bch_dev_usage usage;
+       struct open_bucket *ob;
+
+       bch2_trans_do(c, NULL, NULL, 0,
+                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+                                                       may_alloc_partial, cl, &usage)));
        return ob;
 }
 
@@ -309,11 +631,12 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
        return ret;
 }
 
-void bch2_dev_stripe_increment(struct bch_dev *ca,
-                              struct dev_stripe_state *stripe)
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+                              struct dev_stripe_state *stripe,
+                              struct bch_dev_usage *usage)
 {
        u64 *v = stripe->next_alloc + ca->dev_idx;
-       u64 free_space = dev_buckets_available(ca);
+       u64 free_space = dev_buckets_available(ca, RESERVE_none);
        u64 free_space_inv = free_space
                ? div64_u64(1ULL << 48, free_space)
                : 1ULL << 48;
@@ -329,6 +652,15 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
                *v = *v < scale ? 0 : *v - scale;
 }
 
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+                              struct dev_stripe_state *stripe)
+{
+       struct bch_dev_usage usage;
+
+       bch2_dev_usage_read_fast(ca, &usage);
+       bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
 #define BUCKET_MAY_ALLOC_PARTIAL       (1 << 0)
 #define BUCKET_ALLOC_USE_DURABILITY    (1 << 1)
 
@@ -351,7 +683,7 @@ static void add_new_bucket(struct bch_fs *c,
        ob_push(c, ptrs, ob);
 }
 
-int bch2_bucket_alloc_set(struct bch_fs *c,
+static int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
                      struct open_buckets *ptrs,
                      struct dev_stripe_state *stripe,
                      struct bch_devs_mask *devs_may_alloc,
@@ -362,46 +694,79 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
                      unsigned flags,
                      struct closure *cl)
 {
+       struct bch_fs *c = trans->c;
        struct dev_alloc_list devs_sorted =
                bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+       unsigned dev;
        struct bch_dev *ca;
-       int ret = -INSUFFICIENT_DEVICES;
+       int ret = -BCH_ERR_insufficient_devices;
        unsigned i;
 
        BUG_ON(*nr_effective >= nr_replicas);
 
        for (i = 0; i < devs_sorted.nr; i++) {
+               struct bch_dev_usage usage;
                struct open_bucket *ob;
 
-               ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+               dev = devs_sorted.devs[i];
+
+               rcu_read_lock();
+               ca = rcu_dereference(c->devs[dev]);
+               if (ca)
+                       percpu_ref_get(&ca->ref);
+               rcu_read_unlock();
+
                if (!ca)
                        continue;
 
-               if (!ca->mi.durability && *have_cache)
+               if (!ca->mi.durability && *have_cache) {
+                       percpu_ref_put(&ca->ref);
                        continue;
+               }
+
+               ob = bch2_bucket_alloc_trans(trans, ca, reserve,
+                               flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage);
+               if (!IS_ERR(ob))
+                       bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+               percpu_ref_put(&ca->ref);
 
-               ob = bch2_bucket_alloc(c, ca, reserve,
-                               flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
                if (IS_ERR(ob)) {
                        ret = PTR_ERR(ob);
-
-                       if (cl)
-                               return ret;
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+                               break;
                        continue;
                }
 
                add_new_bucket(c, ptrs, devs_may_alloc,
                               nr_effective, have_cache, flags, ob);
 
-               bch2_dev_stripe_increment(ca, stripe);
-
-               if (*nr_effective >= nr_replicas)
-                       return 0;
+               if (*nr_effective >= nr_replicas) {
+                       ret = 0;
+                       break;
+               }
        }
 
        return ret;
 }
 
+int bch2_bucket_alloc_set(struct bch_fs *c,
+                     struct open_buckets *ptrs,
+                     struct dev_stripe_state *stripe,
+                     struct bch_devs_mask *devs_may_alloc,
+                     unsigned nr_replicas,
+                     unsigned *nr_effective,
+                     bool *have_cache,
+                     enum alloc_reserve reserve,
+                     unsigned flags,
+                     struct closure *cl)
+{
+       return bch2_trans_do(c, NULL, NULL, 0,
+                     bch2_bucket_alloc_set_trans(&trans, ptrs, stripe,
+                                             devs_may_alloc, nr_replicas,
+                                             nr_effective, have_cache, reserve,
+                                             flags, cl));
+}
+
 /* Allocate from stripes: */
 
 /*
@@ -506,7 +871,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
        wp->ptrs = ptrs_skip;
 }
 
-static int open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct btree_trans *trans,
                        struct open_buckets *ptrs,
                        struct write_point *wp,
                        struct bch_devs_list *devs_have,
@@ -519,6 +884,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
                        unsigned flags,
                        struct closure *_cl)
 {
+       struct bch_fs *c = trans->c;
        struct bch_devs_mask devs;
        struct open_bucket *ob;
        struct closure *cl = NULL;
@@ -550,8 +916,9 @@ static int open_bucket_add_buckets(struct bch_fs *c,
                                                 target, erasure_code,
                                                 nr_replicas, nr_effective,
                                                 have_cache, flags, _cl);
-                       if (ret == -FREELIST_EMPTY ||
-                           ret == -OPEN_BUCKETS_EMPTY)
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+                           bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+                           bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
                                return ret;
                        if (*nr_effective >= nr_replicas)
                                return 0;
@@ -564,25 +931,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
        if (*nr_effective >= nr_replicas)
                return 0;
 
-       percpu_down_read(&c->mark_lock);
-       rcu_read_lock();
-
 retry_blocking:
        /*
         * Try nonblocking first, so that if one device is full we'll try from
         * other devices:
         */
-       ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+       ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
                                nr_replicas, nr_effective, have_cache,
                                reserve, flags, cl);
-       if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
+       if (ret &&
+           !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+           !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+           !cl && _cl) {
                cl = _cl;
                goto retry_blocking;
        }
 
-       rcu_read_unlock();
-       percpu_up_read(&c->mark_lock);
-
        return ret;
 }
 
@@ -696,15 +1060,25 @@ static bool try_decrease_writepoints(struct bch_fs *c,
        return true;
 }
 
-static struct write_point *writepoint_find(struct bch_fs *c,
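+/*
+ * If the mutex can't be taken immediately, drop btree node locks before
+ * blocking on it, so we aren't sleeping on a mutex while holding them:
+ */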
+static void bch2_trans_mutex_lock(struct btree_trans *trans,
+                                 struct mutex *lock)
+{
+       if (!mutex_trylock(lock)) {
+               bch2_trans_unlock(trans);
+               mutex_lock(lock);
+       }
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
                                           unsigned long write_point)
 {
+       struct bch_fs *c = trans->c;
        struct write_point *wp, *oldest;
        struct hlist_head *head;
 
        if (!(write_point & 1UL)) {
                wp = (struct write_point *) write_point;
-               mutex_lock(&wp->lock);
+               bch2_trans_mutex_lock(trans, &wp->lock);
                return wp;
        }
 
@@ -713,7 +1087,7 @@ restart_find:
        wp = __writepoint_find(head, write_point);
        if (wp) {
 lock_wp:
-               mutex_lock(&wp->lock);
+               bch2_trans_mutex_lock(trans, &wp->lock);
                if (wp->write_point == write_point)
                        goto out;
                mutex_unlock(&wp->lock);
@@ -726,8 +1100,8 @@ restart_find_oldest:
                if (!oldest || time_before64(wp->last_used, oldest->last_used))
                        oldest = wp;
 
-       mutex_lock(&oldest->lock);
-       mutex_lock(&c->write_points_hash_lock);
+       bch2_trans_mutex_lock(trans, &oldest->lock);
+       bch2_trans_mutex_lock(trans, &c->write_points_hash_lock);
        if (oldest >= c->write_points + c->write_points_nr ||
            try_increase_writepoints(c)) {
                mutex_unlock(&c->write_points_hash_lock);
@@ -748,14 +1122,14 @@ restart_find_oldest:
        hlist_add_head_rcu(&wp->node, head);
        mutex_unlock(&c->write_points_hash_lock);
 out:
-       wp->last_used = sched_clock();
+       wp->last_used = local_clock();
        return wp;
 }
 
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans,
                                unsigned target,
                                unsigned erasure_code,
                                struct write_point_specifier write_point,
@@ -766,6 +1140,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
                                unsigned flags,
                                struct closure *cl)
 {
+       struct bch_fs *c = trans->c;
        struct write_point *wp;
        struct open_bucket *ob;
        struct open_buckets ptrs;
@@ -785,7 +1160,7 @@ retry:
        write_points_nr = c->write_points_nr;
        have_cache      = false;
 
-       wp = writepoint_find(c, write_point.v);
+       wp = writepoint_find(trans, write_point.v);
 
        if (wp->data_type == BCH_DATA_user)
                ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -795,21 +1170,22 @@ retry:
                have_cache = true;
 
        if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+               ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
                                              target, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
                                              ob_flags, cl);
        } else {
-               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+               ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
                                              target, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
                                              ob_flags, NULL);
-               if (!ret)
+               if (!ret ||
+                   bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto alloc_done;
 
-               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+               ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
                                              0, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
@@ -821,7 +1197,7 @@ alloc_done:
        if (erasure_code && !ec_open_bucket(c, &ptrs))
                pr_debug("failed to get ec bucket: ret %u", ret);
 
-       if (ret == -INSUFFICIENT_DEVICES &&
+       if (ret == -BCH_ERR_insufficient_devices &&
            nr_effective >= nr_replicas_required)
                ret = 0;
 
@@ -852,19 +1228,46 @@ err:
 
        mutex_unlock(&wp->lock);
 
-       if (ret == -FREELIST_EMPTY &&
+       if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
            try_decrease_writepoints(c, write_points_nr))
                goto retry;
 
-       switch (ret) {
-       case -OPEN_BUCKETS_EMPTY:
-       case -FREELIST_EMPTY:
-               return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
-       case -INSUFFICIENT_DEVICES:
+       if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+           bch2_err_matches(ret, BCH_ERR_freelist_empty))
+               return cl
+                       ? ERR_PTR(-EAGAIN)
+                       : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc);
+
+       if (bch2_err_matches(ret, BCH_ERR_insufficient_devices))
                return ERR_PTR(-EROFS);
-       default:
-               BUG();
-       }
+
+       return ERR_PTR(ret);
+}
+
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+                               unsigned target,
+                               unsigned erasure_code,
+                               struct write_point_specifier write_point,
+                               struct bch_devs_list *devs_have,
+                               unsigned nr_replicas,
+                               unsigned nr_replicas_required,
+                               enum alloc_reserve reserve,
+                               unsigned flags,
+                               struct closure *cl)
+{
+       struct write_point *wp;
+
+       bch2_trans_do(c, NULL, NULL, 0,
+                     PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target,
+                                                       erasure_code,
+                                                       write_point,
+                                                       devs_have,
+                                                       nr_replicas,
+                                                       nr_replicas_required,
+                                                       reserve,
+                                                       flags, cl)));
+       return wp;
 }
 
 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
@@ -965,7 +1368,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
             wp < c->write_points + c->write_points_nr; wp++) {
                writepoint_init(wp, BCH_DATA_user);
 
-               wp->last_used   = sched_clock();
+               wp->last_used   = local_clock();
                wp->write_point = (unsigned long) wp;
                hlist_add_head_rcu(&wp->node,
                                   writepoint_hash(c, wp->write_point));
@@ -981,12 +1384,12 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
             ob++) {
                spin_lock(&ob->lock);
                if (ob->valid && !ob->on_partial_list) {
-                       pr_buf(out, "%zu ref %u type %s\n",
+                       prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n",
                               ob - c->open_buckets,
                               atomic_read(&ob->pin),
-                              bch2_data_types[ob->data_type]);
+                              bch2_data_types[ob->data_type],
+                              ob->dev, ob->bucket, ob->gen);
                }
                spin_unlock(&ob->lock);
        }
-
 }
index d466bda9afc8fdddb49f7b353c8c571b12f1fcf6..6de63a351fa881f2547bb01e0f5ee593bfbb8410 100644 (file)
@@ -12,6 +12,8 @@ struct bch_dev;
 struct bch_fs;
 struct bch_devs_list;
 
+extern const char * const bch2_alloc_reserves[];
+
 struct dev_alloc_list {
        unsigned        nr;
        u8              devs[BCH_SB_MEMBERS_MAX];
@@ -115,11 +117,33 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke
        return false;
 }
 
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+       bool ret;
+
+       if (bch2_bucket_is_open(c, dev, bucket))
+               return true;
+
+       spin_lock(&c->freelist_lock);
+       ret = bch2_bucket_is_open(c, dev, bucket);
+       spin_unlock(&c->freelist_lock);
+
+       return ret;
+}
+
 int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
                      struct dev_stripe_state *, struct bch_devs_mask *,
                      unsigned, unsigned *, bool *, enum alloc_reserve,
                      unsigned, struct closure *);
 
+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *,
+                                            unsigned, unsigned,
+                                            struct write_point_specifier,
+                                            struct bch_devs_list *,
+                                            unsigned, unsigned,
+                                            enum alloc_reserve,
+                                            unsigned,
+                                            struct closure *);
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
                                             unsigned, unsigned,
                                             struct write_point_specifier,
index 409232e3d99800ef652ce6fcd8b2cf0a2e6476b9..e078584d46f6562372bccc0303ed71109cb3bcee 100644 (file)
 
 struct ec_bucket_buf;
 
-#define ALLOC_THREAD_STATES()          \
-       x(stopped)                      \
-       x(running)                      \
-       x(blocked)                      \
-       x(blocked_full)
-
-enum allocator_states {
-#define x(n)   ALLOCATOR_##n,
-       ALLOC_THREAD_STATES()
-#undef x
-};
+#define BCH_ALLOC_RESERVES()           \
+       x(btree_movinggc)               \
+       x(btree)                        \
+       x(movinggc)                     \
+       x(none)
 
 enum alloc_reserve {
-       RESERVE_BTREE_MOVINGGC  = -2,
-       RESERVE_BTREE           = -1,
-       RESERVE_MOVINGGC        = 0,
-       RESERVE_NONE            = 1,
-       RESERVE_NR              = 2,
+#define x(name)        RESERVE_##name,
+       BCH_ALLOC_RESERVES()
+#undef x
 };
 
-typedef FIFO(long)     alloc_fifo;
-
 #define OPEN_BUCKETS_COUNT     1024
 
 #define WRITE_POINT_HASH_NR    32
@@ -53,14 +43,14 @@ struct open_bucket {
         * the block in the stripe this open_bucket corresponds to:
         */
        u8                      ec_idx;
-       enum bch_data_type      data_type:3;
+       enum bch_data_type      data_type:8;
        unsigned                valid:1;
        unsigned                on_partial_list:1;
-       int                     alloc_reserve:3;
+       unsigned                alloc_reserve:3;
 
-       unsigned                sectors_free;
        u8                      dev;
        u8                      gen;
+       u32                     sectors_free;
        u64                     bucket;
        struct ec_stripe_new    *ec;
 };
@@ -94,12 +84,4 @@ struct write_point_specifier {
        unsigned long           v;
 };
 
-struct alloc_heap_entry {
-       size_t                  bucket;
-       size_t                  nr;
-       unsigned long           key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
 #endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
new file mode 100644 (file)
index 0000000..d74de1d
--- /dev/null
@@ -0,0 +1,1128 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT                10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+                                          struct bpos bp_pos)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+       u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+       return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+                                          struct bpos bucket,
+                                          u64 bucket_offset)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+       struct bpos ret;
+
+       ret = POS(bucket.inode,
+                 (bucket_to_sector(ca, bucket.offset) <<
+                  MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+       BUG_ON(bkey_cmp(bucket, bp_pos_to_bucket(c, ret)));
+
+       return ret;
+}
+
+void bch2_extent_ptr_to_bp(struct bch_fs *c,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c k, struct extent_ptr_decoded p,
+                          struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+       enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
+       s64 sectors = level ? btree_sectors(c) : k.k->size;
+       u32 bucket_offset;
+
+       *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+       *bp = (struct bch_backpointer) {
+               .btree_id       = btree_id,
+               .level          = level,
+               .data_type      = data_type,
+               .bucket_offset  = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+                       p.crc.offset,
+               .bucket_len     = ptr_disk_sectors(sectors, p),
+               .pos            = k.k->p,
+       };
+}
+
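+/*
+ * Check whether @k has a non-cached pointer that maps to exactly this bucket
+ * and backpointer:
+ */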
+static bool extent_matches_bp(struct bch_fs *c,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c k,
+                             struct bpos bucket,
+                             struct bch_backpointer bp)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               struct bpos bucket2;
+               struct bch_backpointer bp2;
+
+               if (p.ptr.cached)
+                       continue;
+
+               bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+                                     &bucket2, &bp2);
+               if (!bpos_cmp(bucket, bucket2) &&
+                   !memcmp(&bp, &bp2, sizeof(bp)))
+                       return true;
+       }
+
+       return false;
+}
+
+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                            int rw, struct printbuf *err)
+{
+       struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+       struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+
+       if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) {
+               prt_str(err, "incorrect value size");
+               return -EINVAL;
+       }
+
+       if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
+               prt_str(err, "backpointer at wrong pos");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+       prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+              bch2_btree_ids[bp->btree_id],
+              bp->level,
+              (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+              (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+              bp->bucket_len);
+       bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+       struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+       bp.v->bucket_offset     = swab32(bp.v->bucket_offset);
+       bp.v->bucket_len        = swab32(bp.v->bucket_len);
+       bch2_bpos_swab(&bp.v->pos);
+}
+
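+/*
+ * Backpointer "offsets" below this limit refer to backpointers stored inline
+ * in the alloc key; offsets at or above it (minus BACKPOINTER_OFFSET_MAX)
+ * refer to keys in the backpointers btree:
+ */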
+#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1)
+
+static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r)
+{
+       return cmp_int(l.bucket_offset, r.bucket_offset);
+}
+
+static int bch2_backpointer_del_by_offset(struct btree_trans *trans,
+                                         struct bpos bucket,
+                                         u64 bp_offset,
+                                         struct bch_backpointer bp)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       if (bp_offset < BACKPOINTER_OFFSET_MAX) {
+               struct bch_backpointer *bps;
+               struct bkey_i_alloc_v4 *a;
+               unsigned i, nr;
+
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                                    bucket,
+                                    BTREE_ITER_INTENT|
+                                    BTREE_ITER_SLOTS|
+                                    BTREE_ITER_WITH_UPDATES);
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_alloc_v4) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               a = bch2_alloc_to_v4_mut(trans, k);
+               ret = PTR_ERR_OR_ZERO(a);
+               if (ret)
+                       goto err;
+               bps = alloc_v4_backpointers(&a->v);
+               nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+
+               for (i = 0; i < nr; i++) {
+                       if (bps[i].bucket_offset == bp_offset)
+                               goto found;
+                       if (bps[i].bucket_offset > bp_offset)
+                               break;
+               }
+
+               ret = -ENOENT;
+               goto err;
+found:
+               if (memcmp(&bps[i], &bp, sizeof(bp))) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+               array_remove_item(bps, nr, i);
+               SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+               set_alloc_v4_u64s(a);
+               ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+       } else {
+               bp_offset -= BACKPOINTER_OFFSET_MAX;
+
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers,
+                                    bucket_pos_to_bp(c, bucket, bp_offset),
+                                    BTREE_ITER_INTENT|
+                                    BTREE_ITER_SLOTS|
+                                    BTREE_ITER_WITH_UPDATES);
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_backpointer ||
+                   memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               ret = bch2_btree_delete_at(trans, &iter, 0);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_bucket_backpointer_del(struct btree_trans *trans,
+                               struct bkey_i_alloc_v4 *a,
+                               struct bch_backpointer bp,
+                               struct bkey_s_c orig_k)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
+       unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+       struct btree_iter bp_iter;
+       struct bkey_s_c k;
+       int ret;
+
+       for (i = 0; i < nr; i++) {
+               int cmp = backpointer_cmp(bps[i], bp) ?:
+                       memcmp(&bps[i], &bp, sizeof(bp));
+               if (!cmp)
+                       goto found;
+               if (cmp >= 0)
+                       break;
+       }
+
+       goto btree;
+found:
+       array_remove_item(bps, nr, i);
+       SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+       set_alloc_v4_u64s(a);
+       return 0;
+btree:
+       bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+                            bucket_pos_to_bp(c, a->k.p, bp.bucket_offset),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_SLOTS|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&bp_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_backpointer ||
+           memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
+               struct printbuf buf = PRINTBUF;
+
+               prt_printf(&buf, "backpointer not found when deleting");
+               prt_newline(&buf);
+               printbuf_indent_add(&buf, 2);
+
+               prt_printf(&buf, "searching for ");
+               bch2_backpointer_to_text(&buf, &bp);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "got ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               prt_newline(&buf);
+
+               prt_str(&buf, "alloc ");
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+               prt_newline(&buf);
+
+               prt_printf(&buf, "for ");
+               bch2_bkey_val_to_text(&buf, c, orig_k);
+
+               if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+               } else {
+                       ret = -EIO;
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+               }
+               printbuf_exit(&buf);
+               goto err;
+       }
+
+       ret = bch2_btree_delete_at(trans, &bp_iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       return ret;
+}
+
+int bch2_bucket_backpointer_add(struct btree_trans *trans,
+                               struct bkey_i_alloc_v4 *a,
+                               struct bch_backpointer bp,
+                               struct bkey_s_c orig_k)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca;
+       struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
+       unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+       struct bkey_i_backpointer *bp_k;
+       struct btree_iter bp_iter;
+       struct bkey_s_c k;
+       int ret;
+
+       /* Check for duplicates: */
+       for (i = 0; i < nr; i++) {
+               int cmp = backpointer_cmp(bps[i], bp);
+               if (cmp >= 0)
+                       break;
+       }
+
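+       /* Check that the new backpointer doesn't overlap its neighbours in the sorted array: */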
+       if ((i &&
+            (bps[i - 1].bucket_offset +
+             bps[i - 1].bucket_len > bp.bucket_offset)) ||
+           (i < nr &&
+            (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) {
+               struct printbuf buf = PRINTBUF;
+
+               prt_printf(&buf, "overlapping backpointer found when inserting ");
+               bch2_backpointer_to_text(&buf, &bp);
+               prt_newline(&buf);
+               printbuf_indent_add(&buf, 2);
+
+               prt_printf(&buf, "into ");
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+               prt_newline(&buf);
+
+               prt_printf(&buf, "for ");
+               bch2_bkey_val_to_text(&buf, c, orig_k);
+
+               if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+                       bch_err(c, "%s", buf.buf);
+               else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       printbuf_exit(&buf);
+                       return -EIO;
+               }
+       }
+
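+       /* If there's still room in the alloc key, store the backpointer inline: */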
+       if (nr < BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) {
+               array_insert_item(bps, nr, i, bp);
+               SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+               set_alloc_v4_u64s(a);
+               return 0;
+       }
+
+       /* Overflow: use backpointer btree */
+       bp_k = bch2_trans_kmalloc(trans, sizeof(*bp_k));
+       ret = PTR_ERR_OR_ZERO(bp_k);
+       if (ret)
+               return ret;
+
+       ca = bch_dev_bkey_exists(c, a->k.p.inode);
+
+       bkey_backpointer_init(&bp_k->k_i);
+       bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset);
+       bp_k->v = bp;
+
+       bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_SLOTS|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&bp_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type) {
+               struct printbuf buf = PRINTBUF;
+
+               prt_printf(&buf, "existing btree backpointer key found when inserting ");
+               bch2_backpointer_to_text(&buf, &bp);
+               prt_newline(&buf);
+               printbuf_indent_add(&buf, 2);
+
+               prt_printf(&buf, "found ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "for ");
+               bch2_bkey_val_to_text(&buf, c, orig_k);
+
+               if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+                       bch_err(c, "%s", buf.buf);
+               else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       printbuf_exit(&buf);
+                       ret = -EIO;
+                       goto err;
+               }
+       }
+
+       ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_offset:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+                             struct bpos bucket, int gen,
+                             u64 *bp_offset,
+                             struct bch_backpointer *dst,
+                             unsigned iter_flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bpos bp_pos, bp_end_pos;
+       struct btree_iter alloc_iter, bp_iter = { NULL };
+       struct bkey_s_c k;
+       struct bkey_s_c_alloc_v4 a;
+       size_t i;
+       int ret;
+
+       if (*bp_offset == U64_MAX)
+               return 0;
+
+       bp_pos = bucket_pos_to_bp(c, bucket,
+                                 max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX);
+       bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+                            bucket, BTREE_ITER_CACHED);
+       k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
+
+       if (k.k->type != KEY_TYPE_alloc_v4)
+               goto done;
+
+       a = bkey_s_c_to_alloc_v4(k);
+       if (gen >= 0 && a.v->gen != gen)
+               goto done;
+
+       for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) {
+               if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset)
+                       continue;
+
+               *dst = alloc_v4_backpointers_c(a.v)[i];
+               *bp_offset = dst->bucket_offset;
+               goto out;
+       }
+
+       for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+                                    bp_pos, 0, k, ret) {
+               if (bpos_cmp(k.k->p, bp_end_pos) >= 0)
+                       break;
+
+               if (k.k->type != KEY_TYPE_backpointer)
+                       continue;
+
+               *dst = *bkey_s_c_to_backpointer(k).v;
+               *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX;
+               goto out;
+       }
+done:
+       *bp_offset = U64_MAX;
+out:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+                                 struct bpos bucket,
+                                 u64 bp_offset,
+                                 struct bch_backpointer bp,
+                                 struct bkey_s_c k,
+                                 const char *thing_it_points_to)
+{
+       struct bch_fs *c = trans->c;
+       struct printbuf buf = PRINTBUF;
+
+       prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
+                  thing_it_points_to);
+       prt_printf(&buf, "bucket: ");
+       bch2_bpos_to_text(&buf, bucket);
+       prt_printf(&buf, "\n  ");
+
+       if (bp_offset >= BACKPOINTER_OFFSET_MAX) {
+               struct bpos bp_pos =
+                       bucket_pos_to_bp(c, bucket,
+                                       bp_offset - BACKPOINTER_OFFSET_MAX);
+               prt_printf(&buf, "backpointer pos: ");
+               bch2_bpos_to_text(&buf, bp_pos);
+               prt_printf(&buf, "\n  ");
+       }
+
+       bch2_backpointer_to_text(&buf, &bp);
+       prt_printf(&buf, "\n  ");
+       bch2_bkey_val_to_text(&buf, c, k);
+       if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+               bch_err_ratelimited(c, "%s", buf.buf);
+       else
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+       printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+                                        struct btree_iter *iter,
+                                        struct bpos bucket,
+                                        u64 bp_offset,
+                                        struct bch_backpointer bp)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+
+       bch2_trans_node_iter_init(trans, iter,
+                                 bp.btree_id,
+                                 bp.pos,
+                                 0,
+                                 min(bp.level, c->btree_roots[bp.btree_id].level),
+                                 0);
+       k = bch2_btree_iter_peek_slot(iter);
+       if (bkey_err(k)) {
+               bch2_trans_iter_exit(trans, iter);
+               return k;
+       }
+
+       if (bp.level == c->btree_roots[bp.btree_id].level + 1)
+               k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key);
+
+       if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+               return k;
+
+       bch2_trans_iter_exit(trans, iter);
+
+       if (bp.level) {
+               struct btree *b;
+
+               /*
+                * If a backpointer for a btree node wasn't found, it may be
+                * because it was overwritten by a new btree node that hasn't
+                * been written out yet - backpointer_get_node() checks for
+                * this:
+                */
+               b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp);
+               if (!IS_ERR_OR_NULL(b))
+                       return bkey_i_to_s_c(&b->key);
+
+               bch2_trans_iter_exit(trans, iter);
+
+               if (IS_ERR(b))
+                       return bkey_s_c_err(PTR_ERR(b));
+               return bkey_s_c_null;
+       }
+
+       backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent");
+       return bkey_s_c_null;
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+                                       struct btree_iter *iter,
+                                       struct bpos bucket,
+                                       u64 bp_offset,
+                                       struct bch_backpointer bp)
+{
+       struct bch_fs *c = trans->c;
+       struct btree *b;
+
+       BUG_ON(!bp.level);
+
+       bch2_trans_node_iter_init(trans, iter,
+                                 bp.btree_id,
+                                 bp.pos,
+                                 0,
+                                 bp.level - 1,
+                                 0);
+       b = bch2_btree_iter_peek_node(iter);
+       if (IS_ERR(b))
+               goto err;
+
+       if (b && extent_matches_bp(c, bp.btree_id, bp.level,
+                                  bkey_i_to_s_c(&b->key),
+                                  bucket, bp))
+               return b;
+
+       if (b && btree_node_will_make_reachable(b)) {
+               b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+       } else {
+               backpointer_not_found(trans, bucket, bp_offset, bp,
+                                     bkey_i_to_s_c(&b->key), "btree node");
+               b = NULL;
+       }
+err:
+       bch2_trans_iter_exit(trans, iter);
+       return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+                                       struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter = { NULL };
+       struct bch_dev *ca;
+       struct bkey_s_c alloc_k;
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+                       "backpointer for missing device:\n%s",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, bp_iter, 0);
+               goto out;
+       }
+
+       ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+                            bp_pos_to_bucket(c, k.k->p), 0);
+
+       alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(alloc_k);
+       if (ret)
+               goto out;
+
+       if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+                       "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+                       alloc_iter.pos.inode, alloc_iter.pos.offset,
+                       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, bp_iter, 0);
+               goto out;
+       }
+out:
+fsck_err:
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+
+       return bch2_trans_run(c,
+               for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_backpointers, POS_MIN, 0, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                 bch2_check_btree_backpointer(&trans, &iter, k)));
+}
+
+static int check_bp_exists(struct btree_trans *trans,
+                          struct bpos bucket_pos,
+                          struct bch_backpointer bp,
+                          struct bkey_s_c orig_k,
+                          struct bpos bucket_start,
+                          struct bpos bucket_end)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter, bp_iter = { NULL };
+       struct printbuf buf = PRINTBUF;
+       struct bkey_s_c alloc_k, bp_k;
+       int ret;
+
+       if (bpos_cmp(bucket_pos, bucket_start) < 0 ||
+           bpos_cmp(bucket_pos, bucket_end) > 0)
+               return 0;
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0);
+       alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(alloc_k);
+       if (ret)
+               goto err;
+
+       if (alloc_k.k->type == KEY_TYPE_alloc_v4) {
+               struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k);
+               const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v);
+               unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v);
+
+               for (i = 0; i < nr; i++) {
+                       int cmp = backpointer_cmp(bps[i], bp) ?:
+                               memcmp(&bps[i], &bp, sizeof(bp));
+                       if (!cmp)
+                               goto out;
+                       if (cmp >= 0)
+                               break;
+               }
+       } else {
+               goto missing;
+       }
+
+       bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+                            bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset),
+                            0);
+       bp_k = bch2_btree_iter_peek_slot(&bp_iter);
+       ret = bkey_err(bp_k);
+       if (ret)
+               goto err;
+
+       if (bp_k.k->type != KEY_TYPE_backpointer ||
+           memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp)))
+               goto missing;
+out:
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
+missing:
+       prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+              bch2_btree_ids[bp.btree_id], bp.level);
+       bch2_bkey_val_to_text(&buf, c, orig_k);
+       prt_printf(&buf, "\nin alloc key ");
+       bch2_bkey_val_to_text(&buf, c, alloc_k);
+
+       if (c->sb.version < bcachefs_metadata_version_backpointers ||
+           c->opts.reconstruct_alloc ||
+           fsck_err(c, "%s", buf.buf)) {
+               struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k);
+
+               ret   = PTR_ERR_OR_ZERO(a) ?:
+                       bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?:
+                       bch2_trans_update(trans, &alloc_iter, &a->k_i, 0);
+       }
+
+       goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+                                       struct btree_iter *iter,
+                                       struct bpos bucket_start,
+                                       struct bpos bucket_end)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_ptrs_c ptrs;
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       struct bkey_s_c k;
+       int ret;
+
+       k = bch2_btree_iter_peek_all_levels(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+       if (!k.k)
+               return 0;
+
+       ptrs = bch2_bkey_ptrs_c(k);
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               struct bpos bucket_pos;
+               struct bch_backpointer bp;
+
+               if (p.ptr.cached)
+                       continue;
+
+               bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+                                     k, p, &bucket_pos, &bp);
+
+               ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+                                           enum btree_id btree_id,
+                                           struct bpos bucket_start,
+                                           struct bpos bucket_end)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct btree *b;
+       struct bkey_s_c k;
+       struct bkey_ptrs_c ptrs;
+       struct extent_ptr_decoded p;
+       const union bch_extent_entry *entry;
+       int ret;
+
+       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+                                 c->btree_roots[btree_id].level, 0);
+       b = bch2_btree_iter_peek_node(&iter);
+       ret = PTR_ERR_OR_ZERO(b);
+       if (ret)
+               goto err;
+
+       BUG_ON(b != btree_node_root(c, b));
+
+       k = bkey_i_to_s_c(&b->key);
+       ptrs = bch2_bkey_ptrs_c(k);
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               struct bpos bucket_pos;
+               struct bch_backpointer bp;
+
+               if (p.ptr.cached)
+                       continue;
+
+               bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1,
+                                     k, p, &bucket_pos, &bp);
+
+               ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end);
+               if (ret)
+                       goto err;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+       return (struct bbpos) {
+               .btree  = bp.btree_id,
+               .pos    = bp.pos,
+       };
+}
+
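+/* How many btree nodes fit in half of system memory: */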
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+       struct sysinfo i;
+       u64 mem_bytes;
+
+       si_meminfo(&i);
+       mem_bytes = i.totalram * i.mem_unit;
+       return (mem_bytes >> 1) / btree_bytes(c);
+}
+
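+/*
+ * Compute how far we can walk the given btrees before the nodes seen no
+ * longer fit in half of memory; *end is set to where the current pass
+ * should stop:
+ */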
+int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+                                unsigned btree_leaf_mask,
+                                unsigned btree_interior_mask,
+                                struct bbpos start, struct bbpos *end)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+       enum btree_id btree;
+       int ret = 0;
+
+       for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
+               unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+
+               if (!((1U << btree) & btree_leaf_mask) &&
+                   !((1U << btree) & btree_interior_mask))
+                       continue;
+
+               bch2_trans_node_iter_init(trans, &iter, btree,
+                                         btree == start.btree ? start.pos : POS_MIN,
+                                         0, depth, 0);
+               /*
+                * for_each_btree_key_continue() doesn't check the return value
+                * from bch2_btree_iter_advance(), which is needed when
+                * iterating over interior nodes where we'll see keys at
+                * SPOS_MAX:
+                */
+               do {
+                       k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
+                       ret = bkey_err(k);
+                       if (!k.k || ret)
+                               break;
+
+                       --btree_nodes;
+                       if (!btree_nodes) {
+                               *end = BBPOS(btree, k.k->p);
+                               bch2_trans_iter_exit(trans, &iter);
+                               return 0;
+                       }
+               } while (bch2_btree_iter_advance(&iter));
+               bch2_trans_iter_exit(trans, &iter);
+       }
+
+       *end = BBPOS_MAX;
+       return ret;
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+                                                  struct bpos bucket_start,
+                                                  struct bpos bucket_end)
+{
+       struct btree_iter iter;
+       enum btree_id btree_id;
+       int ret = 0;
+
+       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+               unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+               bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+                                         depth,
+                                         BTREE_ITER_ALL_LEVELS|
+                                         BTREE_ITER_PREFETCH);
+
+               do {
+                       ret = commit_do(trans, NULL, NULL,
+                                       BTREE_INSERT_LAZY_RW|
+                                       BTREE_INSERT_NOFAIL,
+                                       check_extent_to_backpointers(trans, &iter,
+                                                               bucket_start, bucket_end));
+                       if (ret)
+                               break;
+               } while (!bch2_btree_iter_advance(&iter));
+
+               bch2_trans_iter_exit(trans, &iter);
+
+               if (ret)
+                       break;
+
+               ret = commit_do(trans, NULL, NULL,
+                               BTREE_INSERT_LAZY_RW|
+                               BTREE_INSERT_NOFAIL,
+                               check_btree_root_to_backpointers(trans, btree_id,
+                                                       bucket_start, bucket_end));
+               if (ret)
+                       break;
+       }
+       return ret;
+}
+
+int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
+                                struct bpos start, struct bpos *end)
+{
+       struct btree_iter alloc_iter;
+       struct btree_iter bp_iter;
+       struct bkey_s_c alloc_k, bp_k;
+       size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+       bool alloc_end = false, bp_end = false;
+       int ret = 0;
+
+       bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+                                 start, 0, 1, 0);
+       bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+                                 bucket_pos_to_bp(trans->c, start, 0), 0, 1, 0);
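+       /*
+        * Walk the alloc and backpointers btrees in lockstep, counting btree
+        * nodes until we've seen as many as fit in half of memory:
+        */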
+       while (1) {
+               alloc_k = !alloc_end
+                       ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
+                       : bkey_s_c_null;
+               bp_k = !bp_end
+                       ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
+                       : bkey_s_c_null;
+
+               ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
+               if ((!alloc_k.k && !bp_k.k) || ret) {
+                       *end = SPOS_MAX;
+                       break;
+               }
+
+               --btree_nodes;
+               if (!btree_nodes) {
+                       *end = alloc_k.k->p;
+                       break;
+               }
+
+               if (bpos_cmp(alloc_iter.pos, SPOS_MAX) &&
+                   bpos_cmp(bucket_pos_to_bp(trans->c, alloc_iter.pos, 0), bp_iter.pos) < 0) {
+                       if (!bch2_btree_iter_advance(&alloc_iter))
+                               alloc_end = true;
+               } else {
+                       if (!bch2_btree_iter_advance(&bp_iter))
+                               bp_end = true;
+               }
+       }
+       bch2_trans_iter_exit(trans, &bp_iter);
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct bpos start = POS_MIN, end;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
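+       /*
+        * Run in passes: each pass covers only the range of buckets whose
+        * alloc and backpointer btree nodes fit in memory, then resumes from
+        * the next bucket:
+        */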
+       while (1) {
+               ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+               if (ret)
+                       break;
+
+               if (!bpos_cmp(start, POS_MIN) && bpos_cmp(end, SPOS_MAX))
+                       bch_verbose(c, "check_extents_to_backpointers(): alloc info does not fit in ram, "
+                                   "running in multiple passes with %zu nodes per pass",
+                                   btree_nodes_fit_in_ram(c));
+
+               if (bpos_cmp(start, POS_MIN) || bpos_cmp(end, SPOS_MAX)) {
+                       struct printbuf buf = PRINTBUF;
+
+                       prt_str(&buf, "check_extents_to_backpointers(): ");
+                       bch2_bpos_to_text(&buf, start);
+                       prt_str(&buf, "-");
+                       bch2_bpos_to_text(&buf, end);
+
+                       bch_verbose(c, "%s", buf.buf);
+                       printbuf_exit(&buf);
+               }
+
+               ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+               if (ret || !bpos_cmp(end, SPOS_MAX))
+                       break;
+
+               start = bpos_successor(end);
+       }
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+                                struct bpos bucket,
+                                u64 *bp_offset,
+                                struct bbpos start,
+                                struct bbpos end)
+{
+       struct btree_iter iter;
+       struct bch_backpointer bp;
+       struct bbpos pos;
+       struct bkey_s_c k;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0);
+       if (ret || *bp_offset == U64_MAX)
+               return ret;
+
+       pos = bp_to_bbpos(bp);
+       if (bbpos_cmp(pos, start) < 0 ||
+           bbpos_cmp(pos, end) > 0)
+               return 0;
+
+       k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp);
+       ret = bkey_err(k);
+       if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+               return 0;
+       if (ret)
+               return ret;
+
+       if (fsck_err_on(!k.k, trans->c,
+                       "%s backpointer points to missing extent\n%s",
+                       *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree",
+                       (bch2_backpointer_to_text(&buf, &bp), buf.buf))) {
+               ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp);
+               if (ret == -ENOENT)
+                       bch_err(trans->c, "backpointer at %llu not found", *bp_offset);
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+                                                  struct bbpos start,
+                                                  struct bbpos end)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               u64 bp_offset = 0;
+
+               while (!(ret = commit_do(trans, NULL, NULL,
+                                        BTREE_INSERT_LAZY_RW|
+                                        BTREE_INSERT_NOFAIL,
+                               check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) &&
+                      bp_offset < U64_MAX)
+                       bp_offset++;
+
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+       return ret < 0 ? ret : 0;
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       while (1) {
+               ret = bch2_get_btree_in_memory_pos(&trans,
+                                                  (1U << BTREE_ID_extents)|
+                                                  (1U << BTREE_ID_reflink),
+                                                  ~0,
+                                                  start, &end);
+               if (ret)
+                       break;
+
+               if (!bbpos_cmp(start, BBPOS_MIN) &&
+                   bbpos_cmp(end, BBPOS_MAX))
+                       bch_verbose(c, "check_backpointers_to_extents(): extents do not fit in ram, "
+                                   "running in multiple passes with %zu nodes per pass",
+                                   btree_nodes_fit_in_ram(c));
+
+               if (bbpos_cmp(start, BBPOS_MIN) ||
+                   bbpos_cmp(end, BBPOS_MAX)) {
+                       struct printbuf buf = PRINTBUF;
+
+                       prt_str(&buf, "check_backpointers_to_extents(): ");
+                       bch2_bbpos_to_text(&buf, start);
+                       prt_str(&buf, "-");
+                       bch2_bbpos_to_text(&buf, end);
+
+                       bch_verbose(c, "%s", buf.buf);
+                       printbuf_exit(&buf);
+               }
+
+               ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+               if (ret || !bbpos_cmp(end, BBPOS_MAX))
+                       break;
+
+               start = bbpos_successor(end);
+       }
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
new file mode 100644 (file)
index 0000000..1c97e36
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "super.h"
+
+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+                            int, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer (struct bkey_ops) {  \
+       .key_invalid    = bch2_backpointer_invalid,     \
+       .val_to_text    = bch2_backpointer_k_to_text,   \
+       .swab           = bch2_backpointer_swab,        \
+}
+
+void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned,
+                          struct bkey_s_c, struct extent_ptr_decoded,
+                          struct bpos *, struct bch_backpointer *);
+
+int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *,
+                               struct bch_backpointer, struct bkey_s_c);
+int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *,
+                               struct bch_backpointer, struct bkey_s_c);
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+                             u64 *, struct bch_backpointer *, unsigned);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+                                        struct bpos, u64, struct bch_backpointer);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+                                       struct bpos, u64, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/libbcachefs/bbpos.h b/libbcachefs/bbpos.h
new file mode 100644 (file)
index 0000000..1fbed1f
--- /dev/null
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bkey_methods.h"
+
+struct bbpos {
+       enum btree_id           btree;
+       struct bpos             pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+       return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN      BBPOS(0, POS_MIN)
+#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+       return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+       if (bpos_cmp(pos.pos, SPOS_MAX)) {
+               pos.pos = bpos_successor(pos.pos);
+               return pos;
+       }
+
+       if (pos.btree != BTREE_ID_NR) {
+               pos.btree++;
+               pos.pos = POS_MIN;
+               return pos;
+       }
+
+       BUG();
+}
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+       prt_str(out, bch2_btree_ids[pos.btree]);
+       prt_char(out, ':');
+       bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
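
The bbpos helpers above give fsck a single total ordering over (btree id, position) pairs, so a multi-pass check can record where one pass ended and resume from its successor; bch2_check_backpointers_to_extents() earlier in this patch does exactly that, starting at BBPOS_MIN, checking as much as fits in memory, then continuing from bbpos_successor(end) until end reaches BBPOS_MAX. The following stand-alone sketch models that ordering with a flattened u64 position; the demo_* names are invented for illustration and none of this is bcachefs API.

#include <stdint.h>
#include <stdio.h>

#define DEMO_POS_MAX	UINT64_MAX	/* stand-in for SPOS_MAX */

struct demo_bbpos {
	unsigned	btree;
	uint64_t	pos;
};

static int demo_cmp_u64(uint64_t l, uint64_t r)
{
	return (l > r) - (l < r);
}

/* Same shape as bbpos_cmp(): order by btree id first, then by position: */
static int demo_bbpos_cmp(struct demo_bbpos l, struct demo_bbpos r)
{
	int c = demo_cmp_u64(l.btree, r.btree);

	return c ? c : demo_cmp_u64(l.pos, r.pos);
}

/* Same shape as bbpos_successor(): advance pos, else move to the next btree
 * (the real helper BUG()s once both components are at their maximum): */
static struct demo_bbpos demo_bbpos_successor(struct demo_bbpos p)
{
	if (p.pos != DEMO_POS_MAX) {
		p.pos++;
	} else {
		p.btree++;
		p.pos = 0;
	}
	return p;
}

int main(void)
{
	struct demo_bbpos a = { 0, DEMO_POS_MAX };
	struct demo_bbpos b = demo_bbpos_successor(a);

	/* Rolling over into the next btree still sorts strictly after a: */
	printf("%d\n", demo_bbpos_cmp(a, b));	/* prints -1 */
	return 0;
}
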
index 0e9689f6878afd3062710d9f6b4bd288bf8eef84..33186fa82682e037af78dcb3543d5bbf3049e26b 100644 (file)
  *
  * BTREE NODES:
  *
- * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
  * free smaller than a bucket - so, that's how big our btree nodes are.
  *
  * (If buckets are really big we'll only use part of the bucket for a btree node
 #define dynamic_fault(...)             0
 #define race_fault(...)                        0
 
+#define trace_and_count(_c, _name, ...)                                        \
+do {                                                                   \
+       this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);              \
+       trace_##_name(__VA_ARGS__);                                     \
+} while (0)
+
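
trace_and_count() ties the new persistent counters (struct bch_fs::counters, indexed by the BCH_PERSISTENT_COUNTERS() list added to bcachefs_format.h further down) to tracepoints of the same name. As a rough illustration, a hypothetical call site trace_and_count(c, bucket_discard, ...) with the tracepoint arguments elided, and assuming the matching trace_bucket_discard() tracepoint exists as the name pasting requires, expands to:

do {
	this_cpu_inc((c)->counters[BCH_COUNTER_bucket_discard]);
	trace_bucket_discard(/* tracepoint arguments */);
} while (0)
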
 #define bch2_fs_init_fault(name)                                       \
        dynamic_fault("bcachefs:bch_fs_init:" name)
 #define bch2_meta_read_fault(name)                                     \
         dynamic_fault("bcachefs:meta:write:" name)
 
 #ifdef __KERNEL__
-#define bch2_fmt(_c, fmt)              "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_log_msg(_c, fmt)          "bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt(_c, fmt)              bch2_log_msg(_c, fmt "\n")
 #define bch2_fmt_inum(_c, _inum, fmt)  "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
 #else
+#define bch2_log_msg(_c, fmt)          fmt
 #define bch2_fmt(_c, fmt)              fmt "\n"
 #define bch2_fmt_inum(_c, _inum, fmt)  "inum %llu: " fmt "\n", (_inum)
 #endif
@@ -329,9 +337,6 @@ BCH_DEBUG_PARAMS_DEBUG()
        x(btree_interior_update_foreground)     \
        x(btree_interior_update_total)          \
        x(btree_gc)                             \
-       x(btree_lock_contended_read)            \
-       x(btree_lock_contended_intent)          \
-       x(btree_lock_contended_write)           \
        x(data_write)                           \
        x(data_read)                            \
        x(data_promote)                         \
@@ -391,6 +396,10 @@ enum gc_phase {
        GC_PHASE_BTREE_reflink,
        GC_PHASE_BTREE_subvolumes,
        GC_PHASE_BTREE_snapshots,
+       GC_PHASE_BTREE_lru,
+       GC_PHASE_BTREE_freespace,
+       GC_PHASE_BTREE_need_discard,
+       GC_PHASE_BTREE_backpointers,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -447,7 +456,7 @@ struct bch_dev {
         * gc_lock, for device resize - holding any is sufficient for access:
         * Or rcu_read_lock(), but only for ptr_stale():
         */
-       struct bucket_array __rcu *buckets[2];
+       struct bucket_array __rcu *buckets_gc;
        struct bucket_gens __rcu *bucket_gens;
        u8                      *oldest_gen;
        unsigned long           *buckets_nouse;
@@ -459,34 +468,18 @@ struct bch_dev {
 
        /* Allocator: */
        u64                     new_fs_bucket_idx;
-       struct task_struct __rcu *alloc_thread;
+       u64                     bucket_alloc_trans_early_cursor;
 
-       /*
-        * free: Buckets that are ready to be used
-        *
-        * free_inc: Incoming buckets - these are buckets that currently have
-        * cached data in them, and we can't reuse them until after we write
-        * their new gen to disk. After prio_write() finishes writing the new
-        * gens/prios, they'll be moved to the free list (and possibly discarded
-        * in the process)
-        */
-       alloc_fifo              free[RESERVE_NR];
-       alloc_fifo              free_inc;
        unsigned                nr_open_buckets;
+       unsigned                nr_btree_reserve;
 
        open_bucket_idx_t       open_buckets_partial[OPEN_BUCKETS_COUNT];
        open_bucket_idx_t       open_buckets_partial_nr;
 
-       size_t                  fifo_last_bucket;
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
        size_t                  buckets_waiting_on_journal;
 
-       enum allocator_states   allocator_state;
-
-       alloc_heap              alloc_heap;
-
        atomic64_t              rebalance_work;
 
        struct journal_device   journal;
@@ -507,16 +500,8 @@ struct bch_dev {
 
 enum {
        /* startup: */
-       BCH_FS_INITIALIZED,
-       BCH_FS_ALLOC_READ_DONE,
-       BCH_FS_ALLOC_CLEAN,
-       BCH_FS_ALLOCATOR_RUNNING,
-       BCH_FS_ALLOCATOR_STOPPING,
-       BCH_FS_INITIAL_GC_DONE,
-       BCH_FS_INITIAL_GC_UNFIXED,
-       BCH_FS_TOPOLOGY_REPAIR_DONE,
-       BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
+       BCH_FS_MAY_GO_RW,
        BCH_FS_RW,
        BCH_FS_WAS_RW,
 
@@ -524,25 +509,39 @@ enum {
        BCH_FS_STOPPING,
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
+       BCH_FS_CLEAN_SHUTDOWN,
+
+       /* fsck passes: */
+       BCH_FS_TOPOLOGY_REPAIR_DONE,
+       BCH_FS_INITIAL_GC_DONE,         /* kill when we enumerate fsck passes */
+       BCH_FS_CHECK_LRUS_DONE,
+       BCH_FS_CHECK_BACKPOINTERS_DONE,
+       BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
+       BCH_FS_FSCK_DONE,
+       BCH_FS_INITIAL_GC_UNFIXED,      /* kill when we enumerate fsck errors */
+       BCH_FS_NEED_ANOTHER_GC,
+
+       BCH_FS_HAVE_DELETED_SNAPSHOTS,
 
        /* errors: */
        BCH_FS_ERROR,
        BCH_FS_TOPOLOGY_ERROR,
        BCH_FS_ERRORS_FIXED,
        BCH_FS_ERRORS_NOT_FIXED,
-
-       /* misc: */
-       BCH_FS_NEED_ANOTHER_GC,
-       BCH_FS_DELETED_NODES,
-       BCH_FS_REBUILD_REPLICAS,
-       BCH_FS_HOLD_BTREE_WRITES,
 };
 
 struct btree_debug {
        unsigned                id;
-       struct dentry           *btree;
-       struct dentry           *btree_format;
-       struct dentry           *failed;
+};
+
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+       struct mutex            lock;
+       struct time_stats       lock_hold_times;
+       unsigned                nr_max_paths;
+       unsigned                max_mem;
+       char                    *max_paths_text;
 };
 
 struct bch_fs_pcpu {
@@ -560,17 +559,22 @@ struct journal_seq_blacklist_table {
 
 struct journal_keys {
        struct journal_key {
+               u64             journal_seq;
+               u32             journal_offset;
                enum btree_id   btree_id:8;
                unsigned        level:8;
                bool            allocated;
                bool            overwritten;
                struct bkey_i   *k;
-               u32             journal_seq;
-               u32             journal_offset;
        }                       *d;
+       /*
+        * Gap buffer: instead of all the empty space in the array being at the
+        * end of the buffer - from @nr to @size - the empty space is at @gap.
+        * This means that sequential insertions are O(n) instead of O(n^2).
+        */
+       size_t                  gap;
        size_t                  nr;
        size_t                  size;
-       u64                     journal_seq_base;
 };
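
The comment above describes journal_keys as a gap buffer. As a minimal, self-contained sketch of that idea (not the bcachefs implementation, which stores struct journal_key entries), keeping the free region at the insertion point means a run of in-order insertions only writes into the gap, and the expensive memmove() happens only when the insertion point jumps:

#include <string.h>

struct gap_buf {
	int	*d;	/* backing array with @size slots */
	size_t	gap;	/* start of the empty region */
	size_t	nr;	/* number of live elements */
	size_t	size;	/* total capacity */
};

/* Live elements occupy d[0..gap) and d[gap + (size - nr)..size). */

static void gap_buf_move_gap(struct gap_buf *b, size_t pos)
{
	size_t gap_size = b->size - b->nr;

	if (pos < b->gap)
		memmove(b->d + pos + gap_size, b->d + pos,
			(b->gap - pos) * sizeof(*b->d));
	else if (pos > b->gap)
		memmove(b->d + b->gap, b->d + b->gap + gap_size,
			(pos - b->gap) * sizeof(*b->d));
	b->gap = pos;
}

static int gap_buf_insert(struct gap_buf *b, size_t pos, int v)
{
	if (b->nr == b->size)
		return -1;		/* caller would grow the array here */

	gap_buf_move_gap(b, pos);	/* no-op for sequential insertions */
	b->d[b->gap++] = v;
	b->nr++;
	return 0;
}
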
 
 struct btree_path_buf {
@@ -599,6 +603,7 @@ struct bch_fs {
 
        struct list_head        list;
        struct kobject          kobj;
+       struct kobject          counters_kobj;
        struct kobject          internal;
        struct kobject          opts_dir;
        struct kobject          time_stats;
@@ -670,7 +675,7 @@ struct bch_fs {
        struct mutex            snapshot_table_lock;
        struct work_struct      snapshot_delete_work;
        struct work_struct      snapshot_wait_for_pagecache_and_delete_work;
-       struct snapshot_id_list snapshots_unlinked;
+       snapshot_id_list        snapshots_unlinked;
        struct mutex            snapshots_unlinked_lock;
 
        /* BTREE CACHE */
@@ -778,6 +783,8 @@ struct bch_fs {
        unsigned                write_points_nr;
 
        struct buckets_waiting_for_journal buckets_waiting_for_journal;
+       struct work_struct      discard_work;
+       struct work_struct      invalidate_work;
 
        /* GARBAGE COLLECTION */
        struct task_struct      *gc_thread;
@@ -807,7 +814,6 @@ struct bch_fs {
        struct mutex            gc_gens_lock;
 
        /* IO PATH */
-       struct semaphore        io_in_flight;
        struct bio_set          bio_read;
        struct bio_set          bio_read_split;
        struct bio_set          bio_write;
@@ -836,6 +842,8 @@ struct bch_fs {
        copygc_heap             copygc_heap;
        struct write_point      copygc_write_point;
        s64                     copygc_wait;
+       bool                    copygc_running;
+       wait_queue_head_t       copygc_running_wq;
 
        /* DATA PROGRESS STATS */
        struct list_head        data_progress_list;
@@ -887,7 +895,8 @@ struct bch_fs {
        struct bch_memquota_type quotas[QTYP_NR];
 
        /* DEBUG JUNK */
-       struct dentry           *debug;
+       struct dentry           *fs_debug_dir;
+       struct dentry           *btree_debug_dir;
        struct btree_debug      btree_debug[BTREE_ID_NR];
        struct btree            *verify_data;
        struct btree_node       *verify_ondisk;
@@ -905,22 +914,23 @@ struct bch_fs {
        mempool_t               btree_bounce_pool;
 
        struct journal          journal;
-       struct list_head        journal_entries;
+       GENRADIX(struct journal_replay *) journal_entries;
+       u64                     journal_entries_base_seq;
        struct journal_keys     journal_keys;
        struct list_head        journal_iters;
 
        u64                     last_bucket_seq_cleanup;
 
-       /* The rest of this all shows up in sysfs */
-       atomic_long_t           read_realloc_races;
-       atomic_long_t           extent_migrate_done;
-       atomic_long_t           extent_migrate_raced;
+       u64                     counters_on_mount[BCH_COUNTER_NR];
+       u64 __percpu            *counters;
 
        unsigned                btree_gc_periodic:1;
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
 
        struct time_stats       times[BCH_TIME_STAT_NR];
+
+       struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
 };
 
 static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
index 5153f0e420541c1b8a03e4ebf76501a7c87c2b6a..bfcb75a361cb4c1edf231ca0fce8ae235dad2984 100644 (file)
 #include <linux/uuid.h>
 #include "vstructs.h"
 
+#define BITMASK(name, type, field, offset, end)                                \
+static const unsigned  name##_OFFSET = offset;                         \
+static const unsigned  name##_BITS = (end - offset);                   \
+                                                                       \
+static inline __u64 name(const type *k)                                        \
+{                                                                      \
+       return (k->field >> offset) & ~(~0ULL << (end - offset));       \
+}                                                                      \
+                                                                       \
+static inline void SET_##name(type *k, __u64 v)                                \
+{                                                                      \
+       k->field &= ~(~(~0ULL << (end - offset)) << offset);            \
+       k->field |= (v & ~(~0ULL << (end - offset))) << offset;         \
+}
+
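
BITMASK() mirrors the existing LE_BITMASK() below but operates on native-endian fields. For example, the invocation BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) used further down in this file expands by hand to the following getter/setter pair for bit 0 of bch_alloc_v4::flags:

static const unsigned BCH_ALLOC_V4_NEED_DISCARD_OFFSET = 0;
static const unsigned BCH_ALLOC_V4_NEED_DISCARD_BITS = (1 - 0);

static inline __u64 BCH_ALLOC_V4_NEED_DISCARD(const struct bch_alloc_v4 *k)
{
	return (k->flags >> 0) & ~(~0ULL << (1 - 0));
}

static inline void SET_BCH_ALLOC_V4_NEED_DISCARD(struct bch_alloc_v4 *k, __u64 v)
{
	k->flags &= ~(~(~0ULL << (1 - 0)) << 0);
	k->flags |= (v & ~(~0ULL << (1 - 0))) << 0;
}
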
 #define LE_BITMASK(_bits, name, type, field, offset, end)              \
 static const unsigned  name##_OFFSET = offset;                         \
 static const unsigned  name##_BITS = (end - offset);                   \
@@ -321,7 +336,7 @@ static inline void bkey_init(struct bkey *k)
  *   number.
  *
  * - WHITEOUT: for hash table btrees
-*/
+ */
 #define BCH_BKEY_TYPES()                               \
        x(deleted,              0)                      \
        x(whiteout,             1)                      \
@@ -347,7 +362,12 @@ static inline void bkey_init(struct bkey *k)
        x(subvolume,            21)                     \
        x(snapshot,             22)                     \
        x(inode_v2,             23)                     \
-       x(alloc_v3,             24)
+       x(alloc_v3,             24)                     \
+       x(set,                  25)                     \
+       x(lru,                  26)                     \
+       x(alloc_v4,             27)                     \
+       x(backpointer,          28)                     \
+       x(inode_v3,             29)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -377,6 +397,10 @@ struct bch_hash_whiteout {
        struct bch_val          v;
 };
 
+struct bch_set {
+       struct bch_val          v;
+};
+
 /* Extents */
 
 /*
@@ -617,8 +641,8 @@ union bch_extent_entry {
 struct bch_btree_ptr {
        struct bch_val          v;
 
-       struct bch_extent_ptr   start[0];
        __u64                   _data[0];
+       struct bch_extent_ptr   start[];
 } __attribute__((packed, aligned(8)));
 
 struct bch_btree_ptr_v2 {
@@ -629,8 +653,8 @@ struct bch_btree_ptr_v2 {
        __le16                  sectors_written;
        __le16                  flags;
        struct bpos             min_key;
-       struct bch_extent_ptr   start[0];
        __u64                   _data[0];
+       struct bch_extent_ptr   start[];
 } __attribute__((packed, aligned(8)));
 
 LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,  struct bch_btree_ptr_v2, flags, 0, 1);
@@ -638,8 +662,8 @@ LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,       struct bch_btree_ptr_v2, flags, 0, 1);
 struct bch_extent {
        struct bch_val          v;
 
-       union bch_extent_entry  start[0];
        __u64                   _data[0];
+       union bch_extent_entry  start[];
 } __attribute__((packed, aligned(8)));
 
 struct bch_reservation {
@@ -694,6 +718,21 @@ struct bch_inode_v2 {
        __u8                    fields[0];
 } __attribute__((packed, aligned(8)));
 
+struct bch_inode_v3 {
+       struct bch_val          v;
+
+       __le64                  bi_journal_seq;
+       __le64                  bi_hash_seed;
+       __le64                  bi_flags;
+       __le64                  bi_sectors;
+       __le64                  bi_size;
+       __le64                  bi_version;
+       __u8                    fields[0];
+} __attribute__((packed, aligned(8)));
+
+#define INODEv3_FIELDS_START_INITIAL   6
+#define INODEv3_FIELDS_START_CUR       (offsetof(struct bch_inode_v3, fields) / sizeof(u64))
+
 struct bch_inode_generation {
        struct bch_val          v;
 
@@ -705,7 +744,7 @@ struct bch_inode_generation {
  * bi_subvol and bi_parent_subvol are only set for subvolume roots:
  */
 
-#define BCH_INODE_FIELDS()                     \
+#define BCH_INODE_FIELDS_v2()                  \
        x(bi_atime,                     96)     \
        x(bi_ctime,                     96)     \
        x(bi_mtime,                     96)     \
@@ -732,6 +771,31 @@ struct bch_inode_generation {
        x(bi_subvol,                    32)     \
        x(bi_parent_subvol,             32)
 
+#define BCH_INODE_FIELDS_v3()                  \
+       x(bi_atime,                     96)     \
+       x(bi_ctime,                     96)     \
+       x(bi_mtime,                     96)     \
+       x(bi_otime,                     96)     \
+       x(bi_uid,                       32)     \
+       x(bi_gid,                       32)     \
+       x(bi_nlink,                     32)     \
+       x(bi_generation,                32)     \
+       x(bi_dev,                       32)     \
+       x(bi_data_checksum,             8)      \
+       x(bi_compression,               8)      \
+       x(bi_project,                   32)     \
+       x(bi_background_compression,    8)      \
+       x(bi_data_replicas,             8)      \
+       x(bi_promote_target,            16)     \
+       x(bi_foreground_target,         16)     \
+       x(bi_background_target,         16)     \
+       x(bi_erasure_code,              16)     \
+       x(bi_fields_set,                16)     \
+       x(bi_dir,                       64)     \
+       x(bi_dir_offset,                64)     \
+       x(bi_subvol,                    32)     \
+       x(bi_parent_subvol,             32)
+
 /* subset of BCH_INODE_FIELDS */
 #define BCH_INODE_OPTS()                       \
        x(data_checksum,                8)      \
@@ -757,16 +821,16 @@ enum {
         * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
         * flags)
         */
-       __BCH_INODE_SYNC        = 0,
-       __BCH_INODE_IMMUTABLE   = 1,
-       __BCH_INODE_APPEND      = 2,
-       __BCH_INODE_NODUMP      = 3,
-       __BCH_INODE_NOATIME     = 4,
+       __BCH_INODE_SYNC                = 0,
+       __BCH_INODE_IMMUTABLE           = 1,
+       __BCH_INODE_APPEND              = 2,
+       __BCH_INODE_NODUMP              = 3,
+       __BCH_INODE_NOATIME             = 4,
 
-       __BCH_INODE_I_SIZE_DIRTY= 5,
-       __BCH_INODE_I_SECTORS_DIRTY= 6,
-       __BCH_INODE_UNLINKED    = 7,
-       __BCH_INODE_BACKPTR_UNTRUSTED = 8,
+       __BCH_INODE_I_SIZE_DIRTY        = 5,
+       __BCH_INODE_I_SECTORS_DIRTY     = 6,
+       __BCH_INODE_UNLINKED            = 7,
+       __BCH_INODE_BACKPTR_UNTRUSTED   = 8,
 
        /* bits 20+ reserved for packed fields below: */
 };
@@ -788,6 +852,13 @@ LE32_BITMASK(INODE_NEW_VARINT,     struct bch_inode, bi_flags, 31, 32);
 LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
 LE64_BITMASK(INODEv2_NR_FIELDS,        struct bch_inode_v2, bi_flags, 24, 31);
 
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS,        struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+                               struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE,     struct bch_inode_v3, bi_flags, 36, 52);
+
 /* Dirents */
 
 /*
@@ -825,10 +896,9 @@ struct bch_dirent {
 #define DT_SUBVOL      16
 #define BCH_DT_MAX     17
 
-#define BCH_NAME_MAX   (U8_MAX * sizeof(u64) -                         \
+#define BCH_NAME_MAX   ((unsigned) (U8_MAX * sizeof(u64) -             \
                         sizeof(struct bkey) -                          \
-                        offsetof(struct bch_dirent, d_name))
-
+                        offsetof(struct bch_dirent, d_name)))
 
 /* Xattrs */
 
@@ -865,6 +935,12 @@ struct bch_alloc {
        x(stripe,               32)             \
        x(stripe_redundancy,    8)
 
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+       BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
 struct bch_alloc_v2 {
        struct bch_val          v;
        __u8                    nr_fields;
@@ -877,8 +953,8 @@ struct bch_alloc_v2 {
 #define BCH_ALLOC_FIELDS_V2()                  \
        x(read_time,            64)             \
        x(write_time,           64)             \
-       x(dirty_sectors,        16)             \
-       x(cached_sectors,       16)             \
+       x(dirty_sectors,        32)             \
+       x(cached_sectors,       32)             \
        x(stripe,               32)             \
        x(stripe_redundancy,    8)
 
@@ -893,12 +969,43 @@ struct bch_alloc_v3 {
        __u8                    data[];
 } __attribute__((packed, aligned(8)));
 
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
-       BCH_ALLOC_FIELDS_V1()
-#undef x
-       BCH_ALLOC_FIELD_NR
-};
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
+
+struct bch_alloc_v4 {
+       struct bch_val          v;
+       __u64                   journal_seq;
+       __u32                   flags;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    stripe_redundancy;
+       __u32                   dirty_sectors;
+       __u32                   cached_sectors;
+       __u64                   io_time[2];
+       __u32                   stripe;
+       __u32                   nr_external_backpointers;
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_V4_U64s_V0   6
+#define BCH_ALLOC_V4_U64s      (sizeof(struct bch_alloc_v4) / sizeof(u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD,     struct bch_alloc_v4, flags,  0,  1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,     struct bch_alloc_v4, flags,  1,  2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,  struct bch_alloc_v4, flags,  8,  14)
+
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX       40
+
+struct bch_backpointer {
+       struct bch_val          v;
+       __u8                    btree_id;
+       __u8                    level;
+       __u8                    data_type;
+       __u64                   bucket_offset:40;
+       __u32                   bucket_len;
+       struct bpos             pos;
+} __attribute__((packed, aligned(8)));
 
 /* Quotas: */
 
@@ -938,7 +1045,7 @@ struct bch_stripe {
        __u8                    csum_type;
        __u8                    pad;
 
-       struct bch_extent_ptr   ptrs[0];
+       struct bch_extent_ptr   ptrs[];
 } __attribute__((packed, aligned(8)));
 
 /* Reflink: */
@@ -1015,6 +1122,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED,       struct bch_snapshot, flags,  0,  1)
 /* True if a subvolume points to this snapshot node: */
 LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
 
+/* LRU btree: */
+
+struct bch_lru {
+       struct bch_val          v;
+       __le64                  idx;
+} __attribute__((packed, aligned(8)));
+
+#define LRU_ID_STRIPES         (1U << 16)
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1023,16 +1139,18 @@ struct bch_sb_field {
        __le32                  type;
 };
 
-#define BCH_SB_FIELDS()                \
-       x(journal,      0)      \
-       x(members,      1)      \
-       x(crypt,        2)      \
-       x(replicas_v0,  3)      \
-       x(quota,        4)      \
-       x(disk_groups,  5)      \
-       x(clean,        6)      \
-       x(replicas,     7)      \
-       x(journal_seq_blacklist, 8)
+#define BCH_SB_FIELDS()                                \
+       x(journal,      0)                      \
+       x(members,      1)                      \
+       x(crypt,        2)                      \
+       x(replicas_v0,  3)                      \
+       x(quota,        4)                      \
+       x(disk_groups,  5)                      \
+       x(clean,        6)                      \
+       x(replicas,     7)                      \
+       x(journal_seq_blacklist, 8)             \
+       x(journal_v2,   9)                      \
+       x(counters,     10)
 
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
@@ -1041,6 +1159,14 @@ enum bch_sb_field_type {
        BCH_SB_FIELD_NR
 };
 
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS            \
+       ((1U << BCH_SB_FIELD_journal)|          \
+        (1U << BCH_SB_FIELD_journal_v2))
+
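
A hedged sketch of how such a mask is typically consumed when deciding whether a field should be copied to every member device; the helper name here is invented for illustration and is not part of this patch:

/* Illustrative only - not bcachefs API: */
static inline bool sb_field_is_single_device(enum bch_sb_field_type type)
{
	return type < 32 && (BCH_SINGLE_DEVICE_SB_FIELDS & (1U << type));
}
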
 /* BCH_SB_FIELD_journal: */
 
 struct bch_sb_field_journal {
@@ -1048,6 +1174,15 @@ struct bch_sb_field_journal {
        __le64                  buckets[0];
 };
 
+struct bch_sb_field_journal_v2 {
+       struct bch_sb_field     field;
+
+       struct bch_sb_field_journal_v2_entry {
+               __le64          start;
+               __le64          nr;
+       }                       d[0];
+};
+
 /* BCH_SB_FIELD_members: */
 
 #define BCH_MIN_NR_NBUCKETS    (1 << 6)
@@ -1069,6 +1204,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD,  struct bch_member, flags[0], 14, 15)
 LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
 LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
 LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+                                       struct bch_member, flags[0], 30, 31)
 
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
@@ -1144,13 +1281,16 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P,  struct bch_sb_field_crypt, kdf_flags, 32, 48);
 /* BCH_SB_FIELD_replicas: */
 
 #define BCH_DATA_TYPES()               \
-       x(none,         0)              \
+       x(free,         0)              \
        x(sb,           1)              \
        x(journal,      2)              \
        x(btree,        3)              \
        x(user,         4)              \
        x(cached,       5)              \
-       x(parity,       6)
+       x(parity,       6)              \
+       x(stripe,       7)              \
+       x(need_gc_gens, 8)              \
+       x(need_discard, 9)
 
 enum bch_data_type {
 #define x(t, n) BCH_DATA_##t,
@@ -1159,22 +1299,45 @@ enum bch_data_type {
        BCH_DATA_NR
 };
 
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+       switch (type) {
+       case BCH_DATA_free:
+       case BCH_DATA_need_gc_gens:
+       case BCH_DATA_need_discard:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+       switch (type) {
+       case BCH_DATA_sb:
+       case BCH_DATA_journal:
+               return true;
+       default:
+               return false;
+       }
+}
+
 struct bch_replicas_entry_v0 {
        __u8                    data_type;
        __u8                    nr_devs;
-       __u8                    devs[0];
+       __u8                    devs[];
 } __attribute__((packed));
 
 struct bch_sb_field_replicas_v0 {
        struct bch_sb_field     field;
-       struct bch_replicas_entry_v0 entries[0];
+       struct bch_replicas_entry_v0 entries[];
 } __attribute__((packed, aligned(8)));
 
 struct bch_replicas_entry {
        __u8                    data_type;
        __u8                    nr_devs;
        __u8                    nr_required;
-       __u8                    devs[0];
+       __u8                    devs[];
 } __attribute__((packed));
 
 #define replicas_entry_bytes(_i)                                       \
@@ -1220,6 +1383,97 @@ struct bch_sb_field_disk_groups {
        struct bch_disk_group   entries[0];
 } __attribute__((packed, aligned(8)));
 
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS()                              \
+       x(io_read,                                      0)      \
+       x(io_write,                                     1)      \
+       x(io_move,                                      2)      \
+       x(bucket_invalidate,                            3)      \
+       x(bucket_discard,                               4)      \
+       x(bucket_alloc,                                 5)      \
+       x(bucket_alloc_fail,                            6)      \
+       x(btree_cache_scan,                             7)      \
+       x(btree_cache_reap,                             8)      \
+       x(btree_cache_cannibalize,                      9)      \
+       x(btree_cache_cannibalize_lock,                 10)     \
+       x(btree_cache_cannibalize_lock_fail,            11)     \
+       x(btree_cache_cannibalize_unlock,               12)     \
+       x(btree_node_write,                             13)     \
+       x(btree_node_read,                              14)     \
+       x(btree_node_compact,                           15)     \
+       x(btree_node_merge,                             16)     \
+       x(btree_node_split,                             17)     \
+       x(btree_node_rewrite,                           18)     \
+       x(btree_node_alloc,                             19)     \
+       x(btree_node_free,                              20)     \
+       x(btree_node_set_root,                          21)     \
+       x(btree_path_relock_fail,                       22)     \
+       x(btree_path_upgrade_fail,                      23)     \
+       x(btree_reserve_get_fail,                       24)     \
+       x(journal_entry_full,                           25)     \
+       x(journal_full,                                 26)     \
+       x(journal_reclaim_finish,                       27)     \
+       x(journal_reclaim_start,                        28)     \
+       x(journal_write,                                29)     \
+       x(read_promote,                                 30)     \
+       x(read_bounce,                                  31)     \
+       x(read_split,                                   33)     \
+       x(read_retry,                                   32)     \
+       x(read_reuse_race,                              34)     \
+       x(move_extent_read,                             35)     \
+       x(move_extent_write,                            36)     \
+       x(move_extent_finish,                           37)     \
+       x(move_extent_race,                             38)     \
+       x(move_extent_alloc_mem_fail,                   39)     \
+       x(copygc,                                       40)     \
+       x(copygc_wait,                                  41)     \
+       x(gc_gens_end,                                  42)     \
+       x(gc_gens_start,                                43)     \
+       x(trans_blocked_journal_reclaim,                44)     \
+       x(trans_restart_btree_node_reused,              45)     \
+       x(trans_restart_btree_node_split,               46)     \
+       x(trans_restart_fault_inject,                   47)     \
+       x(trans_restart_iter_upgrade,                   48)     \
+       x(trans_restart_journal_preres_get,             49)     \
+       x(trans_restart_journal_reclaim,                50)     \
+       x(trans_restart_journal_res_get,                51)     \
+       x(trans_restart_key_cache_key_realloced,        52)     \
+       x(trans_restart_key_cache_raced,                53)     \
+       x(trans_restart_mark_replicas,                  54)     \
+       x(trans_restart_mem_realloced,                  55)     \
+       x(trans_restart_memory_allocation_failure,      56)     \
+       x(trans_restart_relock,                         57)     \
+       x(trans_restart_relock_after_fill,              58)     \
+       x(trans_restart_relock_key_cache_fill,          59)     \
+       x(trans_restart_relock_next_node,               60)     \
+       x(trans_restart_relock_parent_for_fill,         61)     \
+       x(trans_restart_relock_path,                    62)     \
+       x(trans_restart_relock_path_intent,             63)     \
+       x(trans_restart_too_many_iters,                 64)     \
+       x(trans_restart_traverse,                       65)     \
+       x(trans_restart_upgrade,                        66)     \
+       x(trans_restart_would_deadlock,                 67)     \
+       x(trans_restart_would_deadlock_write,           68)     \
+       x(trans_restart_injected,                       69)     \
+       x(trans_restart_key_cache_upgrade,              70)     \
+       x(trans_traverse_all,                           71)     \
+       x(transaction_commit,                           72)     \
+       x(write_super,                                  73)     \
+       x(trans_restart_would_deadlock_recursion_limit, 74)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+       struct bch_sb_field     field;
+       __le64                  d[0];
+};
+
 /*
  * On clean shutdown, store btree roots and current journal sequence number in
  * the superblock:
@@ -1275,19 +1529,28 @@ struct bch_sb_field_journal_seq_blacklist {
 #define BCH_JSET_VERSION_OLD                   2
 #define BCH_BSET_VERSION_OLD                   3
 
+#define BCH_METADATA_VERSIONS()                                \
+       x(bkey_renumber,                10)             \
+       x(inode_btree_change,           11)             \
+       x(snapshot,                     12)             \
+       x(inode_backpointers,           13)             \
+       x(btree_ptr_sectors_written,    14)             \
+       x(snapshot_2,                   15)             \
+       x(reflink_p_fix,                16)             \
+       x(subvol_dirent,                17)             \
+       x(inode_v2,                     18)             \
+       x(freespace,                    19)             \
+       x(alloc_v4,                     20)             \
+       x(new_data_types,               21)             \
+       x(backpointers,                 22)             \
+       x(inode_v3,                     23)
+
 enum bcachefs_metadata_version {
-       bcachefs_metadata_version_min                   = 9,
-       bcachefs_metadata_version_new_versioning        = 10,
-       bcachefs_metadata_version_bkey_renumber         = 10,
-       bcachefs_metadata_version_inode_btree_change    = 11,
-       bcachefs_metadata_version_snapshot              = 12,
-       bcachefs_metadata_version_inode_backpointers    = 13,
-       bcachefs_metadata_version_btree_ptr_sectors_written = 14,
-       bcachefs_metadata_version_snapshot_2            = 15,
-       bcachefs_metadata_version_reflink_p_fix         = 16,
-       bcachefs_metadata_version_subvol_dirent         = 17,
-       bcachefs_metadata_version_inode_v2              = 18,
-       bcachefs_metadata_version_max                   = 19,
+       bcachefs_metadata_version_min = 9,
+#define x(t, n)        bcachefs_metadata_version_##t = n,
+       BCH_METADATA_VERSIONS()
+#undef x
+       bcachefs_metadata_version_max
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
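
Spelling out the arithmetic implied by the list above (no new behaviour, just the expansion):

/*
 * The x-macro gives each version its listed value, and the final enumerator
 * follows on from the last one:
 *
 *	bcachefs_metadata_version_backpointers = 22,
 *	bcachefs_metadata_version_inode_v3     = 23,
 *	bcachefs_metadata_version_max          = 24,
 *
 * so bcachefs_metadata_version_current evaluates to 23 (inode_v3).
 */
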
@@ -1427,6 +1690,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+/* Obsolete, always enabled: */
 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 
 /*
@@ -1663,7 +1927,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(data_usage,           6)              \
        x(clock,                7)              \
        x(dev_usage,            8)              \
-       x(log,                  9)
+       x(log,                  9)              \
+       x(overwrite,            10)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1735,7 +2000,7 @@ struct jset_entry_dev_usage {
        __u32                   pad;
 
        __le64                  buckets_ec;
-       __le64                  buckets_unavailable;
+       __le64                  _buckets_unavailable; /* No longer used */
 
        struct jset_entry_dev_usage_type d[];
 } __attribute__((packed));
@@ -1804,7 +2069,11 @@ LE32_BITMASK(JSET_NO_FLUSH,      struct jset, flags, 5, 6);
        x(stripes,      6)                      \
        x(reflink,      7)                      \
        x(subvolumes,   8)                      \
-       x(snapshots,    9)
+       x(snapshots,    9)                      \
+       x(lru,          10)                     \
+       x(freespace,    11)                     \
+       x(need_discard, 12)                     \
+       x(backpointers, 13)
 
 enum btree_id {
 #define x(kwd, val) BTREE_ID_##kwd = val,
index 930981ad55355a2ad64eea94681fc6010b50fbb7..b2edabf58260d4ea1e312aae44cfee68b25fa810 100644 (file)
@@ -285,13 +285,14 @@ struct bch_ioctl_dev_usage {
 
        __u32                   bucket_size;
        __u64                   nr_buckets;
-       __u64                   available_buckets;
 
-       __u64                   buckets[BCH_DATA_NR];
-       __u64                   sectors[BCH_DATA_NR];
+       __u64                   buckets_ec;
 
-       __u64                   ec_buckets;
-       __u64                   ec_sectors;
+       struct bch_ioctl_dev_usage_type {
+               __u64           buckets;
+               __u64           sectors;
+               __u64           fragmented;
+       }                       d[BCH_DATA_NR];
 };
 
 /*
index 946dd27f09fca86502a94c377020e234b2caa54b..f7e5d0c377eb0cc6443244de8110c2d6423c752f 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "bkey.h"
+#include "bkey_cmp.h"
 #include "bkey_methods.h"
 #include "bset.h"
 #include "util.h"
@@ -19,33 +20,49 @@ const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
 struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
                              const struct bkey_packed *);
 
-void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+                                    const struct bkey_format *f,
+                                    const struct bkey_packed *k)
 {
-       unsigned bit = high_bit_offset, done = 0;
+       const u64 *p = high_word(f, k);
+       unsigned word_bits = 64 - high_bit_offset;
+       unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+       u64 v = *p & (~0ULL >> high_bit_offset);
+
+       if (!nr_key_bits) {
+               prt_str(out, "(empty)");
+               return;
+       }
 
        while (1) {
-               while (bit < 64) {
-                       if (done && !(done % 8))
-                               *out++ = ' ';
-                       *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
-                       bit++;
-                       done++;
-                       if (done == nr_bits) {
-                               *out++ = '\0';
-                               return;
-                       }
+               unsigned next_key_bits = nr_key_bits;
+
+               if (nr_key_bits < 64) {
+                       v >>= 64 - nr_key_bits;
+                       next_key_bits = 0;
+               } else {
+                       next_key_bits -= 64;
                }
 
+               bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+
+               if (!next_key_bits)
+                       break;
+
+               prt_char(out, ' ');
+
                p = next_word(p);
-               bit = 0;
+               v = *p;
+               word_bits = 64;
+               nr_key_bits = next_key_bits;
        }
 }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 
 static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
-                                const struct bkey *unpacked,
-                                const struct bkey_format *format)
+                                 const struct bkey *unpacked,
+                                 const struct bkey_format *format)
 {
        struct bkey tmp;
 
@@ -57,22 +74,35 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
        tmp = __bch2_bkey_unpack_key(format, packed);
 
        if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
-               char buf1[160], buf2[160];
-               char buf3[160], buf4[160];
-
-               bch2_bkey_to_text(&PBUF(buf1), unpacked);
-               bch2_bkey_to_text(&PBUF(buf2), &tmp);
-               bch2_to_binary(buf3, (void *) unpacked, 80);
-               bch2_to_binary(buf4, high_word(format, packed), 80);
+               struct printbuf buf = PRINTBUF;
 
-               panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+               prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
                      format->key_u64s,
                      format->bits_per_field[0],
                      format->bits_per_field[1],
                      format->bits_per_field[2],
                      format->bits_per_field[3],
-                     format->bits_per_field[4],
-                     buf1, buf2, buf3, buf4);
+                     format->bits_per_field[4]);
+
+               prt_printf(&buf, "compiled unpack: ");
+               bch2_bkey_to_text(&buf, unpacked);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "c unpack:        ");
+               bch2_bkey_to_text(&buf, &tmp);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "compiled unpack: ");
+               bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+                                               (struct bkey_packed *) unpacked);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "c unpack:        ");
+               bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+                                               (struct bkey_packed *) &tmp);
+               prt_newline(&buf);
+
+               panic("%s", buf.buf);
        }
 }
 
@@ -201,9 +231,10 @@ static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
 {
        struct pack_state out_s = pack_state_init(out_f, out);
        struct unpack_state in_s = unpack_state_init(in_f, in);
+       u64 *w = out->_data;
        unsigned i;
 
-       out->_data[0] = 0;
+       *w = 0;
 
        for (i = 0; i < BKEY_NR_FIELDS; i++)
                if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
@@ -292,12 +323,13 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
                   const struct bkey_format *format)
 {
        struct pack_state state = pack_state_init(format, out);
+       u64 *w = out->_data;
 
        EBUG_ON((void *) in == (void *) out);
        EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
        EBUG_ON(in->format != KEY_FORMAT_CURRENT);
 
-       out->_data[0] = 0;
+       *w = 0;
 
 #define x(id, field)   if (!set_inc_field(&state, id, in->field)) return false;
        bkey_fields()
@@ -439,6 +471,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
 {
        const struct bkey_format *f = &b->format;
        struct pack_state state = pack_state_init(f, out);
+       u64 *w = out->_data;
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct bpos orig = in;
 #endif
@@ -451,7 +484,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
         * enough - we need to make sure to zero them out:
         */
        for (i = 0; i < f->key_u64s; i++)
-               out->_data[i] = 0;
+               w[i] = 0;
 
        if (unlikely(in.snapshot <
                     le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
@@ -731,50 +764,6 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
 
 #ifdef CONFIG_X86_64
 
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
-                                 unsigned nr_key_bits)
-{
-       long d0, d1, d2, d3;
-       int cmp;
-
-       /* we shouldn't need asm for this, but gcc is being retarded: */
-
-       asm(".intel_syntax noprefix;"
-           "xor eax, eax;"
-           "xor edx, edx;"
-           "1:;"
-           "mov r8, [rdi];"
-           "mov r9, [rsi];"
-           "sub ecx, 64;"
-           "jl 2f;"
-
-           "cmp r8, r9;"
-           "jnz 3f;"
-
-           "lea rdi, [rdi - 8];"
-           "lea rsi, [rsi - 8];"
-           "jmp 1b;"
-
-           "2:;"
-           "not ecx;"
-           "shr r8, 1;"
-           "shr r9, 1;"
-           "shr r8, cl;"
-           "shr r9, cl;"
-           "cmp r8, r9;"
-
-           "3:\n"
-           "seta al;"
-           "setb dl;"
-           "sub eax, edx;"
-           ".att_syntax prefix;"
-           : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
-           : "0" (l), "1" (r), "3" (nr_key_bits)
-           : "r8", "r9", "cc", "memory");
-
-       return cmp;
-}
-
 #define I(_x)                  (*(out)++ = (_x))
 #define I1(i0)                                         I(i0)
 #define I2(i0, i1)             (I1(i0),                I(i1))
@@ -1005,40 +994,6 @@ int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
 }
 
 #else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
-                                 unsigned nr_key_bits)
-{
-       u64 l_v, r_v;
-
-       if (!nr_key_bits)
-               return 0;
-
-       /* for big endian, skip past header */
-       nr_key_bits += high_bit_offset;
-       l_v = *l & (~0ULL >> high_bit_offset);
-       r_v = *r & (~0ULL >> high_bit_offset);
-
-       while (1) {
-               if (nr_key_bits < 64) {
-                       l_v >>= 64 - nr_key_bits;
-                       r_v >>= 64 - nr_key_bits;
-                       nr_key_bits = 0;
-               } else {
-                       nr_key_bits -= 64;
-               }
-
-               if (!nr_key_bits || l_v != r_v)
-                       break;
-
-               l = next_word(l);
-               r = next_word(r);
-
-               l_v = *l;
-               r_v = *r;
-       }
-
-       return cmp_int(l_v, r_v);
-}
 #endif
 
 __pure
@@ -1046,19 +1001,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
                                          const struct bkey_packed *r,
                                          const struct btree *b)
 {
-       const struct bkey_format *f = &b->format;
-       int ret;
-
-       EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
-       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
-       ret = __bkey_cmp_bits(high_word(f, l),
-                             high_word(f, r),
-                             b->nr_key_bits);
-
-       EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
-                               bkey_unpack_pos(b, r)));
-       return ret;
+       return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
 }
 
 __pure __flatten
@@ -1074,20 +1017,7 @@ int bch2_bkey_cmp_packed(const struct btree *b,
                         const struct bkey_packed *l,
                         const struct bkey_packed *r)
 {
-       struct bkey unpacked;
-
-       if (likely(bkey_packed(l) && bkey_packed(r)))
-               return __bch2_bkey_cmp_packed_format_checked(l, r, b);
-
-       if (bkey_packed(l)) {
-               __bkey_unpack_key_format_checked(b, &unpacked, l);
-               l = (void*) &unpacked;
-       } else if (bkey_packed(r)) {
-               __bkey_unpack_key_format_checked(b, &unpacked, r);
-               r = (void*) &unpacked;
-       }
-
-       return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+       return bch2_bkey_cmp_packed_inlined(b, l, r);
 }
 
 __pure __flatten
index 7dee3d8e0a3d169160fab7018c6fe1ef55660eb5..19b59ffe0a98fbde8828feb9be1d76641a13e9ab 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/bug.h>
 #include "bcachefs_format.h"
 
+#include "btree_types.h"
 #include "util.h"
 #include "vstructs.h"
 
@@ -12,7 +13,9 @@
 #define HAVE_BCACHEFS_COMPILED_UNPACK  1
 #endif
 
-void bch2_to_binary(char *, const u64 *, unsigned);
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+                                    const struct bkey_format *,
+                                    const struct bkey_packed *);
 
 /* bkey with split value, const */
 struct bkey_s_c {
@@ -42,12 +45,15 @@ static inline size_t bkey_val_bytes(const struct bkey *k)
 
 static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
 {
-       k->u64s = BKEY_U64s + val_u64s;
+       unsigned u64s = BKEY_U64s + val_u64s;
+
+       BUG_ON(u64s > U8_MAX);
+       k->u64s = u64s;
 }
 
 static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 {
-       k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
+       set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
 }
 
 #define bkey_val_end(_k)       ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
@@ -129,8 +135,9 @@ int bkey_cmp_left_packed(const struct btree *b,
 }
 
 /*
- * we prefer to pass bpos by ref, but it's often enough terribly convenient to
- * pass it by by val... as much as I hate c++, const ref would be nice here:
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * enough terribly convenient to pass it by val... as much as I hate c++, const
+ * ref would be nice here:
  */
 __pure __flatten
 static inline int bkey_cmp_left_packed_byval(const struct btree *b,
@@ -351,6 +358,99 @@ void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
 bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
               const struct bkey_format *);
 
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+                              struct bkey *dst,
+                              const struct bkey_packed *src)
+{
+       if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+               compiled_unpack_fn unpack_fn = b->aux_data;
+               unpack_fn(dst, src);
+
+               if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+                   bch2_expensive_debug_checks) {
+                       struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+                       BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+               }
+       } else {
+               *dst = __bch2_bkey_unpack_key(&b->format, src);
+       }
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+                              const struct bkey_packed *src)
+{
+       struct bkey dst;
+
+       __bkey_unpack_key_format_checked(b, &dst, src);
+       return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+                                    struct bkey *dst,
+                                    const struct bkey_packed *src)
+{
+       if (likely(bkey_packed(src)))
+               __bkey_unpack_key_format_checked(b, dst, src);
+       else
+               *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+                                         const struct bkey_packed *src)
+{
+       return likely(bkey_packed(src))
+               ? bkey_unpack_key_format_checked(b, src)
+               : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+                              const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+       return bkey_unpack_key_format_checked(b, src).p;
+#else
+       return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+                                         const struct bkey_packed *src)
+{
+       return likely(bkey_packed(src))
+               ? bkey_unpack_pos_format_checked(b, src)
+               : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
+                                              const struct bkey_packed *k,
+                                              struct bkey *u)
+{
+       __bkey_unpack_key(b, u, k);
+
+       return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
+                                              struct bkey_packed *k,
+                                              struct bkey *u)
+{
+       __bkey_unpack_key(b, u, k);
+
+       return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
 static inline u64 bkey_field_max(const struct bkey_format *f,
                                 enum bch_bkey_fields nr)
 {
diff --git a/libbcachefs/bkey_buf.h b/libbcachefs/bkey_buf.h
index 0d7c67a959af14baf8efa98be0ea04150f9137ed..a30c4ae8eb369db29c3a5d1333b418b5972efbac 100644 (file)
--- a/libbcachefs/bkey_buf.h
+++ b/libbcachefs/bkey_buf.h
@@ -3,6 +3,7 @@
 #define _BCACHEFS_BKEY_BUF_H
 
 #include "bcachefs.h"
+#include "bkey.h"
 
 struct bkey_buf {
        struct bkey_i   *k;
diff --git a/libbcachefs/bkey_cmp.h b/libbcachefs/bkey_cmp.h
new file mode 100644 (file)
index 0000000..5f42a6e
--- /dev/null
+++ b/libbcachefs/bkey_cmp.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+                                 unsigned nr_key_bits)
+{
+       long d0, d1, d2, d3;
+       int cmp;
+
+       /* we shouldn't need asm for this, but gcc is being retarded: */
+
+       asm(".intel_syntax noprefix;"
+           "xor eax, eax;"
+           "xor edx, edx;"
+           "1:;"
+           "mov r8, [rdi];"
+           "mov r9, [rsi];"
+           "sub ecx, 64;"
+           "jl 2f;"
+
+           "cmp r8, r9;"
+           "jnz 3f;"
+
+           "lea rdi, [rdi - 8];"
+           "lea rsi, [rsi - 8];"
+           "jmp 1b;"
+
+           "2:;"
+           "not ecx;"
+           "shr r8, 1;"
+           "shr r9, 1;"
+           "shr r8, cl;"
+           "shr r9, cl;"
+           "cmp r8, r9;"
+
+           "3:\n"
+           "seta al;"
+           "setb dl;"
+           "sub eax, edx;"
+           ".att_syntax prefix;"
+           : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+           : "0" (l), "1" (r), "3" (nr_key_bits)
+           : "r8", "r9", "cc", "memory");
+
+       return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+                                 unsigned nr_key_bits)
+{
+       u64 l_v, r_v;
+
+       if (!nr_key_bits)
+               return 0;
+
+       /* for big endian, skip past header */
+       nr_key_bits += high_bit_offset;
+       l_v = *l & (~0ULL >> high_bit_offset);
+       r_v = *r & (~0ULL >> high_bit_offset);
+
+       while (1) {
+               if (nr_key_bits < 64) {
+                       l_v >>= 64 - nr_key_bits;
+                       r_v >>= 64 - nr_key_bits;
+                       nr_key_bits = 0;
+               } else {
+                       nr_key_bits -= 64;
+               }
+
+               if (!nr_key_bits || l_v != r_v)
+                       break;
+
+               l = next_word(l);
+               r = next_word(r);
+
+               l_v = *l;
+               r_v = *r;
+       }
+
+       return cmp_int(l_v, r_v);
+}
+#endif
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+                                         const struct bkey_packed *r,
+                                         const struct btree *b)
+{
+       const struct bkey_format *f = &b->format;
+       int ret;
+
+       EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+       ret = __bkey_cmp_bits(high_word(f, l),
+                             high_word(f, r),
+                             b->nr_key_bits);
+
+       EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+                               bkey_unpack_pos(b, r)));
+       return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+                        const struct bkey_packed *l,
+                        const struct bkey_packed *r)
+{
+       struct bkey unpacked;
+
+       if (likely(bkey_packed(l) && bkey_packed(r)))
+               return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+       if (bkey_packed(l)) {
+               __bkey_unpack_key_format_checked(b, &unpacked, l);
+               l = (void *) &unpacked;
+       } else if (bkey_packed(r)) {
+               __bkey_unpack_key_format_checked(b, &unpacked, r);
+               r = (void *) &unpacked;
+       }
+
+       return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
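A usage sketch (not part of this commit): hot paths that want the comparison fully inlined can include bkey_cmp.h directly, the way sort_keys_cmp() in bkey_sort.c does below, while other callers keep using the out-of-line bch2_bkey_cmp_packed(). The comparator name here is invented for illustration:

#include "bkey_cmp.h"

/* Illustrative comparator: packed-key order, breaking ties on the deleted bit
 * in the same way sort_keys_cmp() below does. */
static inline int example_keys_cmp(struct btree *b,
				   struct bkey_packed *l,
				   struct bkey_packed *r)
{
	return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
		(int) bkey_deleted(r) - (int) bkey_deleted(l);
}
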
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index e83aeb683a0977c84f82b0e4559c1ccc1e2d3194..14d910a3077ffa49a6522423563bc1de0df632fc 100644 (file)
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "backpointers.h"
 #include "bkey_methods.h"
 #include "btree_types.h"
 #include "alloc_background.h"
@@ -9,6 +10,7 @@
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
+#include "lru.h"
 #include "quota.h"
 #include "reflink.h"
 #include "subvolume.h"
@@ -21,10 +23,10 @@ const char * const bch2_bkey_types[] = {
        NULL
 };
 
-static const char *deleted_key_invalid(const struct bch_fs *c,
-                                       struct bkey_s_c k)
+static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                              int rw, struct printbuf *err)
 {
-       return NULL;
+       return 0;
 }
 
 #define bch2_bkey_ops_deleted (struct bkey_ops) {      \
@@ -35,25 +37,32 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
        .key_invalid = deleted_key_invalid,             \
 }
 
-static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                                int rw, struct printbuf *err)
 {
-       if (bkey_val_bytes(k.k))
-               return "value size should be zero";
+       if (bkey_val_bytes(k.k)) {
+               prt_printf(err, "incorrect value size (%zu != 0)",
+                      bkey_val_bytes(k.k));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 #define bch2_bkey_ops_error (struct bkey_ops) {                \
        .key_invalid = empty_val_key_invalid,           \
 }
 
-static const char *key_type_cookie_invalid(const struct bch_fs *c,
-                                          struct bkey_s_c k)
+static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                                  int rw, struct printbuf *err)
 {
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie))
-               return "incorrect value size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_cookie));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 #define bch2_bkey_ops_cookie (struct bkey_ops) {       \
@@ -64,10 +73,10 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
        .key_invalid = empty_val_key_invalid,           \
 }
 
-static const char *key_type_inline_data_invalid(const struct bch_fs *c,
-                                          struct bkey_s_c k)
+static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                                       int rw, struct printbuf *err)
 {
-       return NULL;
+       return 0;
 }
 
 static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
@@ -76,7 +85,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
        struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
        unsigned datalen = bkey_inline_data_bytes(k.k);
 
-       pr_buf(out, "datalen %u: %*phN",
+       prt_printf(out, "datalen %u: %*phN",
               datalen, min(datalen, 32U), d.v->data);
 }
 
@@ -85,18 +94,44 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
        .val_to_text    = key_type_inline_data_to_text, \
 }
 
+static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                               int rw, struct printbuf *err)
+{
+       if (bkey_val_bytes(k.k)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_cookie));
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+       bch2_key_resize(l.k, l.k->size + r.k->size);
+       return true;
+}
+
+#define bch2_bkey_ops_set (struct bkey_ops) {          \
+       .key_invalid    = key_type_set_invalid,         \
+       .key_merge      = key_type_set_merge,           \
+}
+
 const struct bkey_ops bch2_bkey_ops[] = {
 #define x(name, nr) [KEY_TYPE_##name]  = bch2_bkey_ops_##name,
        BCH_BKEY_TYPES()
 #undef x
 };
 
-const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
 {
-       if (k.k->type >= KEY_TYPE_MAX)
-               return "invalid type";
+       if (k.k->type >= KEY_TYPE_MAX) {
+               prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX);
+               return -EINVAL;
+       }
 
-       return bch2_bkey_ops[k.k->type].key_invalid(c, k);
+       return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err);
 }
 
 static unsigned bch2_key_types_allowed[] = {
@@ -114,6 +149,7 @@ static unsigned bch2_key_types_allowed[] = {
                (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_inode)|
                (1U << KEY_TYPE_inode_v2)|
+               (1U << KEY_TYPE_inode_v3)|
                (1U << KEY_TYPE_inode_generation),
        [BKEY_TYPE_dirents] =
                (1U << KEY_TYPE_deleted)|
@@ -130,7 +166,8 @@ static unsigned bch2_key_types_allowed[] = {
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_alloc)|
                (1U << KEY_TYPE_alloc_v2)|
-               (1U << KEY_TYPE_alloc_v3),
+               (1U << KEY_TYPE_alloc_v3)|
+               (1U << KEY_TYPE_alloc_v4),
        [BKEY_TYPE_quotas] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_quota),
@@ -147,112 +184,145 @@ static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_snapshots] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_snapshot),
+       [BKEY_TYPE_lru] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_lru),
+       [BKEY_TYPE_freespace] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_set),
+       [BKEY_TYPE_need_discard] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_set),
+       [BKEY_TYPE_backpointers] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_backpointer),
        [BKEY_TYPE_btree] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_btree_ptr)|
                (1U << KEY_TYPE_btree_ptr_v2),
 };
 
-const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
-                               enum btree_node_type type)
+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+                       enum btree_node_type type,
+                       int rw, struct printbuf *err)
 {
-       if (k.k->u64s < BKEY_U64s)
-               return "u64s too small";
-
-       if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
-               return "invalid key type for this btree";
+       if (k.k->u64s < BKEY_U64s) {
+               prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+               return -EINVAL;
+       }
 
-       if (type == BKEY_TYPE_btree &&
-           bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
-               return "value too big";
+       if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) {
+               prt_printf(err, "invalid key type for btree %s (%s)",
+                          bch2_btree_ids[type], bch2_bkey_types[type]);
+               return -EINVAL;
+       }
 
        if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
-               if (k.k->size == 0)
-                       return "bad size field";
+               if (k.k->size == 0) {
+                       prt_printf(err, "size == 0");
+                       return -EINVAL;
+               }
 
-               if (k.k->size > k.k->p.offset)
-                       return "size greater than offset";
+               if (k.k->size > k.k->p.offset) {
+                       prt_printf(err, "size greater than offset (%u > %llu)",
+                              k.k->size, k.k->p.offset);
+                       return -EINVAL;
+               }
        } else {
-               if (k.k->size)
-                       return "nonzero size field";
+               if (k.k->size) {
+                       prt_printf(err, "size != 0");
+                       return -EINVAL;
+               }
        }
 
        if (type != BKEY_TYPE_btree &&
            !btree_type_has_snapshots(type) &&
-           k.k->p.snapshot)
-               return "nonzero snapshot";
+           k.k->p.snapshot) {
+               prt_printf(err, "nonzero snapshot");
+               return -EINVAL;
+       }
 
        if (type != BKEY_TYPE_btree &&
            btree_type_has_snapshots(type) &&
-           !k.k->p.snapshot)
-               return "invalid snapshot field";
+           !k.k->p.snapshot) {
+               prt_printf(err, "snapshot == 0");
+               return -EINVAL;
+       }
 
        if (type != BKEY_TYPE_btree &&
-           !bkey_cmp(k.k->p, POS_MAX))
-               return "POS_MAX key";
+           !bkey_cmp(k.k->p, POS_MAX)) {
+               prt_printf(err, "key at POS_MAX");
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
-const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
-                             enum btree_node_type type)
+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+                     enum btree_node_type type,
+                     int rw, struct printbuf *err)
 {
-       return __bch2_bkey_invalid(c, k, type) ?:
-               bch2_bkey_val_invalid(c, k);
+       return __bch2_bkey_invalid(c, k, type, rw, err) ?:
+               bch2_bkey_val_invalid(c, k, rw, err);
 }
 
-const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
+                           struct printbuf *err)
 {
-       if (bpos_cmp(k.k->p, b->data->min_key) < 0)
-               return "key before start of btree node";
+       if (bpos_cmp(k.k->p, b->data->min_key) < 0) {
+               prt_printf(err, "key before start of btree node");
+               return -EINVAL;
+       }
 
-       if (bpos_cmp(k.k->p, b->data->max_key) > 0)
-               return "key past end of btree node";
+       if (bpos_cmp(k.k->p, b->data->max_key) > 0) {
+               prt_printf(err, "key past end of btree node");
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 {
        if (!bpos_cmp(pos, POS_MIN))
-               pr_buf(out, "POS_MIN");
+               prt_printf(out, "POS_MIN");
        else if (!bpos_cmp(pos, POS_MAX))
-               pr_buf(out, "POS_MAX");
+               prt_printf(out, "POS_MAX");
        else if (!bpos_cmp(pos, SPOS_MAX))
-               pr_buf(out, "SPOS_MAX");
+               prt_printf(out, "SPOS_MAX");
        else {
                if (pos.inode == U64_MAX)
-                       pr_buf(out, "U64_MAX");
+                       prt_printf(out, "U64_MAX");
                else
-                       pr_buf(out, "%llu", pos.inode);
-               pr_buf(out, ":");
+                       prt_printf(out, "%llu", pos.inode);
+               prt_printf(out, ":");
                if (pos.offset == U64_MAX)
-                       pr_buf(out, "U64_MAX");
+                       prt_printf(out, "U64_MAX");
                else
-                       pr_buf(out, "%llu", pos.offset);
-               pr_buf(out, ":");
+                       prt_printf(out, "%llu", pos.offset);
+               prt_printf(out, ":");
                if (pos.snapshot == U32_MAX)
-                       pr_buf(out, "U32_MAX");
+                       prt_printf(out, "U32_MAX");
                else
-                       pr_buf(out, "%u", pos.snapshot);
+                       prt_printf(out, "%u", pos.snapshot);
        }
 }
 
 void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 {
        if (k) {
-               pr_buf(out, "u64s %u type ", k->u64s);
+               prt_printf(out, "u64s %u type ", k->u64s);
 
                if (k->type < KEY_TYPE_MAX)
-                       pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+                       prt_printf(out, "%s ", bch2_bkey_types[k->type]);
                else
-                       pr_buf(out, "%u ", k->type);
+                       prt_printf(out, "%u ", k->type);
 
                bch2_bpos_to_text(out, k->p);
 
-               pr_buf(out, " len %u ver %llu", k->size, k->version.lo);
+               prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
        } else {
-               pr_buf(out, "(null)");
+               prt_printf(out, "(null)");
        }
 }
 
@@ -265,7 +335,7 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
                if (likely(ops->val_to_text))
                        ops->val_to_text(out, c, k);
        } else {
-               pr_buf(out, "(invalid type %u)", k.k->type);
+               prt_printf(out, "(invalid type %u)", k.k->type);
        }
 }
 
@@ -275,7 +345,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_to_text(out, k.k);
 
        if (bkey_val_bytes(k.k)) {
-               pr_buf(out, ": ");
+               prt_printf(out, ": ");
                bch2_val_to_text(out, c, k);
        }
 }
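With key_invalid now returning an int and reporting through a printbuf, the caller side looks roughly like the sketch below (the wrapper function is illustrative; the printbuf helpers are the ones this commit converts other call sites to):

#include "bkey_methods.h"

/* Illustrative only: validate a key before writing it, log the reason on failure. */
static int example_check_key(struct bch_fs *c, struct bkey_s_c k,
			     enum btree_node_type type)
{
	struct printbuf err = PRINTBUF;
	int ret = bch2_bkey_invalid(c, k, type, WRITE, &err);

	if (ret)
		printk(KERN_ERR "invalid bkey: %s\n", err.buf);

	printbuf_exit(&err);
	return ret;
}
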
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 4fdac545cf88af8f2425f30477efd54bbf568d27..db894b40d2ca4180e1e91f398cc3c7021fc68491 100644 (file)
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -6,20 +6,31 @@
 
 struct bch_fs;
 struct btree;
+struct btree_trans;
 struct bkey;
 enum btree_node_type;
 
 extern const char * const bch2_bkey_types[];
 
+/*
+ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * invalid, entire key will be deleted.
+ *
+ * When invalid, error string is returned via @err. @rw indicates whether key is
+ * being read or written; more aggressive checks can be enabled when rw == WRITE.
+*/
 struct bkey_ops {
-       /* Returns reason for being invalid if invalid, else NULL: */
-       const char *    (*key_invalid)(const struct bch_fs *,
-                                      struct bkey_s_c);
+       int             (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+                                      int rw, struct printbuf *err);
        void            (*val_to_text)(struct printbuf *, struct bch_fs *,
                                       struct bkey_s_c);
        void            (*swab)(struct bkey_s);
        bool            (*key_normalize)(struct bch_fs *, struct bkey_s);
        bool            (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+       int             (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+                                        struct bkey_s_c, struct bkey_i *, unsigned);
+       int             (*atomic_trigger)(struct btree_trans *, struct bkey_s_c,
+                                         struct bkey_s_c, unsigned);
        void            (*compat)(enum btree_id id, unsigned version,
                                  unsigned big_endian, int write,
                                  struct bkey_s);
@@ -27,12 +38,12 @@ struct bkey_ops {
 
 extern const struct bkey_ops bch2_bkey_ops[];
 
-const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
-const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
-                               enum btree_node_type);
-const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
-                             enum btree_node_type);
-const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+                       enum btree_node_type, int, struct printbuf *);
+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+                     enum btree_node_type, int, struct printbuf *);
+int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
 
 void bch2_bpos_to_text(struct printbuf *, struct bpos);
 void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
@@ -57,6 +68,92 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
 
 bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 
+static inline int bch2_mark_key(struct btree_trans *trans,
+                 struct bkey_s_c old,
+                 struct bkey_s_c new,
+                 unsigned flags)
+{
+       const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type];
+
+       return ops->atomic_trigger
+               ? ops->atomic_trigger(trans, old, new, flags)
+               : 0;
+}
+
+enum btree_update_flags {
+       __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+
+       __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
+
+       __BTREE_TRIGGER_INSERT,
+       __BTREE_TRIGGER_OVERWRITE,
+
+       __BTREE_TRIGGER_GC,
+       __BTREE_TRIGGER_BUCKET_INVALIDATE,
+       __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+
+#define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT           (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE                (1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC               (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
+
+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
+       ((1U << KEY_TYPE_alloc)|                \
+        (1U << KEY_TYPE_alloc_v2)|             \
+        (1U << KEY_TYPE_alloc_v3)|             \
+        (1U << KEY_TYPE_alloc_v4)|             \
+        (1U << KEY_TYPE_stripe)|               \
+        (1U << KEY_TYPE_inode)|                \
+        (1U << KEY_TYPE_inode_v2)|             \
+        (1U << KEY_TYPE_snapshot))
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+                                     enum btree_id btree_id, unsigned level,
+                                     struct bkey_s_c old, struct bkey_i *new,
+                                     unsigned flags)
+{
+       const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type];
+
+       return ops->trans_trigger
+               ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+               : 0;
+}
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+                                     enum btree_id btree_id, unsigned level,
+                                     struct bkey_s_c old, unsigned flags)
+{
+       struct bkey_i deleted;
+
+       bkey_init(&deleted.k);
+       deleted.k.p = old.k->p;
+
+       return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+                                  BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+                                     enum btree_id btree_id, unsigned level,
+                                     struct bkey_i *new, unsigned flags)
+{
+       struct bkey_i deleted;
+
+       bkey_init(&deleted.k);
+       deleted.k.p = new->k.p;
+
+       return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+                                  BTREE_TRIGGER_INSERT|flags);
+}
+
 void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
 
 void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
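The trans_trigger/atomic_trigger hooks are dispatched through the helpers added above; as a rough sketch (the function name is invented for illustration), an update path runs the transactional triggers for the overwritten key and the newly inserted key like so:

/* Illustrative only: run transactional triggers for an overwrite + insert pair. */
static int example_run_trans_triggers(struct btree_trans *trans,
				      enum btree_id btree_id, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	return bch2_trans_mark_old(trans, btree_id, level, old, 0) ?:
	       bch2_trans_mark_new(trans, btree_id, level, new, 0);
}
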
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index b1385a77da1146f6efd643d389a73aa999745244..8518054a23817cbf06bd6b8c371162096995fabb 100644 (file)
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "bkey_buf.h"
+#include "bkey_cmp.h"
 #include "bkey_sort.h"
 #include "bset.h"
 #include "extents.h"
@@ -155,7 +156,7 @@ static inline int sort_keys_cmp(struct btree *b,
                                struct bkey_packed *l,
                                struct bkey_packed *r)
 {
-       return bch2_bkey_cmp_packed(b, l, r) ?:
+       return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
                (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
                (int) l->needs_whiteout - (int) r->needs_whiteout;
 }
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 6000a8796bc55326b47ed4e535f9e69962799f78..09423536447049066def097c1b6c32b3ab4657d1 100644 (file)
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -70,7 +70,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
        struct bkey_packed *_k, *_n;
        struct bkey uk, n;
        struct bkey_s_c k;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
 
        if (!i->u64s)
                return;
@@ -81,12 +81,14 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
                _n = bkey_next(_k);
 
                k = bkey_disassemble(b, _k, &uk);
+
+               printbuf_reset(&buf);
                if (c)
-                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       bch2_bkey_val_to_text(&buf, c, k);
                else
-                       bch2_bkey_to_text(&PBUF(buf), k.k);
+                       bch2_bkey_to_text(&buf, k.k);
                printk(KERN_ERR "block %u key %5zu: %s\n", set,
-                      _k->_data - i->_data, buf);
+                      _k->_data - i->_data, buf.buf);
 
                if (_n == vstruct_last(i))
                        continue;
@@ -102,6 +104,8 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
                    !bpos_cmp(n.p, k.k->p))
                        printk(KERN_ERR "Duplicate keys\n");
        }
+
+       printbuf_exit(&buf);
 }
 
 void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
@@ -118,6 +122,7 @@ void bch2_dump_btree_node_iter(struct btree *b,
                              struct btree_node_iter *iter)
 {
        struct btree_node_iter_set *set;
+       struct printbuf buf = PRINTBUF;
 
        printk(KERN_ERR "btree node iter with %u/%u sets:\n",
               __btree_node_iter_used(iter), b->nsets);
@@ -126,12 +131,14 @@ void bch2_dump_btree_node_iter(struct btree *b,
                struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
                struct bset_tree *t = bch2_bkey_to_bset(b, k);
                struct bkey uk = bkey_unpack_key(b, k);
-               char buf[100];
 
-               bch2_bkey_to_text(&PBUF(buf), &uk);
+               printbuf_reset(&buf);
+               bch2_bkey_to_text(&buf, &uk);
                printk(KERN_ERR "set %zu key %u: %s\n",
-                      t - b->set, set->k, buf);
+                      t - b->set, set->k, buf.buf);
        }
+
+       printbuf_exit(&buf);
 }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -167,13 +174,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
                struct btree_node_iter_set *set;
                struct bkey ku = bkey_unpack_key(b, k);
                struct bkey nu = bkey_unpack_key(b, n);
-               char buf1[80], buf2[80];
+               struct printbuf buf1 = PRINTBUF;
+               struct printbuf buf2 = PRINTBUF;
 
                bch2_dump_btree_node(NULL, b);
-               bch2_bkey_to_text(&PBUF(buf1), &ku);
-               bch2_bkey_to_text(&PBUF(buf2), &nu);
+               bch2_bkey_to_text(&buf1, &ku);
+               bch2_bkey_to_text(&buf2, &nu);
                printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
-                      buf1, buf2);
+                      buf1.buf, buf2.buf);
                printk(KERN_ERR "iter was:");
 
                btree_node_iter_for_each(_iter, set) {
@@ -238,6 +246,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
        struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
        struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
 #if 0
        BUG_ON(prev &&
               bkey_iter_cmp(b, prev, insert) > 0);
@@ -246,17 +256,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
            bkey_iter_cmp(b, prev, insert) > 0) {
                struct bkey k1 = bkey_unpack_key(b, prev);
                struct bkey k2 = bkey_unpack_key(b, insert);
-               char buf1[100];
-               char buf2[100];
 
                bch2_dump_btree_node(NULL, b);
-               bch2_bkey_to_text(&PBUF(buf1), &k1);
-               bch2_bkey_to_text(&PBUF(buf2), &k2);
+               bch2_bkey_to_text(&buf1, &k1);
+               bch2_bkey_to_text(&buf2, &k2);
 
                panic("prev > insert:\n"
                      "prev    key %s\n"
                      "insert  key %s\n",
-                     buf1, buf2);
+                     buf1.buf, buf2.buf);
        }
 #endif
 #if 0
@@ -267,17 +275,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
            bkey_iter_cmp(b, insert, next) > 0) {
                struct bkey k1 = bkey_unpack_key(b, insert);
                struct bkey k2 = bkey_unpack_key(b, next);
-               char buf1[100];
-               char buf2[100];
 
                bch2_dump_btree_node(NULL, b);
-               bch2_bkey_to_text(&PBUF(buf1), &k1);
-               bch2_bkey_to_text(&PBUF(buf2), &k2);
+               bch2_bkey_to_text(&buf1, &k1);
+               bch2_bkey_to_text(&buf2, &k2);
 
                panic("insert > next:\n"
                      "insert  key %s\n"
                      "next    key %s\n",
-                     buf1, buf2);
+                     buf1.buf, buf2.buf);
        }
 #endif
 }
@@ -959,7 +965,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
        t->size -= j - l;
 
        for (j = l; j < t->size; j++)
-              rw_aux_tree(b, t)[j].offset += shift;
+               rw_aux_tree(b, t)[j].offset += shift;
 
        EBUG_ON(l < t->size &&
                rw_aux_tree(b, t)[l].offset ==
@@ -1260,7 +1266,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter,
        bch2_btree_node_iter_sort(iter, b);
 }
 
-noinline __flatten __attribute__((cold))
+noinline __flatten __cold
 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
                              struct btree *b, struct bpos *search)
 {
@@ -1435,7 +1441,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
        EBUG_ON(iter->data->k > iter->data->end);
 
        if (unlikely(__btree_node_iter_set_end(iter, 0))) {
-               bch2_btree_node_iter_set_drop(iter, iter->data);
+               /* avoid an expensive memmove call: */
+               iter->data[0] = iter->data[1];
+               iter->data[1] = iter->data[2];
+               iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
                return;
        }
 
@@ -1567,9 +1576,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
        struct bkey uk;
        unsigned j, inorder;
 
-       if (out->pos != out->end)
-               *out->pos = '\0';
-
        if (!bset_has_ro_aux_tree(t))
                return;
 
@@ -1584,12 +1590,12 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
        switch (bkey_float(b, t, j)->exponent) {
        case BFLOAT_FAILED:
                uk = bkey_unpack_key(b, k);
-               pr_buf(out,
+               prt_printf(out,
                       "    failed unpacked at depth %u\n"
                       "\t",
                       ilog2(j));
                bch2_bpos_to_text(out, uk.p);
-               pr_buf(out, "\n");
+               prt_printf(out, "\n");
                break;
        }
 }
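The bset.c changes above follow the commit-wide conversion from fixed char buf[] + PBUF() to struct printbuf; reduced to a sketch (the wrapper function is illustrative), the pattern is:

/* Illustrative only: format a key into a heap-backed printbuf instead of a stack buffer. */
static void example_print_key(struct bch_fs *c, struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;		/* starts empty, grows as needed */

	bch2_bkey_val_to_text(&buf, c, k);
	printk(KERN_ERR "%s\n", buf.buf);

	printbuf_exit(&buf);			/* frees the allocation */
}
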
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 0d46534c3dcd148e872f222c125c144fac115b7a..72e6376bce2af705ee8519abbd3af3202b2cbd53 100644 (file)
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -205,100 +205,6 @@ static inline size_t btree_aux_data_u64s(const struct btree *b)
        return btree_aux_data_bytes(b) / sizeof(u64);
 }
 
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
-                              struct bkey *dst,
-                              const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-       {
-               compiled_unpack_fn unpack_fn = b->aux_data;
-               unpack_fn(dst, src);
-
-               if (bch2_expensive_debug_checks) {
-                       struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
-                       BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
-               }
-       }
-#else
-       *dst = __bch2_bkey_unpack_key(&b->format, src);
-#endif
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
-                              const struct bkey_packed *src)
-{
-       struct bkey dst;
-
-       __bkey_unpack_key_format_checked(b, &dst, src);
-       return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
-                                    struct bkey *dst,
-                                    const struct bkey_packed *src)
-{
-       if (likely(bkey_packed(src)))
-               __bkey_unpack_key_format_checked(b, dst, src);
-       else
-               *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
-                                         const struct bkey_packed *src)
-{
-       return likely(bkey_packed(src))
-               ? bkey_unpack_key_format_checked(b, src)
-               : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
-                              const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-       return bkey_unpack_key_format_checked(b, src).p;
-#else
-       return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
-                                         const struct bkey_packed *src)
-{
-       return likely(bkey_packed(src))
-               ? bkey_unpack_pos_format_checked(b, src)
-               : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(struct btree *b,
-                                              const struct bkey_packed *k,
-                                              struct bkey *u)
-{
-       __bkey_unpack_key(b, u, k);
-
-       return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(struct btree *b,
-                                              struct bkey_packed *k,
-                                              struct bkey *u)
-{
-       __bkey_unpack_key(b, u, k);
-
-       return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
 #define for_each_bset(_b, _t)                                          \
        for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
 
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 986d08d708cc9593238e482b226c16fb4d01fe2f..8dd2db4121a6b57db2a2c20fa4c9f3e0ad781428 100644 (file)
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -7,13 +7,25 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "debug.h"
+#include "errcode.h"
 #include "error.h"
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
-struct lock_class_key bch2_btree_node_lock_key;
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do {                                            \
+       if (shrinker_counter)                    \
+               bc->not_freed_##counter++;       \
+} while (0)
+
+const char * const bch2_btree_node_flags[] = {
+#define x(f)   #f,
+       BTREE_FLAGS()
+#undef x
+       NULL
+};
 
 void bch2_recalc_btree_reserve(struct bch_fs *c)
 {
@@ -35,6 +47,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
        return max_t(int, 0, bc->used - bc->reserve);
 }
 
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+       if (b->c.lock.readers)
+               list_move(&b->list, &bc->freed_pcpu);
+       else
+               list_move(&b->list, &bc->freed_nonpcpu);
+}
+
 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
        struct btree_cache *bc = &c->btree_cache;
@@ -51,7 +71,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
        b->aux_data = NULL;
 
        bc->used--;
-       list_move(&b->list, &bc->freed);
+
+       btree_node_to_freedlist(bc, b);
 }
 
 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -95,14 +116,17 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
        return 0;
 }
 
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 {
-       struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
+       struct btree *b = kzalloc(sizeof(struct btree), gfp);
        if (!b)
                return NULL;
 
        bkey_btree_ptr_init(&b->key);
        __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       lockdep_set_no_check_recursion(&b->c.lock.dep_map);
+#endif
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
        b->byte_order = ilog2(btree_bytes(c));
@@ -112,7 +136,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 {
        struct btree_cache *bc = &c->btree_cache;
-       struct btree *b = __btree_node_mem_alloc(c);
+       struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
        if (!b)
                return NULL;
 
@@ -135,8 +159,6 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 
        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
-
-       six_lock_wakeup_all(&b->c.lock);
 }
 
 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -156,15 +178,10 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
        b->c.level      = level;
        b->c.btree_id   = id;
 
-       if (level)
-               six_lock_pcpu_alloc(&b->c.lock);
-       else
-               six_lock_pcpu_free_rcu(&b->c.lock);
-
        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
-               list_add(&b->list, &bc->live);
+               list_add_tail(&b->list, &bc->live);
        mutex_unlock(&bc->lock);
 
        return ret;
@@ -183,7 +200,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
  * this version is for btree nodes that have already been freed (we're not
  * reaping a real btree node)
  */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
 {
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;
@@ -193,40 +210,64 @@ wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
-               if (!flush)
+               if (!flush) {
+                       if (btree_node_dirty(b))
+                               BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+                       else if (btree_node_read_in_flight(b))
+                               BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+                       else if (btree_node_write_in_flight(b))
+                               BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        return -ENOMEM;
+               }
 
                /* XXX: waiting on IO with btree cache lock held */
                bch2_btree_node_wait_on_read(b);
                bch2_btree_node_wait_on_write(b);
        }
 
-       if (!six_trylock_intent(&b->c.lock))
+       if (!six_trylock_intent(&b->c.lock)) {
+               BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
                return -ENOMEM;
+       }
 
-       if (!six_trylock_write(&b->c.lock))
+       if (!six_trylock_write(&b->c.lock)) {
+               BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
                goto out_unlock_intent;
+       }
 
        /* recheck under lock */
        if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
-               if (!flush)
+               if (!flush) {
+                       if (btree_node_read_in_flight(b))
+                               BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+                       else if (btree_node_write_in_flight(b))
+                               BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        goto out_unlock;
+               }
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }
 
-       if (btree_node_noevict(b))
+       if (btree_node_noevict(b)) {
+               BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
                goto out_unlock;
-
-       if (!btree_node_may_write(b))
+       }
+       if (btree_node_write_blocked(b)) {
+               BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
+               goto out_unlock;
+       }
+       if (btree_node_will_make_reachable(b)) {
+               BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
                goto out_unlock;
+       }
 
        if (btree_node_dirty(b)) {
-               if (!flush ||
-                   test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+               if (!flush) {
+                       BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        goto out_unlock;
+               }
                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
@@ -234,9 +275,9 @@ wait_on_io:
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
-                       bch2_btree_node_write(c, b, SIX_LOCK_intent);
+                       bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
                else
-                       __bch2_btree_node_write(c, b, false);
+                       __bch2_btree_node_write(c, b, 0);
 
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
@@ -244,7 +285,7 @@ wait_on_io:
        }
 out:
        if (b->hash_val && !ret)
-               trace_btree_node_reap(c, b);
+               trace_and_count(c, btree_cache_reap, c, b);
        return ret;
 out_unlock:
        six_unlock_write(&b->c.lock);
@@ -254,14 +295,14 @@ out_unlock_intent:
        goto out;
 }
 
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
 {
-       return __btree_node_reclaim(c, b, false);
+       return __btree_node_reclaim(c, b, false, shrinker_counter);
 }
 
 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 {
-       return __btree_node_reclaim(c, b, true);
+       return __btree_node_reclaim(c, b, true, false);
 }
 
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -272,21 +313,18 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
-       unsigned long can_free;
-       unsigned long touched = 0;
+       unsigned long can_free = 0;
        unsigned long freed = 0;
+       unsigned long touched = 0;
        unsigned i, flags;
        unsigned long ret = SHRINK_STOP;
+       bool trigger_writes = atomic_read(&bc->dirty) + nr >=
+               bc->used * 3 / 4;
 
        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;
 
-       /* Return -1 if we can't do anything right now */
-       if (sc->gfp_mask & __GFP_FS)
-               mutex_lock(&bc->lock);
-       else if (!mutex_trylock(&bc->lock))
-               goto out_norestore;
-
+       mutex_lock(&bc->lock);
        flags = memalloc_nofs_save();
 
        /*
@@ -296,7 +334,6 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
-       nr /= btree_pages(c);
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);
 
@@ -312,61 +349,61 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                touched++;
 
                if (touched >= nr)
-                       break;
+                       goto out;
 
-               if (!btree_node_reclaim(c, b)) {
+               if (!btree_node_reclaim(c, b, true)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                        freed++;
+                       bc->freed++;
                }
        }
 restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;
 
-               if (touched >= nr) {
-                       /* Save position */
-                       if (&t->list != &bc->live)
-                               list_move_tail(&bc->live, &t->list);
-                       break;
-               }
-
-               if (!btree_node_accessed(b) &&
-                   !btree_node_reclaim(c, b)) {
-                       /* can't call bch2_btree_node_hash_remove under lock  */
+               if (btree_node_accessed(b)) {
+                       clear_btree_node_accessed(b);
+                       bc->not_freed_access_bit++;
+               } else if (!btree_node_reclaim(c, b, true)) {
                        freed++;
-                       if (&t->list != &bc->live)
-                               list_move_tail(&bc->live, &t->list);
-
                        btree_node_data_free(c, b);
-                       mutex_unlock(&bc->lock);
+                       bc->freed++;
 
                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
 
-                       if (freed >= nr)
-                               goto out;
-
-                       if (sc->gfp_mask & __GFP_FS)
-                               mutex_lock(&bc->lock);
-                       else if (!mutex_trylock(&bc->lock))
-                               goto out;
+                       if (freed == nr)
+                               goto out_rotate;
+               } else if (trigger_writes &&
+                          btree_node_dirty(b) &&
+                          !btree_node_will_make_reachable(b) &&
+                          !btree_node_write_blocked(b) &&
+                          six_trylock_read(&b->c.lock)) {
+                       list_move(&bc->live, &b->list);
+                       mutex_unlock(&bc->lock);
+                       __bch2_btree_node_write(c, b, 0);
+                       six_unlock_read(&b->c.lock);
+                       if (touched >= nr)
+                               goto out_nounlock;
+                       mutex_lock(&bc->lock);
                        goto restart;
-               } else
-                       clear_btree_node_accessed(b);
-       }
+               }
 
-       mutex_unlock(&bc->lock);
+               if (touched >= nr)
+                       break;
+       }
+out_rotate:
+       if (&t->list != &bc->live)
+               list_move_tail(&bc->live, &t->list);
 out:
-       ret = (unsigned long) freed * btree_pages(c);
+       mutex_unlock(&bc->lock);
+out_nounlock:
+       ret = freed;
        memalloc_nofs_restore(flags);
-out_norestore:
-       trace_btree_cache_scan(sc->nr_to_scan,
-                              sc->nr_to_scan / btree_pages(c),
-                              btree_cache_can_free(bc),
-                              ret);
+       trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
        return ret;
 }
 
@@ -380,7 +417,15 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
        if (bch2_btree_shrinker_disabled)
                return 0;
 
-       return btree_cache_can_free(bc) * btree_pages(c);
+       return btree_cache_can_free(bc);
+}
+
+static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
+{
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_cache.shrink);
+
+       bch2_btree_cache_to_text(out, &c->btree_cache);
 }
 
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
@@ -415,15 +460,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
-               clear_btree_node_dirty(c, b);
+               clear_btree_node_dirty_acct(c, b);
 
                btree_node_data_free(c, b);
        }
 
        BUG_ON(atomic_read(&c->btree_cache.dirty));
 
-       while (!list_empty(&bc->freed)) {
-               b = list_first_entry(&bc->freed, struct btree, list);
+       list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+       while (!list_empty(&bc->freed_nonpcpu)) {
+               b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
@@ -464,9 +511,9 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
+       bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
        bc->shrink.seeks                = 4;
-       bc->shrink.batch                = btree_pages(c) * 2;
-       ret = register_shrinker(&bc->shrink);
+       ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
@@ -477,7 +524,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
-       INIT_LIST_HEAD(&bc->freed);
+       INIT_LIST_HEAD(&bc->freed_pcpu);
+       INIT_LIST_HEAD(&bc->freed_nonpcpu);
 }
 
 /*
@@ -491,7 +539,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
        struct btree_cache *bc = &c->btree_cache;
 
        if (bc->alloc_lock == current) {
-               trace_btree_node_cannibalize_unlock(c);
+               trace_and_count(c, btree_cache_cannibalize_unlock, c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
@@ -507,7 +555,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
                goto success;
 
        if (!cl) {
-               trace_btree_node_cannibalize_lock_fail(c);
+               trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
                return -ENOMEM;
        }
 
@@ -521,11 +569,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
                goto success;
        }
 
-       trace_btree_node_cannibalize_lock_fail(c);
+       trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
        return -EAGAIN;
 
 success:
-       trace_btree_node_cannibalize_lock(c);
+       trace_and_count(c, btree_cache_cannibalize_lock, c);
        return 0;
 }
 
@@ -535,7 +583,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
        struct btree *b;
 
        list_for_each_entry_reverse(b, &bc->live, list)
-               if (!btree_node_reclaim(c, b))
+               if (!btree_node_reclaim(c, b, false))
                        return b;
 
        while (1) {
@@ -552,55 +600,68 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
        }
 }
 
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
 {
        struct btree_cache *bc = &c->btree_cache;
-       struct btree *b;
+       struct list_head *freed = pcpu_read_locks
+               ? &bc->freed_pcpu
+               : &bc->freed_nonpcpu;
+       struct btree *b, *b2;
        u64 start_time = local_clock();
        unsigned flags;
 
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);
 
-       /*
-        * btree_free() doesn't free memory; it sticks the node on the end of
-        * the list. Check if there's any freed nodes there:
-        */
-       list_for_each_entry(b, &bc->freeable, list)
-               if (!btree_node_reclaim(c, b))
-                       goto got_node;
-
        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
-       list_for_each_entry(b, &bc->freed, list)
-               if (!btree_node_reclaim(c, b))
+       list_for_each_entry(b, freed, list)
+               if (!btree_node_reclaim(c, b, false)) {
+                       list_del_init(&b->list);
                        goto got_node;
+               }
 
-       b = NULL;
-got_node:
-       if (b)
-               list_del_init(&b->list);
-       mutex_unlock(&bc->lock);
-
+       b = __btree_node_mem_alloc(c, __GFP_NOWARN);
        if (!b) {
-               b = __btree_node_mem_alloc(c);
+               mutex_unlock(&bc->lock);
+               b = __btree_node_mem_alloc(c, GFP_KERNEL);
                if (!b)
                        goto err;
-
-               BUG_ON(!six_trylock_intent(&b->c.lock));
-               BUG_ON(!six_trylock_write(&b->c.lock));
+               mutex_lock(&bc->lock);
        }
 
-       if (!b->data) {
-               if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
-                       goto err;
+       if (pcpu_read_locks)
+               six_lock_pcpu_alloc(&b->c.lock);
 
-               mutex_lock(&bc->lock);
-               bc->used++;
-               mutex_unlock(&bc->lock);
-       }
+       BUG_ON(!six_trylock_intent(&b->c.lock));
+       BUG_ON(!six_trylock_write(&b->c.lock));
+got_node:
+
+       /*
+        * btree_free() doesn't free memory; it sticks the node on the end of
+        * the list. Check if there's any freed nodes there:
+        */
+       list_for_each_entry(b2, &bc->freeable, list)
+               if (!btree_node_reclaim(c, b2, false)) {
+                       swap(b->data, b2->data);
+                       swap(b->aux_data, b2->aux_data);
+                       btree_node_to_freedlist(bc, b2);
+                       six_unlock_write(&b2->c.lock);
+                       six_unlock_intent(&b2->c.lock);
+                       goto got_mem;
+               }
+
+       mutex_unlock(&bc->lock);
+
+       if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
+               goto err;
+
+       mutex_lock(&bc->lock);
+       bc->used++;
+got_mem:
+       mutex_unlock(&bc->lock);
 
        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_dirty(b));
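
In the reworked allocation path above, a newly allocated struct btree can take over the data and aux_data buffers of a reclaimable node on the freeable list simply by swapping pointers, rather than allocating fresh buffers and freeing the old ones. A toy illustration of that pointer-swap reuse, with made-up types and sizes, might look like this:

/* Toy sketch of reusing an existing node's buffers via pointer swap.
 * Types and names are invented; only the swap pattern mirrors the diff.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_BYTES 4096

struct node {
	void *data;	/* main buffer, may be NULL if never allocated */
	void *aux_data;	/* auxiliary buffer */
};

#define swap(a, b) do { __typeof__(a) _t = (a); (a) = (b); (b) = _t; } while (0)

static int node_alloc_bufs(struct node *n)
{
	n->data	    = malloc(BUF_BYTES);
	n->aux_data = malloc(BUF_BYTES);
	if (!n->data || !n->aux_data) {
		free(n->data);
		free(n->aux_data);
		n->data = n->aux_data = NULL;
		return -1;
	}
	return 0;
}

int main(void)
{
	struct node reclaimed = { 0 };	/* e.g. a node sitting on a freeable list */
	struct node fresh     = { 0 };	/* newly allocated struct, no buffers yet */

	if (node_alloc_bufs(&reclaimed))
		return 1;
	memset(reclaimed.data, 0xaa, BUF_BYTES);

	/* Instead of allocating new buffers for @fresh, take @reclaimed's: */
	swap(fresh.data,     reclaimed.data);
	swap(fresh.aux_data, reclaimed.aux_data);

	printf("fresh.data=%p reclaimed.data=%p\n", fresh.data, reclaimed.data);

	free(fresh.data);
	free(fresh.aux_data);
	return 0;
}
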
@@ -623,21 +684,25 @@ out:
 err:
        mutex_lock(&bc->lock);
 
-       if (b) {
-               list_add(&b->list, &bc->freed);
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
-       }
-
        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
-               b = btree_node_cannibalize(c);
-               list_del_init(&b->list);
-               mutex_unlock(&bc->lock);
+               b2 = btree_node_cannibalize(c);
+               bch2_btree_node_hash_remove(bc, b2);
+
+               if (b) {
+                       swap(b->data, b2->data);
+                       swap(b->aux_data, b2->aux_data);
+                       btree_node_to_freedlist(bc, b2);
+                       six_unlock_write(&b2->c.lock);
+                       six_unlock_intent(&b2->c.lock);
+               } else {
+                       b = b2;
+                       list_del_init(&b->list);
+               }
 
-               bch2_btree_node_hash_remove(bc, b);
+               mutex_unlock(&bc->lock);
 
-               trace_btree_node_cannibalize(c);
+               trace_and_count(c, btree_cache_cannibalize, c);
                goto out;
        }
 
@@ -666,13 +731,18 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * been freed:
         */
        if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
-               trace_trans_restart_relock_parent_for_fill(trans->fn,
-                                       _THIS_IP_, btree_id, &path->pos);
-               btree_trans_restart(trans);
-               return ERR_PTR(-EINTR);
+               trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+       }
+
+       b = bch2_btree_node_mem_alloc(c, level != 0);
+
+       if (trans && b == ERR_PTR(-ENOMEM)) {
+               trans->memory_allocation_failure = true;
+               trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
        }
 
-       b = bch2_btree_node_mem_alloc(c);
        if (IS_ERR(b))
                return b;
 
@@ -707,52 +777,49 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
        if (!sync)
                return NULL;
 
-       if (trans &&
-           (!bch2_trans_relock(trans) ||
-            !bch2_btree_path_relock_intent(trans, path))) {
-               BUG_ON(!trans->restarted);
-               return ERR_PTR(-EINTR);
+       if (trans) {
+               int ret = bch2_trans_relock(trans) ?:
+                       bch2_btree_path_relock_intent(trans, path);
+               if (ret) {
+                       BUG_ON(!trans->restarted);
+                       return ERR_PTR(ret);
+               }
        }
 
        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-               trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
-                                          btree_id, &path->pos);
-               btree_trans_restart(trans);
-               return ERR_PTR(-EINTR);
+               if (trans)
+                       trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
        }
 
        return b;
 }
 
-static int lock_node_check_fn(struct six_lock *lock, void *p)
-{
-       struct btree *b = container_of(lock, struct btree, c.lock);
-       const struct bkey_i *k = p;
-
-       return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
-}
-
 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
 {
-       char buf1[200], buf2[100], buf3[100];
+       struct printbuf buf = PRINTBUF;
 
        if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
                return;
 
-       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key));
-       bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
-       bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
-
-       bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
-                            "btree %s level %u\n"
-                            "ptr: %s\n"
-                            "header: btree %s level %llu\n"
-                            "min %s max %s\n",
-                            bch2_btree_ids[b->c.btree_id], b->c.level,
-                            buf1,
-                            bch2_btree_ids[BTREE_NODE_ID(b->data)],
-                            BTREE_NODE_LEVEL(b->data),
-                            buf2, buf3);
+       prt_printf(&buf,
+              "btree node header doesn't match ptr\n"
+              "btree %s level %u\n"
+              "ptr: ",
+              bch2_btree_ids[b->c.btree_id], b->c.level);
+       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+       prt_printf(&buf, "\nheader: btree %s level %llu\n"
+              "min ",
+              bch2_btree_ids[BTREE_NODE_ID(b->data)],
+              BTREE_NODE_LEVEL(b->data));
+       bch2_bpos_to_text(&buf, b->data->min_key);
+
+       prt_printf(&buf, "\nmax ");
+       bch2_bpos_to_text(&buf, b->data->max_key);
+
+       bch2_fs_inconsistent(c, "%s", buf.buf);
+       printbuf_exit(&buf);
 }
 
 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
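
The btree_bad_header() rewrite above is one instance of the tree-wide move from fixed char buf[200] scratch buffers and PBUF()/pr_buf() to heap-backed struct printbuf objects (PRINTBUF, prt_printf(), printbuf_reset(), printbuf_exit()), which grow on demand instead of silently truncating long keys. Below is a rough userspace approximation of such a growable print buffer; the field names and growth policy are guesses, not the real printbuf implementation.

/* Minimal sketch of a growable print buffer, loosely modelled on the
 * printbuf conversion in this diff.  Not the real API.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct printbuf {
	char	*buf;
	size_t	 size;		/* allocated bytes */
	size_t	 pos;		/* bytes of output written so far */
};

#define PRINTBUF ((struct printbuf) { 0 })

static void prt_printf(struct printbuf *out, const char *fmt, ...)
{
	va_list args;
	int len;

	/* First find out how much room the formatted text needs: */
	va_start(args, fmt);
	len = vsnprintf(NULL, 0, fmt, args);
	va_end(args);
	if (len < 0)
		return;

	if (out->pos + len + 1 > out->size) {
		size_t new_size = (out->pos + len + 1) * 2;
		char *new_buf	= realloc(out->buf, new_size);

		if (!new_buf)
			return;		/* real code would record allocation failure */
		out->buf  = new_buf;
		out->size = new_size;
	}

	va_start(args, fmt);
	vsnprintf(out->buf + out->pos, out->size - out->pos, fmt, args);
	va_end(args);
	out->pos += len;
}

static void printbuf_reset(struct printbuf *out)
{
	out->pos = 0;
	if (out->buf)
		out->buf[0] = '\0';
}

static void printbuf_exit(struct printbuf *out)
{
	free(out->buf);
	*out = PRINTBUF;
}

int main(void)
{
	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "btree %s level %u\n", "extents", 1U);
	prt_printf(&buf, "min %llu max %llu\n", 0ULL, 1024ULL);
	fputs(buf.buf, stdout);

	printbuf_reset(&buf);
	prt_printf(&buf, "reused after reset\n");
	fputs(buf.buf, stdout);

	printbuf_exit(&buf);
	return 0;
}
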
@@ -784,6 +851,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
+       int ret;
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
@@ -797,7 +865,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
        if (likely(c->opts.btree_node_mem_ptr_optimization &&
                   b &&
                   b->hash_val == btree_ptr_hash_val(k)))
-                       goto lock_node;
+               goto lock_node;
 retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
@@ -846,14 +914,13 @@ lock_node:
                 * was removed - and we'll bail out:
                 */
                if (btree_node_read_locked(path, level + 1))
-                       btree_node_unlock(path, level + 1);
+                       btree_node_unlock(trans, path, level + 1);
 
-               if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
-                                    lock_node_check_fn, (void *) k, trace_ip)) {
-                       if (!trans->restarted)
-                               goto retry;
-                       return ERR_PTR(-EINTR);
-               }
+               ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       return ERR_PTR(ret);
+
+               BUG_ON(ret);
 
                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
@@ -862,12 +929,8 @@ lock_node:
                        if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;
 
-                       trace_trans_restart_btree_node_reused(trans->fn,
-                                                             trace_ip,
-                                                             path->btree_id,
-                                                             &path->pos);
-                       btree_trans_restart(trans);
-                       return ERR_PTR(-EINTR);
+                       trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+                       return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
                }
        }
 
@@ -883,11 +946,13 @@ lock_node:
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
-               if (trans &&
-                   (!bch2_trans_relock(trans) ||
-                    !bch2_btree_path_relock_intent(trans, path))) {
-                       BUG_ON(!trans->restarted);
-                       return ERR_PTR(-EINTR);
+               if (trans) {
+                       int ret = bch2_trans_relock(trans) ?:
+                               bch2_btree_path_relock_intent(trans, path);
+                       if (ret) {
+                               BUG_ON(!trans->restarted);
+                               return ERR_PTR(ret);
+                       }
                }
 
                if (!six_relock_type(&b->c.lock, lock_type, seq))
@@ -920,12 +985,13 @@ lock_node:
        return b;
 }
 
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
                                         unsigned level,
                                         bool nofill)
 {
+       struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
@@ -959,9 +1025,11 @@ retry:
                        goto out;
        } else {
 lock_node:
-               ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
-               if (ret)
-                       goto retry;
+               ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       return ERR_PTR(ret);
+
+               BUG_ON(ret);
 
                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
@@ -1023,8 +1091,9 @@ int bch2_btree_node_prefetch(struct bch_fs *c,
        return PTR_ERR_OR_ZERO(b);
 }
 
-void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
 {
+       struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
 
@@ -1036,15 +1105,15 @@ wait_on_io:
 
        /* XXX we're called from btree_gc which will be holding other btree
         * nodes locked
-        * */
+        */
        __bch2_btree_node_wait_on_read(b);
        __bch2_btree_node_wait_on_write(b);
 
-       six_lock_intent(&b->c.lock, NULL, NULL);
-       six_lock_write(&b->c.lock, NULL, NULL);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
 
        if (btree_node_dirty(b)) {
-               __bch2_btree_node_write(c, b, false);
+               __bch2_btree_node_write(c, b, 0);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
@@ -1071,15 +1140,15 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 
        bch2_btree_keys_stats(b, &stats);
 
-       pr_buf(out, "l %u ", b->c.level);
+       prt_printf(out, "l %u ", b->c.level);
        bch2_bpos_to_text(out, b->data->min_key);
-       pr_buf(out, " - ");
+       prt_printf(out, " - ");
        bch2_bpos_to_text(out, b->data->max_key);
-       pr_buf(out, ":\n"
+       prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
 
-       pr_buf(out, "\n"
+       prt_printf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
@@ -1107,9 +1176,21 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               stats.failed);
 }
 
-void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
 {
-       pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
-       pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
-       pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+       prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
+       prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
+       prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+
+       prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
+       prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
+       prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
+       prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
+       prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
+       prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
+       prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
+       prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
+       prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
+       prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
+
 }
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index f7e10986f317cc2036abcb143648bc721c0c2eb0..b623c70282730336790e5553f1670e93c777b736 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -4,8 +4,9 @@
 
 #include "bcachefs.h"
 #include "btree_types.h"
+#include "bkey_methods.h"
 
-extern struct lock_class_key bch2_btree_node_lock_key;
+extern const char * const bch2_btree_node_flags[];
 
 struct btree_iter;
 
@@ -20,19 +21,19 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
 int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
 
 struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
                                  const struct bkey_i *, unsigned,
                                  enum six_lock_type, unsigned long);
 
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
                                         enum btree_id, unsigned, bool);
 
 int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
                             const struct bkey_i *, enum btree_id, unsigned);
 
-void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
 
 void bch2_fs_btree_cache_exit(struct bch_fs *);
 int bch2_fs_btree_cache_init(struct bch_fs *);
@@ -100,6 +101,6 @@ static inline unsigned btree_blocks(struct bch_fs *c)
 
 void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
                             struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
+void bch2_btree_cache_to_text(struct printbuf *, struct btree_cache *);
 
 #endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 648779cc643d225e63aad0175d981b0560081b6d..801a09f6fc1141f595cfc89b2cc4b151fbb85978 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -70,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c,
        struct bpos expected_start = bkey_deleted(&prev->k->k)
                ? node_start
                : bpos_successor(prev->k->k.p);
-       char buf1[200], buf2[200];
+       struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
        int ret = 0;
 
        if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
                struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
 
-               if (bkey_deleted(&prev->k->k)) {
-                       struct printbuf out = PBUF(buf1);
-                       pr_buf(&out, "start of node: ");
-                       bch2_bpos_to_text(&out, node_start);
-               } else {
-                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
-               }
-
                if (bpos_cmp(expected_start, bp->v.min_key)) {
                        bch2_topology_error(c);
 
+                       if (bkey_deleted(&prev->k->k)) {
+                               prt_printf(&buf1, "start of node: ");
+                               bch2_bpos_to_text(&buf1, node_start);
+                       } else {
+                               bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+                       }
+                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+
                        if (__fsck_err(c,
                                  FSCK_CAN_FIX|
                                  FSCK_CAN_IGNORE|
@@ -95,11 +95,11 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                                  "  prev %s\n"
                                  "  cur %s",
                                  bch2_btree_ids[b->c.btree_id], b->c.level,
-                                 buf1,
-                                 (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) &&
+                                 buf1.buf, buf2.buf) &&
                            !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
                                bch_info(c, "Halting mark and sweep to start topology repair pass");
-                               return FSCK_ERR_START_TOPOLOGY_REPAIR;
+                               ret = -BCH_ERR_need_topology_repair;
+                               goto err;
                        } else {
                                set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
                        }
@@ -109,6 +109,12 @@ static int bch2_gc_check_topology(struct bch_fs *c,
        if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
                bch2_topology_error(c);
 
+               printbuf_reset(&buf1);
+               printbuf_reset(&buf2);
+
+               bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+               bch2_bpos_to_text(&buf2, node_end);
+
                if (__fsck_err(c,
                          FSCK_CAN_FIX|
                          FSCK_CAN_IGNORE|
@@ -117,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                          "  %s\n"
                          "  expected %s",
                          bch2_btree_ids[b->c.btree_id], b->c.level,
-                         (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
-                         (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) &&
+                         buf1.buf, buf2.buf) &&
                    !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
                        bch_info(c, "Halting mark and sweep to start topology repair pass");
-                       return FSCK_ERR_START_TOPOLOGY_REPAIR;
+                       ret = -BCH_ERR_need_topology_repair;
+                       goto err;
                } else {
                        set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
                }
        }
 
        bch2_bkey_buf_copy(prev, c, cur.k);
+err:
 fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
        return ret;
 }
 
@@ -156,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
        }
 }
 
-static void bch2_btree_node_update_key_early(struct bch_fs *c,
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
                                             enum btree_id btree, unsigned level,
                                             struct bkey_s_c old, struct bkey_i *new)
 {
+       struct bch_fs *c = trans->c;
        struct btree *b;
        struct bkey_buf tmp;
        int ret;
@@ -167,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c,
        bch2_bkey_buf_init(&tmp);
        bch2_bkey_buf_reassemble(&tmp, c, old);
 
-       b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+       b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
        if (!IS_ERR_OR_NULL(b)) {
                mutex_lock(&c->btree_cache.lock);
 
@@ -205,7 +215,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
        }
 
        bch2_btree_node_drop_keys_outside_node(b);
-
+       bkey_copy(&b->key, &new->k_i);
        return 0;
 }
 
@@ -251,18 +261,17 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
        struct bpos expected_start = !prev
                ? b->data->min_key
                : bpos_successor(prev->key.k.p);
-       char buf1[200], buf2[200];
+       struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
        int ret = 0;
 
        if (!prev) {
-               struct printbuf out = PBUF(buf1);
-               pr_buf(&out, "start of node: ");
-               bch2_bpos_to_text(&out, b->data->min_key);
+               prt_printf(&buf1, "start of node: ");
+               bch2_bpos_to_text(&buf1, b->data->min_key);
        } else {
-               bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+               bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
        }
 
-       bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
 
        if (prev &&
            bpos_cmp(expected_start, cur->data->min_key) > 0 &&
@@ -275,8 +284,10 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
                                "  node %s\n"
                                "  next %s",
                                bch2_btree_ids[b->c.btree_id], b->c.level,
-                               buf1, buf2))
-                       return DROP_PREV_NODE;
+                               buf1.buf, buf2.buf)) {
+                       ret = DROP_PREV_NODE;
+                       goto out;
+               }
 
                if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
                                                 bpos_predecessor(cur->data->min_key)), c,
@@ -284,7 +295,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
                                "  node %s\n"
                                "  next %s",
                                bch2_btree_ids[b->c.btree_id], b->c.level,
-                               buf1, buf2))
+                               buf1.buf, buf2.buf))
                        ret = set_node_max(c, prev,
                                           bpos_predecessor(cur->data->min_key));
        } else {
@@ -296,50 +307,61 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
                                "  prev %s\n"
                                "  node %s",
                                bch2_btree_ids[b->c.btree_id], b->c.level,
-                               buf1, buf2))
-                       return DROP_THIS_NODE;
+                               buf1.buf, buf2.buf)) {
+                       ret = DROP_THIS_NODE;
+                       goto out;
+               }
 
                if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
                                "btree node with incorrect min_key at btree %s level %u:\n"
                                "  prev %s\n"
                                "  node %s",
                                bch2_btree_ids[b->c.btree_id], b->c.level,
-                               buf1, buf2))
-                   ret = set_node_min(c, cur, expected_start);
+                               buf1.buf, buf2.buf))
+                       ret = set_node_min(c, cur, expected_start);
        }
+out:
 fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
        return ret;
 }
 
 static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
                                 struct btree *child)
 {
-       char buf1[200], buf2[200];
+       struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
        int ret = 0;
 
+       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+       bch2_bpos_to_text(&buf2, b->key.k.p);
+
        if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
                        "btree node with incorrect max_key at btree %s level %u:\n"
                        "  %s\n"
                        "  expected %s",
                        bch2_btree_ids[b->c.btree_id], b->c.level,
-                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
-                       (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+                       buf1.buf, buf2.buf)) {
                ret = set_node_max(c, child, b->key.k.p);
                if (ret)
-                       return ret;
+                       goto err;
        }
+err:
 fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
        return ret;
 }
 
-static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
 {
+       struct bch_fs *c = trans->c;
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
        struct bkey_buf prev_k, cur_k;
        struct btree *prev = NULL, *cur = NULL;
        bool have_child, dropped_children = false;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        if (!b->c.level)
@@ -358,28 +380,32 @@ again:
                bch2_btree_and_journal_iter_advance(&iter);
                bch2_bkey_buf_reassemble(&cur_k, c, k);
 
-               cur = bch2_btree_node_get_noiter(c, cur_k.k,
+               cur = bch2_btree_node_get_noiter(trans, cur_k.k,
                                        b->c.btree_id, b->c.level - 1,
                                        false);
                ret = PTR_ERR_OR_ZERO(cur);
 
+               printbuf_reset(&buf);
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
                if (mustfix_fsck_err_on(ret == -EIO, c,
-                               "Unreadable btree node at btree %s level %u:\n"
+                               "Topology repair: unreadable btree node at btree %s level %u:\n"
                                "  %s",
                                bch2_btree_ids[b->c.btree_id],
                                b->c.level - 1,
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
-                       bch2_btree_node_evict(c, cur_k.k);
+                               buf.buf)) {
+                       bch2_btree_node_evict(trans, cur_k.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
                                                      b->c.level, cur_k.k->k.p);
+                       cur = NULL;
                        if (ret)
                                break;
                        continue;
                }
 
                if (ret) {
-                       bch_err(c, "%s: error %i getting btree node",
-                               __func__, ret);
+                       bch_err(c, "%s: error getting btree node: %s",
+                               __func__, bch2_err_str(ret));
                        break;
                }
 
@@ -387,9 +413,10 @@ again:
 
                if (ret == DROP_THIS_NODE) {
                        six_unlock_read(&cur->c.lock);
-                       bch2_btree_node_evict(c, cur_k.k);
+                       bch2_btree_node_evict(trans, cur_k.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
                                                      b->c.level, cur_k.k->k.p);
+                       cur = NULL;
                        if (ret)
                                break;
                        continue;
@@ -400,7 +427,7 @@ again:
                prev = NULL;
 
                if (ret == DROP_PREV_NODE) {
-                       bch2_btree_node_evict(c, prev_k.k);
+                       bch2_btree_node_evict(trans, prev_k.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
                                                      b->c.level, prev_k.k->k.p);
                        if (ret)
@@ -440,23 +467,23 @@ again:
                bch2_bkey_buf_reassemble(&cur_k, c, k);
                bch2_btree_and_journal_iter_advance(&iter);
 
-               cur = bch2_btree_node_get_noiter(c, cur_k.k,
+               cur = bch2_btree_node_get_noiter(trans, cur_k.k,
                                        b->c.btree_id, b->c.level - 1,
                                        false);
                ret = PTR_ERR_OR_ZERO(cur);
 
                if (ret) {
-                       bch_err(c, "%s: error %i getting btree node",
-                               __func__, ret);
+                       bch_err(c, "%s: error getting btree node: %s",
+                               __func__, bch2_err_str(ret));
                        goto err;
                }
 
-               ret = bch2_btree_repair_topology_recurse(c, cur);
+               ret = bch2_btree_repair_topology_recurse(trans, cur);
                six_unlock_read(&cur->c.lock);
                cur = NULL;
 
                if (ret == DROP_THIS_NODE) {
-                       bch2_btree_node_evict(c, cur_k.k);
+                       bch2_btree_node_evict(trans, cur_k.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
                                                      b->c.level, cur_k.k->k.p);
                        dropped_children = true;
@@ -468,12 +495,14 @@ again:
                have_child = true;
        }
 
+       printbuf_reset(&buf);
+       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
        if (mustfix_fsck_err_on(!have_child, c,
                        "empty interior btree node at btree %s level %u\n"
                        "  %s",
                        bch2_btree_ids[b->c.btree_id],
-                       b->c.level,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+                       b->c.level, buf.buf))
                ret = DROP_THIS_NODE;
 err:
 fsck_err:
@@ -489,42 +518,49 @@ fsck_err:
        if (!ret && dropped_children)
                goto again;
 
+       printbuf_exit(&buf);
        return ret;
 }
 
 static int bch2_repair_topology(struct bch_fs *c)
 {
+       struct btree_trans trans;
        struct btree *b;
        unsigned i;
        int ret = 0;
 
+       bch2_trans_init(&trans, c, 0, 0);
+
        for (i = 0; i < BTREE_ID_NR && !ret; i++) {
                b = c->btree_roots[i].b;
                if (btree_node_fake(b))
                        continue;
 
-               six_lock_read(&b->c.lock, NULL, NULL);
-               ret = bch2_btree_repair_topology_recurse(c, b);
+               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+               ret = bch2_btree_repair_topology_recurse(&trans, b);
                six_unlock_read(&b->c.lock);
 
                if (ret == DROP_THIS_NODE) {
                        bch_err(c, "empty btree root - repair unimplemented");
-                       ret = FSCK_ERR_EXIT;
+                       ret = -BCH_ERR_fsck_repair_unimplemented;
                }
        }
 
+       bch2_trans_exit(&trans);
+
        return ret;
 }
 
-static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
                               unsigned level, bool is_root,
                               struct bkey_s_c *k)
 {
+       struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p = { 0 };
        bool do_update = false;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        /*
@@ -536,72 +572,78 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
 
-               if (fsck_err_on(!g->gen_valid, c,
+               if (c->opts.reconstruct_alloc ||
+                   fsck_err_on(!g->gen_valid, c,
                                "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
                                p.ptr.gen,
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (!p.ptr.cached) {
-                               g->_mark.gen            = p.ptr.gen;
                                g->gen_valid            = true;
+                               g->gen                  = p.ptr.gen;
                        } else {
                                do_update = true;
                        }
                }
 
-               if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+               if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c,
                                "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
-                               p.ptr.gen, g->mark.gen,
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+                               p.ptr.gen, g->gen,
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (!p.ptr.cached) {
-                               g->_mark.gen            = p.ptr.gen;
                                g->gen_valid            = true;
-                               g->_mark.data_type      = 0;
-                               g->_mark.dirty_sectors  = 0;
-                               g->_mark.cached_sectors = 0;
+                               g->gen                  = p.ptr.gen;
+                               g->data_type            = 0;
+                               g->dirty_sectors        = 0;
+                               g->cached_sectors       = 0;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        } else {
                                do_update = true;
                        }
                }
 
-               if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+               if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
                                "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                                "while marking %s",
-                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
                                p.ptr.gen,
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
                        do_update = true;
 
                if (fsck_err_on(!p.ptr.cached &&
-                               gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+                               gen_cmp(p.ptr.gen, g->gen) < 0, c,
                                "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
-                               p.ptr.gen, g->mark.gen,
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+                               p.ptr.gen, g->gen,
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
                        do_update = true;
 
-               if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen)
+               if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
                        continue;
 
-               if (fsck_err_on(g->mark.data_type &&
-                               g->mark.data_type != data_type, c,
+               if (fsck_err_on(g->data_type &&
+                               g->data_type != data_type, c,
                                "bucket %u:%zu different types of data in same bucket: %s, %s\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                               bch2_data_types[g->mark.data_type],
+                               bch2_data_types[g->data_type],
                                bch2_data_types[data_type],
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (data_type == BCH_DATA_btree) {
-                               g->_mark.data_type      = data_type;
+                               g->data_type    = data_type;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        } else {
                                do_update = true;
@@ -615,14 +657,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                        "pointer to nonexistent stripe %llu\n"
                                        "while marking %s",
                                        (u64) p.ec.idx,
-                                       (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+                                       (printbuf_reset(&buf),
+                                        bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
                                do_update = true;
 
                        if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
                                        "pointer does not match stripe %llu\n"
                                        "while marking %s",
                                        (u64) p.ec.idx,
-                                       (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+                                       (printbuf_reset(&buf),
+                                        bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
                                do_update = true;
                }
        }
@@ -635,13 +679,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
 
                if (is_root) {
                        bch_err(c, "cannot update btree roots yet");
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
 
                new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
                if (!new) {
                        bch_err(c, "%s: error allocating new key", __func__);
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto err;
                }
 
                bkey_reassemble(new, *k);
@@ -657,7 +703,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                                struct bucket *g = PTR_GC_BUCKET(ca, ptr);
 
-                               ptr->gen = g->mark.gen;
+                               ptr->gen = g->gen;
                        }
                } else {
                        bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
@@ -666,12 +712,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
 
                                (ptr->cached &&
-                                (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+                                (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
                                (!ptr->cached &&
-                                gen_cmp(ptr->gen, g->mark.gen) < 0) ||
-                               gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
-                               (g->mark.data_type &&
-                                g->mark.data_type != data_type);
+                                gen_cmp(ptr->gen, g->gen) < 0) ||
+                               gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+                               (g->data_type &&
+                                g->data_type != data_type);
                        }));
 again:
                        ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
@@ -705,19 +751,27 @@ found:
                ret = bch2_journal_key_insert_take(c, btree_id, level, new);
                if (ret) {
                        kfree(new);
-                       return ret;
+                       goto err;
                }
 
                if (level)
-                       bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
+                       bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+
+               if (c->opts.verbose) {
+                       printbuf_reset(&buf);
+                       bch2_bkey_val_to_text(&buf, c, *k);
+                       bch_info(c, "updated %s", buf.buf);
+
+                       printbuf_reset(&buf);
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+                       bch_info(c, "new key %s", buf.buf);
+               }
 
-               bch2_bkey_val_to_text(&PBUF(buf), c, *k);
-               bch_info(c, "updated %s", buf);
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new));
-               bch_info(c, "new key %s", buf);
                *k = bkey_i_to_s_c(new);
        }
+err:
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -740,9 +794,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 
        if (initial) {
                BUG_ON(bch2_journal_seq_verify &&
-                      k->k->version.lo > journal_cur_seq(&c->journal));
+                      k->k->version.lo > atomic64_read(&c->journal.seq));
 
-               ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
+               ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
                if (ret)
                        goto err;
 
@@ -753,11 +807,12 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
                        atomic64_set(&c->key_version, k->k->version.lo);
        }
 
-       ret = bch2_mark_key(trans, old, *k, flags);
+       ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_mark_key(trans, old, *k, flags));
 fsck_err:
 err:
        if (ret)
-               bch_err(c, "%s: ret %i", __func__, ret);
+               bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
        return ret;
 }
 
@@ -807,10 +862,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct btree *b;
-       unsigned depth = metadata_only                  ? 1
-               : bch2_expensive_debug_checks           ? 0
-               : !btree_node_type_needs_gc(btree_id)   ? 1
-               : 0;
+       unsigned depth = metadata_only ? 1 : 0;
        int ret = 0;
 
        gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
@@ -851,7 +903,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
@@ -866,7 +918,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
                ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
                                       false, &k, true);
                if (ret) {
-                       bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
+                       bch_err(c, "%s: error from bch2_gc_mark_key: %s",
+                               __func__, bch2_err_str(ret));
                        goto fsck_err;
                }
 
@@ -896,7 +949,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
                        bch2_bkey_buf_reassemble(&cur, c, k);
                        bch2_btree_and_journal_iter_advance(&iter);
 
-                       child = bch2_btree_node_get_noiter(c, cur.k,
+                       child = bch2_btree_node_get_noiter(trans, cur.k,
                                                b->c.btree_id, b->c.level - 1,
                                                false);
                        ret = PTR_ERR_OR_ZERO(child);
@@ -912,9 +965,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
                                          "  %s",
                                          bch2_btree_ids[b->c.btree_id],
                                          b->c.level - 1,
-                                         (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) &&
+                                         (printbuf_reset(&buf),
+                                          bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
                                    !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
-                                       ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+                                       ret = -BCH_ERR_need_topology_repair;
                                        bch_info(c, "Halting mark and sweep to start topology repair pass");
                                        goto fsck_err;
                                } else {
@@ -925,8 +979,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
                                        continue;
                                }
                        } else if (ret) {
-                               bch_err(c, "%s: error %i getting btree node",
-                                       __func__, ret);
+                               bch_err(c, "%s: error getting btree node: %s",
+                                       __func__, bch2_err_str(ret));
                                break;
                        }
 
@@ -942,6 +996,7 @@ fsck_err:
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        bch2_btree_and_journal_iter_exit(&iter);
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -951,11 +1006,8 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree *b;
-       unsigned target_depth = metadata_only           ? 1
-               : bch2_expensive_debug_checks           ? 0
-               : !btree_node_type_needs_gc(btree_id)   ? 1
-               : 0;
-       char buf[100];
+       unsigned target_depth = metadata_only ? 1 : 0;
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        b = c->btree_roots[btree_id].b;
@@ -964,19 +1016,21 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
                return 0;
 
        six_lock_read(&b->c.lock, NULL, NULL);
+       printbuf_reset(&buf);
+       bch2_bpos_to_text(&buf, b->data->min_key);
        if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
-                       "btree root with incorrect min_key: %s",
-                       (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
+                       "btree root with incorrect min_key: %s", buf.buf)) {
                bch_err(c, "repair unimplemented");
-               ret = FSCK_ERR_EXIT;
+               ret = -BCH_ERR_fsck_repair_unimplemented;
                goto fsck_err;
        }
 
+       printbuf_reset(&buf);
+       bch2_bpos_to_text(&buf, b->data->max_key);
        if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c,
-                       "btree root with incorrect max_key: %s",
-                       (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
+                       "btree root with incorrect max_key: %s", buf.buf)) {
                bch_err(c, "repair unimplemented");
-               ret = FSCK_ERR_EXIT;
+               ret = -BCH_ERR_fsck_repair_unimplemented;
                goto fsck_err;
        }
 
@@ -993,7 +1047,8 @@ fsck_err:
        six_unlock_read(&b->c.lock);
 
        if (ret < 0)
-               bch_err(c, "%s: ret %i", __func__, ret);
+               bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -1012,6 +1067,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 
        bch2_trans_init(&trans, c, 0, 0);
 
+       if (initial)
+               trans.is_initial_gc = true;
+
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
        bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
@@ -1022,7 +1080,7 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
                        : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
 
        if (ret < 0)
-               bch_err(c, "%s: ret %i", __func__, ret);
+               bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
 
        bch2_trans_exit(&trans);
        return ret;
@@ -1113,10 +1171,10 @@ static void bch2_gc_free(struct bch_fs *c)
        genradix_free(&c->gc_stripes);
 
        for_each_member_device(ca, c, i) {
-               kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+               kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
                        sizeof(struct bucket_array) +
                        ca->mi.nbuckets * sizeof(struct bucket));
-               ca->buckets[1] = NULL;
+               ca->buckets_gc = NULL;
 
                free_percpu(ca->usage_gc);
                ca->usage_gc = NULL;
@@ -1130,29 +1188,29 @@ static int bch2_gc_done(struct bch_fs *c,
                        bool initial, bool metadata_only)
 {
        struct bch_dev *ca = NULL;
-       bool verify = !metadata_only && (!initial ||
-                      (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+       struct printbuf buf = PRINTBUF;
+       bool verify = !metadata_only &&
+               !c->opts.reconstruct_alloc &&
+               (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
        unsigned i, dev;
        int ret = 0;
 
        percpu_down_write(&c->mark_lock);
 
 #define copy_field(_f, _msg, ...)                                      \
-       if (dst->_f != src->_f) {                                       \
-               if (verify)                                             \
-                       fsck_err(c, _msg ": got %llu, should be %llu"   \
-                               , ##__VA_ARGS__, dst->_f, src->_f);     \
-               dst->_f = src->_f;                                      \
-       }
+       if (dst->_f != src->_f &&                                       \
+           (!verify ||                                                 \
+            fsck_err(c, _msg ": got %llu, should be %llu"              \
+                     , ##__VA_ARGS__, dst->_f, src->_f)))              \
+               dst->_f = src->_f
 #define copy_stripe_field(_f, _msg, ...)                               \
-       if (dst->_f != src->_f) {                                       \
-               if (verify)                                             \
-                       fsck_err(c, "stripe %zu has wrong "_msg         \
-                               ": got %u, should be %u",               \
-                               iter.pos, ##__VA_ARGS__,                \
-                               dst->_f, src->_f);                      \
-               dst->_f = src->_f;                                      \
-       }
+       if (dst->_f != src->_f &&                                       \
+           (!verify ||                                                 \
+            fsck_err(c, "stripe %zu has wrong "_msg                    \
+                     ": got %u, should be %u",                         \
+                     iter.pos, ##__VA_ARGS__,                          \
+                     dst->_f, src->_f)))                               \
+               dst->_f = src->_f
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
 #define copy_fs_field(_f, _msg, ...)                                   \
@@ -1168,7 +1226,6 @@ static int bch2_gc_done(struct bch_fs *c,
                                             dev_usage_u64s());
 
                copy_dev_field(buckets_ec,              "buckets_ec");
-               copy_dev_field(buckets_unavailable,     "buckets_unavailable");
 
                for (i = 0; i < BCH_DATA_NR; i++) {
                        copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
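
The copy_field()/copy_stripe_field() macros above change from an if-block that reported a mismatch and then copied unconditionally, to a single expression in which the copy happens only if verification is off or fsck_err() agrees to fix, via short-circuit && / ||. A simplified standalone version of that new shape is sketched below; report_err() stands in for fsck_err(), and the struct and field names are invented.

/* Sketch of the expression-style copy_field() macro: copy a mismatching
 * field only when verification is disabled or the report callback says
 * "fix it".
 */
#include <stdbool.h>
#include <stdio.h>

struct usage {
	unsigned long long buckets;
	unsigned long long sectors;
};

/* Pretend the user always answers "yes, fix it": */
static bool report_err(const char *msg, unsigned long long got,
		       unsigned long long want)
{
	fprintf(stderr, "%s: got %llu, should be %llu\n", msg, got, want);
	return true;
}

#define copy_field(_f, _msg)						\
	if (dst->_f != src->_f &&					\
	    (!verify ||							\
	     report_err(_msg, dst->_f, src->_f)))			\
		dst->_f = src->_f

int main(void)
{
	struct usage disk = { .buckets = 100, .sectors = 4096 };
	struct usage gc	  = { .buckets = 101, .sectors = 4096 };
	struct usage *dst = &disk, *src = &gc;
	bool verify = true;

	copy_field(buckets, "wrong bucket count");
	copy_field(sectors, "wrong sector count");	/* equal: nothing reported */

	printf("buckets now %llu\n", dst->buckets);
	return 0;
}
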
@@ -1200,16 +1257,16 @@ static int bch2_gc_done(struct bch_fs *c,
                for (i = 0; i < c->replicas.nr; i++) {
                        struct bch_replicas_entry *e =
                                cpu_replicas_entry(&c->replicas, i);
-                       char buf[80];
 
                        if (metadata_only &&
                            (e->data_type == BCH_DATA_user ||
                             e->data_type == BCH_DATA_cached))
                                continue;
 
-                       bch2_replicas_entry_to_text(&PBUF(buf), e);
+                       printbuf_reset(&buf);
+                       bch2_replicas_entry_to_text(&buf, e);
 
-                       copy_fs_field(replicas[i], "%s", buf);
+                       copy_fs_field(replicas[i], "%s", buf.buf);
                }
        }
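
Note on the copy_field()/copy_stripe_field() macros rewritten at the top of this function: the old "if the counters differ, maybe complain, then always overwrite" logic is folded into a single short-circuit condition, so with verify set the destination is only overwritten if fsck_err() accepts the report. A tiny standalone C sketch of the same shape, with a hypothetical fix_approved() standing in for fsck_err() (not the real API):

    #include <stdbool.h>
    #include <stdio.h>

    /* fix_approved() is a stand-in for fsck_err(); illustration only. */
    static bool fix_approved(const char *msg)
    {
            printf("fsck: %s\n", msg);
            return true;            /* "yes, go ahead and repair it" */
    }

    #define copy_field(dst, src, verify, msg)               \
            if ((dst) != (src) &&                           \
                (!(verify) || fix_approved(msg)))           \
                    (dst) = (src)

    int main(void)
    {
            unsigned long long on_disk = 7, from_gc = 9;

            copy_field(on_disk, from_gc, true, "fs usage has wrong value");
            printf("on_disk = %llu\n", on_disk);    /* 9: repair was approved */
            return 0;
    }

The practical difference from the old macro is that, when verifying, a mismatch that fsck_err() does not flag for repair no longer forces the overwrite.
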
 
@@ -1221,9 +1278,10 @@ fsck_err:
        if (ca)
                percpu_ref_put(&ca->ref);
        if (ret)
-               bch_err(c, "%s: ret %i", __func__, ret);
+               bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
 
        percpu_up_write(&c->mark_lock);
+       printbuf_exit(&buf);
        return ret;
 }
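
bch2_gc_done() is one of many places this commit moves from a fixed on-stack char buf[N] plus PBUF() to a heap-backed struct printbuf: declared as PRINTBUF, reset between uses with printbuf_reset(), printed via buf.buf, and torn down once with printbuf_exit(). A minimal standalone sketch of that allocate-on-demand/reset/free lifecycle, using a simplified stand-in type rather than the real printbuf API:

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified stand-in for struct printbuf: a growable, reusable string. */
    struct strbuf { char *buf; size_t pos, size; };
    #define STRBUF ((struct strbuf) { NULL, 0, 0 })

    static void strbuf_reset(struct strbuf *b)
    {
            b->pos = 0;
            if (b->buf)
                    b->buf[0] = '\0';
    }

    static void strbuf_exit(struct strbuf *b)
    {
            free(b->buf);
            *b = STRBUF;
    }

    static void strbuf_printf(struct strbuf *b, const char *fmt, ...)
    {
            va_list args;
            int len;

            va_start(args, fmt);
            len = vsnprintf(NULL, 0, fmt, args);        /* measure */
            va_end(args);

            if (b->pos + len + 1 > b->size) {           /* grow on demand */
                    b->size = (b->pos + len + 1) * 2;
                    b->buf = realloc(b->buf, b->size);  /* error handling elided */
            }

            va_start(args, fmt);
            vsnprintf(b->buf + b->pos, b->size - b->pos, fmt, args);
            va_end(args);
            b->pos += len;
    }

    int main(void)
    {
            struct strbuf buf = STRBUF;

            for (int i = 0; i < 3; i++) {
                    strbuf_reset(&buf);                 /* like printbuf_reset() per entry */
                    strbuf_printf(&buf, "replicas entry %d", i);
                    puts(buf.buf);
            }
            strbuf_exit(&buf);                          /* like printbuf_exit() at the end */
            return 0;
    }

As far as this diff shows, the point of the change is that output is no longer limited to a guessed buffer size, at the cost of an explicit printbuf_exit() on each exit path.
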
 
@@ -1243,7 +1301,7 @@ static int bch2_gc_start(struct bch_fs *c,
        }
 
        for_each_member_device(ca, c, i) {
-               BUG_ON(ca->buckets[1]);
+               BUG_ON(ca->buckets_gc);
                BUG_ON(ca->usage_gc);
 
                ca->usage_gc = alloc_percpu(struct bch_dev_usage);
@@ -1252,89 +1310,123 @@ static int bch2_gc_start(struct bch_fs *c,
                        percpu_ref_put(&ca->ref);
                        return -ENOMEM;
                }
+
+               this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+                              ca->mi.nbuckets - ca->mi.first_bucket);
        }
 
        return 0;
 }
 
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+                                    struct bch_alloc_v4 r)
+{
+       return  l.gen != r.gen                          ||
+               l.oldest_gen != r.oldest_gen            ||
+               l.data_type != r.data_type              ||
+               l.dirty_sectors != r.dirty_sectors      ||
+               l.cached_sectors != r.cached_sectors     ||
+               l.stripe_redundancy != r.stripe_redundancy ||
+               l.stripe != r.stripe;
+}
+
 static int bch2_alloc_write_key(struct btree_trans *trans,
                                struct btree_iter *iter,
-                               bool initial, bool metadata_only)
+                               struct bkey_s_c k,
+                               bool metadata_only)
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
-       struct bucket *g;
-       struct bkey_s_c k;
-       struct bkey_alloc_unpacked old_u, new_u, gc_u;
-       struct bkey_alloc_buf *a;
+       struct bucket gc, *b;
+       struct bkey_i_alloc_v4 *a;
+       struct bch_alloc_v4 old, new;
+       enum bch_data_type type;
        int ret;
 
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
+       if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+               return 1;
 
-       old_u = new_u = bch2_alloc_unpack(k);
+       bch2_alloc_to_v4(k, &old);
+       new = old;
 
        percpu_down_read(&c->mark_lock);
-       g       = gc_bucket(ca, iter->pos.offset);
-       gc_u = (struct bkey_alloc_unpacked) {
-               .dev            = iter->pos.inode,
-               .bucket         = iter->pos.offset,
-               .gen            = g->mark.gen,
-               .data_type      = g->mark.data_type,
-               .dirty_sectors  = g->mark.dirty_sectors,
-               .cached_sectors = g->mark.cached_sectors,
-               .read_time      = g->io_time[READ],
-               .write_time     = g->io_time[WRITE],
-               .stripe         = g->stripe,
-               .stripe_redundancy = g->stripe_redundancy,
-       };
+       b = gc_bucket(ca, iter->pos.offset);
+
+       /*
+        * b->data_type doesn't yet include need_discard & need_gc_gen states -
+        * fix that here:
+        */
+       type = __alloc_data_type(b->dirty_sectors,
+                                b->cached_sectors,
+                                b->stripe,
+                                old,
+                                b->data_type);
+       if (b->data_type != type) {
+               struct bch_dev_usage *u;
+
+               preempt_disable();
+               u = this_cpu_ptr(ca->usage_gc);
+               u->d[b->data_type].buckets--;
+               b->data_type = type;
+               u->d[b->data_type].buckets++;
+               preempt_enable();
+       }
+
+       gc = *b;
        percpu_up_read(&c->mark_lock);
 
        if (metadata_only &&
-           gc_u.data_type != BCH_DATA_sb &&
-           gc_u.data_type != BCH_DATA_journal &&
-           gc_u.data_type != BCH_DATA_btree)
+           gc.data_type != BCH_DATA_sb &&
+           gc.data_type != BCH_DATA_journal &&
+           gc.data_type != BCH_DATA_btree)
                return 0;
 
-       if (gen_after(old_u.gen, gc_u.gen))
+       if (gen_after(old.gen, gc.gen))
                return 0;
 
 #define copy_bucket_field(_f)                                          \
-       if (fsck_err_on(new_u._f != gc_u._f, c,                         \
+       if (c->opts.reconstruct_alloc ||                                \
+           fsck_err_on(new._f != gc._f, c,                             \
                        "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
                        ": got %u, should be %u",                       \
                        iter->pos.inode, iter->pos.offset,              \
-                       new_u.gen,                                      \
-                       bch2_data_types[new_u.data_type],               \
-                       new_u._f, gc_u._f))                             \
-               new_u._f = gc_u._f;                                     \
+                       gc.gen,                                         \
+                       bch2_data_types[gc.data_type],                  \
+                       new._f, gc._f))                                 \
+               new._f = gc._f;                                         \
 
        copy_bucket_field(gen);
        copy_bucket_field(data_type);
-       copy_bucket_field(stripe);
        copy_bucket_field(dirty_sectors);
        copy_bucket_field(cached_sectors);
        copy_bucket_field(stripe_redundancy);
        copy_bucket_field(stripe);
 #undef copy_bucket_field
 
-       if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+       if (!bch2_alloc_v4_cmp(old, new))
                return 0;
 
-       a = bch2_alloc_pack(trans, new_u);
-       if (IS_ERR(a))
-               return PTR_ERR(a);
+       a = bch2_alloc_to_v4_mut(trans, k);
+       ret = PTR_ERR_OR_ZERO(a);
+       if (ret)
+               return ret;
+
+       a->v = new;
 
-       ret = initial
-               ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
-               : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+       /*
+        * The trigger normally makes sure this is set, but we're not running
+        * triggers:
+        */
+       if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+               a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+       ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
 fsck_err:
        return ret;
 }
 
-static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 {
        struct btree_trans trans;
        struct btree_iter iter;
@@ -1346,37 +1438,33 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_member_device(ca, c, i) {
-               for_each_btree_key(&trans, iter, BTREE_ID_alloc,
-                                  POS(ca->dev_idx, ca->mi.first_bucket),
-                                  BTREE_ITER_SLOTS|
-                                  BTREE_ITER_PREFETCH, k, ret) {
-                       if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
-                               break;
-
-                       ret = __bch2_trans_do(&trans, NULL, NULL,
-                                             BTREE_INSERT_LAZY_RW,
-                                       bch2_alloc_write_key(&trans, &iter,
-                                                            initial, metadata_only));
-                       if (ret)
-                               break;
-               }
-               bch2_trans_iter_exit(&trans, &iter);
-
-               if (ret) {
-                       bch_err(c, "error writing alloc info: %i", ret);
+               ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+                               POS(ca->dev_idx, ca->mi.first_bucket),
+                               BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+                               NULL, NULL, BTREE_INSERT_LAZY_RW,
+                       bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+
+               if (ret < 0) {
+                       bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
                        percpu_ref_put(&ca->ref);
                        break;
                }
        }
 
        bch2_trans_exit(&trans);
-       return ret;
+       return ret < 0 ? ret : 0;
 }
 
-static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
        struct bch_dev *ca;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bucket *g;
+       struct bch_alloc_v4 a;
        unsigned i;
+       int ret;
 
        for_each_member_device(ca, c, i) {
                struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
@@ -1384,119 +1472,147 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_onl
                                GFP_KERNEL|__GFP_ZERO);
                if (!buckets) {
                        percpu_ref_put(&ca->ref);
-                       percpu_up_write(&c->mark_lock);
                        bch_err(c, "error allocating ca->buckets[gc]");
                        return -ENOMEM;
                }
 
                buckets->first_bucket   = ca->mi.first_bucket;
                buckets->nbuckets       = ca->mi.nbuckets;
-               rcu_assign_pointer(ca->buckets[1], buckets);
+               rcu_assign_pointer(ca->buckets_gc, buckets);
        };
 
-       return bch2_alloc_read(c, true, metadata_only);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               ca = bch_dev_bkey_exists(c, k.k->p.inode);
+               g = gc_bucket(ca, k.k->p.offset);
+
+               bch2_alloc_to_v4(k, &a);
+
+               g->gen_valid    = 1;
+               g->gen          = a.gen;
+
+               if (metadata_only &&
+                   (a.data_type == BCH_DATA_user ||
+                    a.data_type == BCH_DATA_cached ||
+                    a.data_type == BCH_DATA_parity)) {
+                       g->data_type            = a.data_type;
+                       g->dirty_sectors        = a.dirty_sectors;
+                       g->cached_sectors       = a.cached_sectors;
+                       g->stripe               = a.stripe;
+                       g->stripe_redundancy    = a.stripe_redundancy;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+
+       if (ret)
+               bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
+
+       return ret;
 }
 
-static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
 {
        struct bch_dev *ca;
        unsigned i;
 
        for_each_member_device(ca, c, i) {
-               struct bucket_array *buckets = __bucket_array(ca, true);
+               struct bucket_array *buckets = gc_bucket_array(ca);
                struct bucket *g;
 
                for_each_bucket(g, buckets) {
                        if (metadata_only &&
-                           (g->mark.data_type == BCH_DATA_user ||
-                            g->mark.data_type == BCH_DATA_cached ||
-                            g->mark.data_type == BCH_DATA_parity))
+                           (g->data_type == BCH_DATA_user ||
+                            g->data_type == BCH_DATA_cached ||
+                            g->data_type == BCH_DATA_parity))
                                continue;
-                       g->_mark.dirty_sectors = 0;
-                       g->_mark.cached_sectors = 0;
+                       g->data_type = 0;
+                       g->dirty_sectors = 0;
+                       g->cached_sectors = 0;
                }
        };
 }
 
-static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
-                               bool metadata_only)
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+                                    struct btree_iter *iter,
+                                    struct bkey_s_c k,
+                                    size_t *idx)
 {
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct bch_fs *c = trans->c;
+       const __le64 *refcount = bkey_refcount_c(k);
+       struct printbuf buf = PRINTBUF;
        struct reflink_gc *r;
-       size_t idx = 0;
-       char buf[200];
        int ret = 0;
 
-       if (metadata_only)
+       if (!refcount)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+              r->offset < k.k->p.offset)
+               ++*idx;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               const __le64 *refcount = bkey_refcount_c(k);
-
-               if (!refcount)
-                       continue;
+       if (!r ||
+           r->offset != k.k->p.offset ||
+           r->size != k.k->size) {
+               bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+               return -EINVAL;
+       }
 
-               r = genradix_ptr(&c->reflink_gc_table, idx++);
-               if (!r ||
-                   r->offset != k.k->p.offset ||
-                   r->size != k.k->size) {
-                       bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
-                       ret = -EINVAL;
-                       break;
-               }
+       if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+                       "reflink key has wrong refcount:\n"
+                       "  %s\n"
+                       "  should be %u",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+                       r->refcount)) {
+               struct bkey_i *new;
 
-               if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
-                               "reflink key has wrong refcount:\n"
-                               "  %s\n"
-                               "  should be %u",
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-                               r->refcount)) {
-                       struct bkey_i *new;
-
-                       new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-                       if (!new) {
-                               ret = -ENOMEM;
-                               break;
-                       }
+               new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+               ret = PTR_ERR_OR_ZERO(new);
+               if (ret)
+                       return ret;
 
-                       bkey_reassemble(new, k);
-
-                       if (!r->refcount) {
-                               new->k.type = KEY_TYPE_deleted;
-                               /*
-                                * XXX ugly: bch2_journal_key_insert() queues up
-                                * the key for the journal replay code, which
-                                * doesn't run the extent overwrite pass
-                                */
-                               if (initial)
-                                       new->k.size = 0;
-                       } else {
-                               *bkey_refcount(new) = cpu_to_le64(r->refcount);
-                       }
+               bkey_reassemble(new, k);
 
-                       ret = initial
-                              ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
-                              : __bch2_trans_do(&trans, NULL, NULL, 0,
-                                       __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
-                       kfree(new);
+               if (!r->refcount)
+                       new->k.type = KEY_TYPE_deleted;
+               else
+                       *bkey_refcount(new) = cpu_to_le64(r->refcount);
 
-                       if (ret)
-                               break;
-               }
+               ret = bch2_trans_update(trans, iter, new, 0);
        }
 fsck_err:
-       bch2_trans_iter_exit(&trans, &iter);
+       printbuf_exit(&buf);
+       return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       size_t idx = 0;
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_reflink, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL,
+               bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
+
        c->reflink_gc_nr = 0;
        bch2_trans_exit(&trans);
        return ret;
 }
 
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+static int bch2_gc_reflink_start(struct bch_fs *c,
                                 bool metadata_only)
 {
        struct btree_trans trans;
@@ -1535,8 +1651,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
        return ret;
 }
 
-static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
-                                 bool metadata_only)
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
 {
        struct genradix_iter iter;
        struct reflink_gc *r;
@@ -1545,71 +1660,77 @@ static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
                r->refcount = 0;
 }
 
-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
-                               bool metadata_only)
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+                                    struct btree_iter *iter,
+                                    struct bkey_s_c k)
 {
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct gc_stripe *m;
+       struct bch_fs *c = trans->c;
+       struct printbuf buf = PRINTBUF;
        const struct bch_stripe *s;
-       char buf[200];
+       struct gc_stripe *m;
        unsigned i;
        int ret = 0;
 
-       if (metadata_only)
+       if (k.k->type != KEY_TYPE_stripe)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               if (k.k->type != KEY_TYPE_stripe)
-                       continue;
+       s = bkey_s_c_to_stripe(k).v;
+       m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
 
-               s = bkey_s_c_to_stripe(k).v;
-               m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
-               for (i = 0; i < s->nr_blocks; i++)
-                       if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
-                               goto inconsistent;
-               continue;
+       for (i = 0; i < s->nr_blocks; i++)
+               if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+                       goto inconsistent;
+       return 0;
 inconsistent:
-               if (fsck_err_on(true, c,
-                               "stripe has wrong block sector count %u:\n"
-                               "  %s\n"
-                               "  should be %u", i,
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-                               m ? m->block_sectors[i] : 0)) {
-                       struct bkey_i_stripe *new;
-
-                       new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-                       if (!new) {
-                               ret = -ENOMEM;
-                               break;
-                       }
+       if (fsck_err_on(true, c,
+                       "stripe has wrong block sector count %u:\n"
+                       "  %s\n"
+                       "  should be %u", i,
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+                       m ? m->block_sectors[i] : 0)) {
+               struct bkey_i_stripe *new;
+
+               new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+               ret = PTR_ERR_OR_ZERO(new);
+               if (ret)
+                       return ret;
 
-                       bkey_reassemble(&new->k_i, k);
+               bkey_reassemble(&new->k_i, k);
 
-                       for (i = 0; i < new->v.nr_blocks; i++)
-                               stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+               for (i = 0; i < new->v.nr_blocks; i++)
+                       stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
 
-                       ret = initial
-                               ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
-                               : __bch2_trans_do(&trans, NULL, NULL, 0,
-                                       __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
-                       kfree(new);
-               }
+               ret = bch2_trans_update(trans, iter, &new->k_i, 0);
        }
 fsck_err:
-       bch2_trans_iter_exit(&trans, &iter);
+       printbuf_exit(&buf);
+       return ret;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_stripes, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL,
+               bch2_gc_write_stripes_key(&trans, &iter, k));
 
        bch2_trans_exit(&trans);
        return ret;
 }
 
-static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
-                               bool metadata_only)
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
 {
        genradix_free(&c->gc_stripes);
 }
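
bch2_gc_alloc_done(), bch2_gc_reflink_done() and bch2_gc_stripes_done() all move from an open-coded iterate-and-commit loop to for_each_btree_key_commit(), with the per-key work split out into bch2_alloc_write_key(), bch2_gc_write_reflink_key() and bch2_gc_write_stripes_key(). Going only by the loops deleted above (not by the macro's actual definition), the helper stands in for roughly this shape, where write_key() is a placeholder for the per-key callback:

    /*
     * Rough reconstruction of what the removed loops spelled out by hand;
     * illustration only, not the real macro body.
     */
    for_each_btree_key(&trans, iter, btree_id, start_pos, iter_flags, k, ret) {
            ret = __bch2_trans_do(&trans, NULL, NULL, commit_flags,
                                  write_key(&trans, &iter, k));  /* one commit per key */
            if (ret)
                    break;
    }
    bch2_trans_iter_exit(&trans, &iter);

Two side effects are visible in the callers: replacement keys are now allocated with bch2_trans_kmalloc() instead of kmalloc()/kfree() (presumably so their lifetime is tied to the transaction), and a positive return from the per-key function means "stop iterating" rather than "error" (bch2_alloc_write_key() returns 1 once past the device's last bucket), hence the new ret < 0 checks.
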
@@ -1634,23 +1755,18 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
  */
 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       struct bch_dev *ca;
-       u64 start_time = local_clock();
-       unsigned i, iter = 0;
+       unsigned iter = 0;
        int ret;
 
        lockdep_assert_held(&c->state_lock);
-       trace_gc_start(c);
 
        down_write(&c->gc_lock);
 
-       /* flush interior btree updates: */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        ret   = bch2_gc_start(c, metadata_only) ?:
-               bch2_gc_alloc_start(c, initial, metadata_only) ?:
-               bch2_gc_reflink_start(c, initial, metadata_only);
+               bch2_gc_alloc_start(c, metadata_only) ?:
+               bch2_gc_reflink_start(c, metadata_only);
        if (ret)
                goto out;
 again:
@@ -1661,26 +1777,27 @@ again:
        if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
            c->opts.fix_errors != FSCK_OPT_NO) {
-               bch_info(c, "starting topology repair pass");
+               bch_info(c, "Starting topology repair pass");
                ret = bch2_repair_topology(c);
                if (ret)
                        goto out;
-               bch_info(c, "topology repair pass done");
+               bch_info(c, "Topology repair pass done");
 
                set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
        }
 
        ret = bch2_gc_btrees(c, initial, metadata_only);
 
-       if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+       if (ret == -BCH_ERR_need_topology_repair &&
            !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true);
                ret = 0;
        }
 
-       if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR)
-               ret = FSCK_ERR_EXIT;
+       if (ret == -BCH_ERR_need_topology_repair)
+               ret = -BCH_ERR_fsck_errors_not_fixed;
 
        if (ret)
                goto out;
@@ -1705,9 +1822,9 @@ again:
                clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
-               bch2_gc_stripes_reset(c, initial, metadata_only);
-               bch2_gc_alloc_reset(c, initial, metadata_only);
-               bch2_gc_reflink_reset(c, initial, metadata_only);
+               bch2_gc_stripes_reset(c, metadata_only);
+               bch2_gc_alloc_reset(c, metadata_only);
+               bch2_gc_reflink_reset(c, metadata_only);
 
                /* flush fsck errors, reset counters */
                bch2_flush_fsck_errs(c);
@@ -1717,9 +1834,9 @@ out:
        if (!ret) {
                bch2_journal_block(&c->journal);
 
-               ret   = bch2_gc_stripes_done(c, initial, metadata_only) ?:
-                       bch2_gc_reflink_done(c, initial, metadata_only) ?:
-                       bch2_gc_alloc_done(c, initial, metadata_only) ?:
+               ret   = bch2_gc_stripes_done(c, metadata_only) ?:
+                       bch2_gc_reflink_done(c, metadata_only) ?:
+                       bch2_gc_alloc_done(c, metadata_only) ?:
                        bch2_gc_done(c, initial, metadata_only);
 
                bch2_journal_unblock(&c->journal);
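
The ret = bch2_gc_stripes_done(c, ...) ?: bch2_gc_reflink_done(c, ...) ?: ... chain above, like the start-up chain earlier in bch2_gc(), uses the GNU C "x ?: y" extension: it evaluates to x if x is non-zero and to y otherwise, so int-returning steps run in order and the chain stops at the first non-zero error. A standalone illustration, unrelated to bcachefs itself:

    #include <stdio.h>

    static int step_ok(void)    { puts("step_ok");    return 0; }
    static int step_fail(void)  { puts("step_fail");  return -1; }
    static int step_never(void) { puts("step_never"); return 0; }

    int main(void)
    {
            int ret = step_ok() ?:
                      step_fail() ?:
                      step_never();     /* never called: step_fail() already returned non-zero */

            printf("ret = %d\n", ret);  /* -1 */
            return 0;
    }
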
@@ -1734,16 +1851,6 @@ out:
 
        up_write(&c->gc_lock);
 
-       trace_gc_end(c);
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-
-       /*
-        * Wake up allocator in case it was waiting for buckets
-        * because of not being able to inc gens
-        */
-       for_each_member_device(ca, c, i)
-               bch2_wake_allocator(ca);
-
        /*
         * At startup, allocations can happen directly instead of via the
         * allocator thread - issue wakeup in case they blocked on gc_lock:
@@ -1752,10 +1859,15 @@ out:
        return ret;
 }
 
-static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
+static int gc_btree_gens_key(struct btree_trans *trans,
+                            struct btree_iter *iter,
+                            struct bkey_s_c k)
 {
+       struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
+       struct bkey_i *u;
+       int ret;
 
        percpu_down_read(&c->mark_lock);
        bkey_for_each_ptr(ptrs, ptr) {
@@ -1763,7 +1875,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
 
                if (ptr_stale(ca, ptr) > 16) {
                        percpu_up_read(&c->mark_lock);
-                       return true;
+                       goto update;
                }
        }
 
@@ -1775,84 +1887,41 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
                        *gen = ptr->gen;
        }
        percpu_up_read(&c->mark_lock);
+       return 0;
+update:
+       u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+       ret = PTR_ERR_OR_ZERO(u);
+       if (ret)
+               return ret;
 
-       return false;
-}
-
-/*
- * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
- * node pointers currently never have cached pointers that can become stale:
- */
-static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_buf sk;
-       int ret = 0, commit_err = 0;
-
-       bch2_bkey_buf_init(&sk);
-
-       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       while ((bch2_trans_begin(trans),
-               k = bch2_btree_iter_peek(&iter)).k) {
-               ret = bkey_err(k);
-
-               if (ret == -EINTR)
-                       continue;
-               if (ret)
-                       break;
-
-               c->gc_gens_pos = iter.pos;
-
-               if (gc_btree_gens_key(c, k) && !commit_err) {
-                       bch2_bkey_buf_reassemble(&sk, c, k);
-                       bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
-                       commit_err =
-                               bch2_trans_update(trans, &iter, sk.k, 0) ?:
-                               bch2_trans_commit(trans, NULL, NULL,
-                                                 BTREE_INSERT_NOWAIT|
-                                                 BTREE_INSERT_NOFAIL);
-                       if (commit_err == -EINTR) {
-                               commit_err = 0;
-                               continue;
-                       }
-               }
-
-               bch2_btree_iter_advance(&iter);
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       bch2_bkey_buf_exit(&sk, c);
+       bkey_reassemble(u, k);
 
-       return ret;
+       bch2_extent_normalize(c, bkey_i_to_s(u));
+       return bch2_trans_update(trans, iter, u, 0);
 }
 
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
+                                      struct bkey_s_c k)
 {
        struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
-       struct bkey_s_c k;
-       struct bkey_alloc_unpacked u;
+       struct bch_alloc_v4 a;
+       struct bkey_i_alloc_v4 *a_mut;
        int ret;
 
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       u = bch2_alloc_unpack(k);
+       bch2_alloc_to_v4(k, &a);
 
-       if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+       if (a.oldest_gen == ca->oldest_gen[iter->pos.offset])
                return 0;
 
-       u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+       a_mut = bch2_alloc_to_v4_mut(trans, k);
+       ret = PTR_ERR_OR_ZERO(a_mut);
+       if (ret)
+               return ret;
+
+       a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+       a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
 
-       return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+       return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
 }
 
 int bch2_gc_gens(struct bch_fs *c)
@@ -1873,6 +1942,7 @@ int bch2_gc_gens(struct bch_fs *c)
        if (!mutex_trylock(&c->gc_gens_lock))
                return 0;
 
+       trace_and_count(c, gc_gens_start, c);
        down_read(&c->gc_lock);
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -1896,27 +1966,36 @@ int bch2_gc_gens(struct bch_fs *c)
        }
 
        for (i = 0; i < BTREE_ID_NR; i++)
-               if ((1 << i) & BTREE_ID_HAS_PTRS) {
+               if (btree_type_has_ptrs(i)) {
+                       struct btree_iter iter;
+                       struct bkey_s_c k;
+
                        c->gc_gens_btree = i;
                        c->gc_gens_pos = POS_MIN;
-                       ret = bch2_gc_btree_gens(&trans, i);
-                       if (ret) {
-                               bch_err(c, "error recalculating oldest_gen: %i", ret);
+                       ret = for_each_btree_key_commit(&trans, iter, i,
+                                       POS_MIN,
+                                       BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+                                       k,
+                                       NULL, NULL,
+                                       BTREE_INSERT_NOFAIL,
+                               gc_btree_gens_key(&trans, &iter, k));
+                       if (ret && ret != -EROFS)
+                               bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+                       if (ret)
                                goto err;
-                       }
                }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL,
-                               bch2_alloc_write_oldest_gen(&trans, &iter));
-               if (ret) {
-                       bch_err(c, "error writing oldest_gen: %i", ret);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+                       POS_MIN,
+                       BTREE_ITER_PREFETCH,
+                       k,
+                       NULL, NULL,
+                       BTREE_INSERT_NOFAIL,
+               bch2_alloc_write_oldest_gen(&trans, &iter, k));
+       if (ret && ret != -EROFS)
+               bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+       if (ret)
+               goto err;
 
        c->gc_gens_btree        = 0;
        c->gc_gens_pos          = POS_MIN;
@@ -1924,6 +2003,7 @@ int bch2_gc_gens(struct bch_fs *c)
        c->gc_count++;
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+       trace_and_count(c, gc_gens_end, c);
 err:
        for_each_member_device(ca, c, i) {
                kvfree(ca->oldest_gen);
@@ -1985,7 +2065,7 @@ static int bch2_gc_thread(void *arg)
                ret = bch2_gc_gens(c);
 #endif
                if (ret < 0)
-                       bch_err(c, "btree gc failed: %i", ret);
+                       bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
 
                debug_check_no_locks_held();
        }
@@ -2015,7 +2095,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
        p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
        if (IS_ERR(p)) {
-               bch_err(c, "error creating gc thread: %li", PTR_ERR(p));
+               bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
                return PTR_ERR(p);
        }
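
Throughout this file, integer error logging ("ret %i") becomes bch2_err_str(ret), and magic fsck constants such as FSCK_ERR_START_TOPOLOGY_REPAIR turn into negated private error codes (-BCH_ERR_need_topology_repair, -BCH_ERR_fsck_errors_not_fixed). The toy sketch below only illustrates the idea of a name table for negative private codes falling back to errno strings; the constants and the real bch2_err_str() table are not reproduced here:

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical private codes, placed above the errno range for the example. */
    enum { ERR_need_topology_repair = 2048, ERR_fsck_errors_not_fixed };

    static const char *err_str(int err)
    {
            switch (-err) {
            case ERR_need_topology_repair:  return "need_topology_repair";
            case ERR_fsck_errors_not_fixed: return "fsck_errors_not_fixed";
            default:                        return strerror(-err);
            }
    }

    int main(void)
    {
            printf("btree gc failed: %s\n", err_str(-ERR_need_topology_repair));
            printf("btree gc failed: %s\n", err_str(-5 /* -EIO on Linux */));
            return 0;
    }
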
 
index 0665f5941fcc5a6196c4a80b8e7c6f5479055ca6..95d803b5743de5bb9e8bc5d58a868e9b795c27cf 100644 (file)
@@ -102,4 +102,11 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
        return ret;
 }
 
+static inline void bch2_do_gc_gens(struct bch_fs *c)
+{
+       atomic_inc(&c->kick_gc);
+       if (c->gc_thread)
+               wake_up_process(c->gc_thread);
+}
+
 #endif /* _BCACHEFS_BTREE_GC_H */
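
The new bch2_do_gc_gens() helper bumps c->kick_gc and wakes the gc thread; the thread side (not shown in this hunk) presumably compares the counter against the value it last serviced. A standalone sketch of that kick-counter idiom using C11 atomics in place of the kernel's atomic_t and wake_up_process() (names here are illustrative, not bcachefs APIs):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct gc_state {
            atomic_uint kick;       /* bumped by anyone who wants a gc pass */
            unsigned    last_kick;  /* last value the worker serviced */
    };

    static void do_gc_kick(struct gc_state *g)
    {
            atomic_fetch_add(&g->kick, 1);
            /* in the kernel this is where wake_up_process() would go */
    }

    static bool gc_has_work(struct gc_state *g)
    {
            unsigned cur = atomic_load(&g->kick);

            if (cur == g->last_kick)
                    return false;
            g->last_kick = cur;
            return true;
    }

    int main(void)
    {
            struct gc_state g = { 0 };

            printf("work? %d\n", gc_has_work(&g));  /* 0 */
            do_gc_kick(&g);
            printf("work? %d\n", gc_has_work(&g));  /* 1 */
            printf("work? %d\n", gc_has_work(&g));  /* 0 again until the next kick */
            return 0;
    }
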
index a3651325a02209672ce5375421a1a3717c992782..dd6b536ced6a918ebd1152da352d05609c9658a6 100644 (file)
@@ -477,7 +477,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
                };
 
                if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
-                       bch2_btree_node_write(c, b, SIX_LOCK_write);
+                       bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
                        reinit_iter = true;
                }
        }
@@ -501,7 +501,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
 static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
                          struct btree *b)
 {
-       pr_buf(out, "%s level %u/%u\n  ",
+       prt_printf(out, "%s level %u/%u\n  ",
               bch2_btree_ids[b->c.btree_id],
               b->c.level,
               c->btree_roots[b->c.btree_id].level);
@@ -513,17 +513,20 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
                          struct btree *b, struct bset *i,
                          unsigned offset, int write)
 {
-       pr_buf(out, "error validating btree node ");
-       if (write)
-               pr_buf(out, "before write ");
+       prt_printf(out, bch2_log_msg(c, ""));
+       if (!write)
+               prt_str(out, "error validating btree node ");
+       else
+               prt_str(out, "corrupt btree node before write ");
        if (ca)
-               pr_buf(out, "on %s ", ca->name);
-       pr_buf(out, "at btree ");
+               prt_printf(out, "on %s ", ca->name);
+       prt_printf(out, "at btree ");
        btree_pos_to_text(out, c, b);
 
-       pr_buf(out, "\n  node offset %u", b->written);
+       prt_printf(out, "\n  node offset %u", b->written);
        if (i)
-               pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+               prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+       prt_str(out, ": ");
 }
 
 enum btree_err_type {
@@ -540,32 +543,25 @@ enum btree_validate_ret {
 #define btree_err(type, c, ca, b, i, msg, ...)                         \
 ({                                                                     \
        __label__ out;                                                  \
-       char _buf[300];                                                 \
-       char *_buf2 = _buf;                                             \
-       struct printbuf out = PBUF(_buf);                               \
-                                                                       \
-       _buf2 = kmalloc(4096, GFP_ATOMIC);                              \
-       if (_buf2)                                                      \
-               out = _PBUF(_buf2, 4986);                               \
+       struct printbuf out = PRINTBUF;                                 \
                                                                        \
        btree_err_msg(&out, c, ca, b, i, b->written, write);            \
-       pr_buf(&out, ": " msg, ##__VA_ARGS__);                          \
+       prt_printf(&out, msg, ##__VA_ARGS__);                           \
                                                                        \
        if (type == BTREE_ERR_FIXABLE &&                                \
            write == READ &&                                            \
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
-               mustfix_fsck_err(c, "%s", _buf2);                       \
+               mustfix_fsck_err(c, "%s", out.buf);                     \
                goto out;                                               \
        }                                                               \
                                                                        \
+       bch2_print_string_as_lines(KERN_ERR, out.buf);                  \
+                                                                       \
        switch (write) {                                                \
        case READ:                                                      \
-               if (_buf2)                                              \
-                       bch_err(c, "%s", _buf2);                        \
-                                                                       \
                switch (type) {                                         \
                case BTREE_ERR_FIXABLE:                                 \
-                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
                case BTREE_ERR_WANT_RETRY:                              \
                        if (have_retry) {                               \
@@ -577,22 +573,19 @@ enum btree_validate_ret {
                        ret = BTREE_RETRY_READ;                         \
                        goto fsck_err;                                  \
                case BTREE_ERR_FATAL:                                   \
-                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
                }                                                       \
                break;                                                  \
        case WRITE:                                                     \
-               bch_err(c, "corrupt metadata before write: %s", _buf2); \
-                                                                       \
                if (bch2_fs_inconsistent(c)) {                          \
-                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
                }                                                       \
                break;                                                  \
        }                                                               \
 out:                                                                   \
-       if (_buf2 != _buf)                                              \
-               kfree(_buf2);                                           \
+       printbuf_exit(&out);                                            \
        true;                                                           \
 })
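
For anyone reading btree_err() above for the first time: it is a GNU C statement expression, so it can declare a local __label__, jump to it to bail out of the macro body early, and the final "true;" is the value the whole macro yields, letting call sites use it in a condition. A minimal standalone example of the construct, with nothing bcachefs-specific in it:

    #include <stdio.h>

    #define check_even(x)                                           \
    ({                                                              \
            __label__ out;                                          \
            int __ok = 1;                                           \
                                                                    \
            if ((x) % 2) {                                          \
                    __ok = 0;                                       \
                    goto out;       /* early exit from the macro */ \
            }                                                       \
            printf("%d is even\n", (x));                            \
    out:                                                            \
            __ok;                   /* value of the ({ ... }) */    \
    })

    int main(void)
    {
            if (!check_even(3))
                    printf("3 is odd\n");
            check_even(4);
            return 0;
    }
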
 
@@ -624,7 +617,6 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
                                          (u64 *) vstruct_end(i) - (u64 *) k);
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
                        set_btree_bset_end(b, t);
-                       bch2_bset_set_no_aux_tree(b, t);
                }
 
                for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
@@ -634,10 +626,14 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
                if (k != vstruct_last(i)) {
                        i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
                        set_btree_bset_end(b, t);
-                       bch2_bset_set_no_aux_tree(b, t);
                }
        }
 
+       /*
+        * Always rebuild search trees: eytzinger search tree nodes directly
+        * depend on the values of min/max key:
+        */
+       bch2_bset_set_no_aux_tree(b, b->set);
        bch2_btree_build_aux_trees(b);
 
        for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
@@ -653,8 +649,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
 {
        unsigned version = le16_to_cpu(i->version);
        const char *err;
-       char buf1[100];
-       char buf2[100];
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
        int ret = 0;
 
        btree_err_on((version != BCH_BSET_VERSION_OLD &&
@@ -691,7 +687,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                         BTREE_ERR_FIXABLE, c, ca, b, i,
                         "bset past end of btree node")) {
                i->u64s = 0;
-               return 0;
+               ret = 0;
+               goto out;
        }
 
        btree_err_on(offset && !i->u64s,
@@ -742,14 +739,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                        btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
                                     BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                                     "incorrect min_key: got %s should be %s",
-                                    (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
-                                    (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
+                                    (printbuf_reset(&buf1),
+                                     bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+                                    (printbuf_reset(&buf2),
+                                     bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
                }
 
                btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
                             BTREE_ERR_MUST_RETRY, c, ca, b, i,
                             "incorrect max key %s",
-                            (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
+                            (printbuf_reset(&buf1),
+                             bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
 
                if (write)
                        compat_btree_node(b->c.level, b->c.btree_id, version,
@@ -764,16 +764,29 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                               BSET_BIG_ENDIAN(i), write,
                               &bn->format);
        }
+out:
 fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
        return ret;
 }
 
+static int bset_key_invalid(struct bch_fs *c, struct btree *b,
+                           struct bkey_s_c k,
+                           bool updated_range, int rw,
+                           struct printbuf *err)
+{
+       return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
+               (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+               (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+}
+
 static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-                        struct bset *i, unsigned *whiteout_u64s,
-                        int write, bool have_retry)
+                        struct bset *i, int write, bool have_retry)
 {
        unsigned version = le16_to_cpu(i->version);
        struct bkey_packed *k, *prev = NULL;
+       struct printbuf buf = PRINTBUF;
        bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
                BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        int ret = 0;
@@ -782,7 +795,6 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
             k != vstruct_last(i);) {
                struct bkey_s u;
                struct bkey tmp;
-               const char *invalid;
 
                if (btree_err_on(bkey_next(k) > vstruct_last(i),
                                 BTREE_ERR_FIXABLE, c, NULL, b, i,
@@ -808,15 +820,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 
                u = __bkey_disassemble(b, k, &tmp);
 
-               invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
-                       (!updated_range ?  bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
-                       (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
-               if (invalid) {
-                       char buf[160];
+               printbuf_reset(&buf);
+               if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
+                       printbuf_reset(&buf);
+                       prt_printf(&buf, "invalid bkey:  ");
+                       bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+                       prt_printf(&buf, "\n  ");
+                       bch2_bkey_val_to_text(&buf, c, u.s_c);
 
-                       bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
-                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
-                                 "invalid bkey: %s\n%s", invalid, buf);
+                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
@@ -830,18 +842,17 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                                    &b->format, k);
 
                if (prev && bkey_iter_cmp(b, prev, k) > 0) {
-                       char buf1[80];
-                       char buf2[80];
                        struct bkey up = bkey_unpack_key(b, prev);
 
-                       bch2_bkey_to_text(&PBUF(buf1), &up);
-                       bch2_bkey_to_text(&PBUF(buf2), u.k);
+                       printbuf_reset(&buf);
+                       prt_printf(&buf, "keys out of order: ");
+                       bch2_bkey_to_text(&buf, &up);
+                       prt_printf(&buf, " > ");
+                       bch2_bkey_to_text(&buf, u.k);
 
                        bch2_dump_bset(c, b, i, 0);
 
-                       if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
-                                     "keys out of order: %s > %s",
-                                     buf1, buf2)) {
+                       if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
                                i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                                memmove_u64s_down(k, bkey_next(k),
                                                  (u64 *) vstruct_end(i) - (u64 *) k);
@@ -853,6 +864,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                k = bkey_next(k);
        }
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -871,9 +883,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        unsigned u64s;
        unsigned blacklisted_written, nonblacklisted_written = 0;
        unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+       struct printbuf buf = PRINTBUF;
        int ret, retry_read = 0, write = READ;
 
        b->version_ondisk = U16_MAX;
+       /* We might get called multiple times on read retry: */
+       b->written = 0;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
        sort_iter_init(iter, b);
@@ -885,11 +900,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
        btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
                     BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
-                    "bad magic");
+                    "bad magic: want %llx, got %llx",
+                    bset_magic(c), le64_to_cpu(b->data->magic));
 
        btree_err_on(!b->data->keys.seq,
                     BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
-                    "bad btree header");
+                    "bad btree header: seq 0");
 
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
                struct bch_btree_ptr_v2 *bp =
@@ -902,7 +918,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        }
 
        while (b->written < (ptr_written ?: btree_sectors(c))) {
-               unsigned sectors, whiteout_u64s = 0;
+               unsigned sectors;
                struct nonce nonce;
                struct bch_csum csum;
                bool first = !b->written;
@@ -922,9 +938,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                     BTREE_ERR_WANT_RETRY, c, ca, b, i,
                                     "invalid checksum");
 
-                       bset_encrypt(c, i, b->written << 9);
+                       ret = bset_encrypt(c, i, b->written << 9);
+                       if (bch2_fs_fatal_err_on(ret, c,
+                                       "error decrypting btree node: %i", ret))
+                               goto fsck_err;
 
-                       btree_err_on(btree_node_is_extents(b) &&
+                       btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
                                     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
                                     BTREE_ERR_FATAL, c, NULL, b, NULL,
                                     "btree node does not have NEW_EXTENT_OVERWRITE set");
@@ -949,7 +968,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                     BTREE_ERR_WANT_RETRY, c, ca, b, i,
                                     "invalid checksum");
 
-                       bset_encrypt(c, i, b->written << 9);
+                       ret = bset_encrypt(c, i, b->written << 9);
+                       if (bch2_fs_fatal_err_on(ret, c,
+                                       "error decrypting btree node: %i", ret))
+                               goto fsck_err;
 
                        sectors = vstruct_sectors(bne, c->block_bits);
                }
@@ -965,8 +987,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                if (!b->written)
                        btree_node_set_format(b, b->data->format);
 
-               ret = validate_bset_keys(c, b, i, &whiteout_u64s,
-                                   READ, have_retry);
+               ret = validate_bset_keys(c, b, i, READ, have_retry);
                if (ret)
                        goto fsck_err;
 
@@ -992,11 +1013,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                if (blacklisted && !first)
                        continue;
 
-               sort_iter_add(iter, i->start,
-                             vstruct_idx(i, whiteout_u64s));
-
                sort_iter_add(iter,
-                             vstruct_idx(i, whiteout_u64s),
+                             vstruct_idx(i, 0),
                              vstruct_last(i));
 
                nonblacklisted_written = b->written;
@@ -1056,16 +1074,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        for (k = i->start; k != vstruct_last(i);) {
                struct bkey tmp;
                struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-               const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
 
-               if (invalid ||
+               printbuf_reset(&buf);
+
+               if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
                    (bch2_inject_invalid_keys &&
                     !bversion_cmp(u.k->version, MAX_VERSION))) {
-                       char buf[160];
+                       printbuf_reset(&buf);
+
+                       prt_printf(&buf, "invalid bkey: ");
+                       bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+                       prt_printf(&buf, "\n  ");
+                       bch2_bkey_val_to_text(&buf, c, u.s_c);
 
-                       bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
-                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
-                                 "invalid bkey %s: %s", buf, invalid);
+                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
 
                        btree_keys_account_key_drop(&b->nr, 0, k);
 
@@ -1102,6 +1124,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                set_btree_node_need_rewrite(b);
 out:
        mempool_free(iter, &c->fill_iter);
+       printbuf_exit(&buf);
        return retry_read;
 fsck_err:
        if (ret == BTREE_RETRY_READ) {
@@ -1122,18 +1145,18 @@ static void btree_node_read_work(struct work_struct *work)
        struct bch_dev *ca      = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
        struct bio *bio         = &rb->bio;
        struct bch_io_failures failed = { .nr = 0 };
-       char buf[200];
-       struct printbuf out;
+       struct printbuf buf = PRINTBUF;
        bool saw_error = false;
+       bool retry = false;
        bool can_retry;
 
        goto start;
        while (1) {
+               retry = true;
                bch_info(c, "retrying read");
                ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
                rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
-               bio_reset(bio);
-               bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
+               bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
                bio->bi_iter.bi_size    = btree_bytes(c);
 
@@ -1144,10 +1167,10 @@ static void btree_node_read_work(struct work_struct *work)
                        bio->bi_status = BLK_STS_REMOVED;
                }
 start:
-               out = PBUF(buf);
-               btree_pos_to_text(&out, c, b);
+               printbuf_reset(&buf);
+               btree_pos_to_text(&buf, c, b);
                bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
-                                  bch2_blk_status_to_str(bio->bi_status), buf);
+                                  bch2_blk_status_to_str(bio->bi_status), buf.buf);
                if (rb->have_ioref)
                        percpu_ref_put(&ca->io_ref);
                rb->have_ioref = false;
@@ -1159,8 +1182,11 @@ start:
                                &failed, &rb->pick) > 0;
 
                if (!bio->bi_status &&
-                   !bch2_btree_node_read_done(c, ca, b, can_retry))
+                   !bch2_btree_node_read_done(c, ca, b, can_retry)) {
+                       if (retry)
+                               bch_info(c, "retry success");
                        break;
+               }
 
                saw_error = true;
 
@@ -1173,6 +1199,7 @@ start:
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
                               rb->start_time);
        bio_put(&rb->bio);
+       printbuf_exit(&buf);
 
        if (saw_error && !btree_node_read_error(b))
                bch2_btree_node_rewrite_async(c, b);
@@ -1253,6 +1280,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
                container_of(cl, struct btree_node_read_all, cl);
        struct bch_fs *c = ra->c;
        struct btree *b = ra->b;
+       struct printbuf buf = PRINTBUF;
        bool dump_bset_maps = false;
        bool have_retry = false;
        int ret = 0, best = -1, write = READ;
@@ -1296,8 +1324,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
 fsck_err:
        if (dump_bset_maps) {
                for (i = 0; i < ra->nr; i++) {
-                       char buf[200];
-                       struct printbuf out = PBUF(buf);
                        struct btree_node *bn = ra->buf[i];
                        struct btree_node_entry *bne = NULL;
                        unsigned offset = 0, sectors;
@@ -1306,6 +1332,8 @@ fsck_err:
                        if (ra->err[i])
                                continue;
 
+                       printbuf_reset(&buf);
+
                        while (offset < btree_sectors(c)) {
                                if (!offset) {
                                        sectors = vstruct_sectors(bn, c->block_bits);
@@ -1316,10 +1344,10 @@ fsck_err:
                                        sectors = vstruct_sectors(bne, c->block_bits);
                                }
 
-                               pr_buf(&out, " %u-%u", offset, offset + sectors);
+                               prt_printf(&buf, " %u-%u", offset, offset + sectors);
                                if (bne && bch2_journal_seq_is_blacklisted(c,
                                                        le64_to_cpu(bne->keys.journal_seq), false))
-                                       pr_buf(&out, "*");
+                                       prt_printf(&buf, "*");
                                offset += sectors;
                        }
 
@@ -1327,19 +1355,19 @@ fsck_err:
                                bne = ra->buf[i] + (offset << 9);
                                if (bne->keys.seq == bn->keys.seq) {
                                        if (!gap)
-                                               pr_buf(&out, " GAP");
+                                               prt_printf(&buf, " GAP");
                                        gap = true;
 
                                        sectors = vstruct_sectors(bne, c->block_bits);
-                                       pr_buf(&out, " %u-%u", offset, offset + sectors);
+                                       prt_printf(&buf, " %u-%u", offset, offset + sectors);
                                        if (bch2_journal_seq_is_blacklisted(c,
                                                        le64_to_cpu(bne->keys.journal_seq), false))
-                                               pr_buf(&out, "*");
+                                               prt_printf(&buf, "*");
                                }
                                offset++;
                        }
 
-                       bch_err(c, "replica %u:%s", i, buf);
+                       bch_err(c, "replica %u:%s", i, buf.buf);
                }
        }
 
@@ -1360,6 +1388,7 @@ fsck_err:
 
        closure_debug_destroy(&ra->cl);
        kfree(ra);
+       printbuf_exit(&buf);
 
        clear_btree_node_read_in_flight(b);
        wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1405,8 +1434,10 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
 
        for (i = 0; i < ra->nr; i++) {
                ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
-               ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i],
-                                                                 btree_bytes(c)),
+               ra->bio[i] = bio_alloc_bioset(NULL,
+                                             buf_pages(ra->buf[i], btree_bytes(c)),
+                                             REQ_OP_READ|REQ_SYNC|REQ_META,
+                                             GFP_NOFS,
                                              &c->btree_bio);
        }
 
@@ -1422,7 +1453,6 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
                rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
                rb->idx                 = i;
                rb->pick                = pick;
-               rb->bio.bi_opf          = REQ_OP_READ|REQ_SYNC|REQ_META;
                rb->bio.bi_iter.bi_sector = pick.ptr.offset;
                rb->bio.bi_end_io       = btree_node_read_all_replicas_endio;
                bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
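
The bio setup changes in these hunks track the newer block-layer allocation interface, where the request op flags (and target block device) are supplied at allocation/reset time instead of being assigned to bi_opf afterwards. A minimal before/after sketch, assuming the five-argument bio_alloc_bioset() and three-argument bio_reset() of recent kernels; nr_vecs stands in for the buf_pages(...) expression used above:

	/* old: allocate, then set the op flags by hand */
	bio = bio_alloc_bioset(GFP_NOFS, nr_vecs, &c->btree_bio);
	bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;

	/* new: bdev (NULL here), vector count, op flags, gfp, bio_set */
	bio = bio_alloc_bioset(NULL, nr_vecs,
			       REQ_OP_READ|REQ_SYNC|REQ_META,
			       GFP_NOFS, &c->btree_bio);

	/* bio_reset() gains the same bdev + op-flags arguments */
	bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
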
@@ -1459,11 +1489,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
        struct btree_read_bio *rb;
        struct bch_dev *ca;
        struct bio *bio;
-       char buf[200];
        int ret;
 
-       btree_pos_to_text(&PBUF(buf), c, b);
-       trace_btree_read(c, b);
+       trace_and_count(c, btree_node_read, c, b);
 
        if (bch2_verify_all_btree_replicas &&
            !btree_node_read_all_replicas(c, b, sync))
@@ -1471,17 +1499,30 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 
        ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
                                         NULL, &pick);
-       if (bch2_fs_fatal_err_on(ret <= 0, c,
-                       "btree node read error: no device to read from\n"
-                       " at %s", buf)) {
+
+       if (ret <= 0) {
+               struct printbuf buf = PRINTBUF;
+
+               prt_str(&buf, "btree node read error: no device to read from\n at ");
+               btree_pos_to_text(&buf, c, b);
+               bch_err(c, "%s", buf.buf);
+
+               if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags))
+                       bch2_fatal_error(c);
+
                set_btree_node_read_error(b);
+               clear_btree_node_read_in_flight(b);
+               wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+               printbuf_exit(&buf);
                return;
        }
 
        ca = bch_dev_bkey_exists(c, pick.ptr.dev);
 
-       bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data,
-                                                  btree_bytes(c)),
+       bio = bio_alloc_bioset(NULL,
+                              buf_pages(b->data, btree_bytes(c)),
+                              REQ_OP_READ|REQ_SYNC|REQ_META,
+                              GFP_NOIO,
                               &c->btree_bio);
        rb = container_of(bio, struct btree_read_bio, bio);
        rb->c                   = c;
@@ -1491,7 +1532,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
        rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
        rb->pick                = pick;
        INIT_WORK(&rb->work, btree_node_read_work);
-       bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
        bio->bi_iter.bi_sector  = pick.ptr.offset;
        bio->bi_end_io          = btree_node_read_endio;
        bch2_bio_map(bio, b->data, btree_bytes(c));
@@ -1532,7 +1572,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
                closure_sync(&cl);
        } while (ret);
 
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, level != 0);
        bch2_btree_cache_cannibalize_unlock(c);
 
        BUG_ON(IS_ERR(b));
@@ -1582,29 +1622,13 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
        bch2_journal_pin_drop(&c->journal, &w->journal);
 }
 
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
        struct btree_write *w = btree_prev_write(b);
        unsigned long old, new, v;
 
        bch2_btree_complete_write(c, b, w);
 
-       v = READ_ONCE(b->flags);
-       do {
-               old = new = v;
-
-               if (old & (1U << BTREE_NODE_need_write))
-                       goto do_write;
-
-               new &= ~(1U << BTREE_NODE_write_in_flight);
-               new &= ~(1U << BTREE_NODE_write_in_flight_inner);
-       } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
-       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-       return;
-
-do_write:
-       six_lock_read(&b->c.lock, NULL, NULL);
        v = READ_ONCE(b->flags);
        do {
                old = new = v;
@@ -1612,7 +1636,8 @@ do_write:
                if ((old & (1U << BTREE_NODE_dirty)) &&
                    (old & (1U << BTREE_NODE_need_write)) &&
                    !(old & (1U << BTREE_NODE_never_write)) &&
-                   btree_node_may_write(b)) {
+                   !(old & (1U << BTREE_NODE_write_blocked)) &&
+                   !(old & (1U << BTREE_NODE_will_make_reachable))) {
                        new &= ~(1U << BTREE_NODE_dirty);
                        new &= ~(1U << BTREE_NODE_need_write);
                        new |=  (1U << BTREE_NODE_write_in_flight);
@@ -1626,9 +1651,22 @@ do_write:
        } while ((v = cmpxchg(&b->flags, old, new)) != old);
 
        if (new & (1U << BTREE_NODE_write_in_flight))
-               __bch2_btree_node_write(c, b, true);
+               __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
+       else
+               wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+       struct btree_trans trans;
 
+       bch2_trans_init(&trans, c, 0, 0);
+
+       btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+       __btree_node_write_done(c, b);
        six_unlock_read(&b->c.lock);
+
+       bch2_trans_exit(&trans);
 }
 
 static void btree_node_write_work(struct work_struct *work)
@@ -1712,13 +1750,19 @@ static void btree_node_write_endio(struct bio *bio)
 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
                                   struct bset *i, unsigned sectors)
 {
-       unsigned whiteout_u64s = 0;
+       struct printbuf buf = PRINTBUF;
        int ret;
 
-       if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
-               return -1;
+       ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
+                               BKEY_TYPE_btree, WRITE, &buf);
+
+       if (ret)
+               bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+       printbuf_exit(&buf);
+       if (ret)
+               return ret;
 
-       ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+       ret = validate_bset_keys(c, b, i, WRITE, false) ?:
                validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false);
        if (ret) {
                bch2_inconsistent_error(c);
@@ -1742,7 +1786,7 @@ static void btree_write_submit(struct work_struct *work)
        bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
 }
 
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
 {
        struct btree_write_bio *wbio;
        struct bset_tree *t;
@@ -1757,13 +1801,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
        unsigned long old, new;
        bool validate_before_checksum = false;
        void *data;
+       int ret;
 
-       if (already_started)
+       if (flags & BTREE_WRITE_ALREADY_STARTED)
                goto do_write;
 
-       if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
-               return;
-
        /*
         * We may only have a read lock on the btree node - the dirty bit is our
         * "lock" against racing with other threads that may be trying to start
@@ -1777,13 +1819,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
                if (!(old & (1 << BTREE_NODE_dirty)))
                        return;
 
-               if (!btree_node_may_write(b))
+               if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+                   !(old & (1 << BTREE_NODE_need_write)))
+                       return;
+
+               if (old &
+                   ((1 << BTREE_NODE_never_write)|
+                    (1 << BTREE_NODE_write_blocked)))
                        return;
 
-               if (old & (1 << BTREE_NODE_never_write))
+               if (b->written &&
+                   (old & (1 << BTREE_NODE_will_make_reachable)))
                        return;
 
-               BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
+               if (old & (1 << BTREE_NODE_write_in_flight))
+                       return;
 
                new &= ~(1 << BTREE_NODE_dirty);
                new &= ~(1 << BTREE_NODE_need_write);
@@ -1863,6 +1913,8 @@ do_write:
        u64s = bch2_sort_keys(i->start, &sort_iter, false);
        le16_add_cpu(&i->u64s, u64s);
 
+       BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
+
        set_needs_whiteout(i, false);
 
        /* do we have data to write? */
@@ -1872,6 +1924,10 @@ do_write:
        bytes_to_write = vstruct_end(i) - data;
        sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
 
+       if (!b->written &&
+           b->key.k.type == KEY_TYPE_btree_ptr_v2)
+               BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+
        memset(data + bytes_to_write, 0,
               (sectors_to_write << 9) - bytes_to_write);
 
@@ -1879,7 +1935,7 @@ do_write:
        BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
        BUG_ON(i->seq != b->data->keys.seq);
 
-       i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+       i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
                ? cpu_to_le16(BCH_BSET_VERSION_OLD)
                : cpu_to_le16(c->sb.version);
        SET_BSET_OFFSET(i, b->written);
@@ -1897,7 +1953,10 @@ do_write:
            validate_bset_for_write(c, b, i, sectors_to_write))
                goto err;
 
-       bset_encrypt(c, i, b->written << 9);
+       ret = bset_encrypt(c, i, b->written << 9);
+       if (bch2_fs_fatal_err_on(ret, c,
+                       "error encrypting btree node: %i\n", ret))
+               goto err;
 
        nonce = btree_nonce(i, b->written << 9);
 
@@ -1933,10 +1992,12 @@ do_write:
            c->opts.nochanges)
                goto err;
 
-       trace_btree_write(b, bytes_to_write, sectors_to_write);
+       trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
 
-       wbio = container_of(bio_alloc_bioset(GFP_NOIO,
+       wbio = container_of(bio_alloc_bioset(NULL,
                                buf_pages(data, sectors_to_write << 9),
+                               REQ_OP_WRITE|REQ_META,
+                               GFP_NOIO,
                                &c->btree_bio),
                            struct btree_write_bio, wbio.bio);
        wbio_init(&wbio->wbio.bio);
@@ -1946,7 +2007,6 @@ do_write:
        wbio->wbio.c                    = c;
        wbio->wbio.used_mempool         = used_mempool;
        wbio->wbio.first_btree_write    = !b->written;
-       wbio->wbio.bio.bi_opf           = REQ_OP_WRITE|REQ_META;
        wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
        wbio->wbio.bio.bi_private       = b;
 
@@ -1956,11 +2016,6 @@ do_write:
 
        b->written += sectors_to_write;
 
-       if (wbio->wbio.first_btree_write &&
-           b->key.k.type == KEY_TYPE_btree_ptr_v2)
-               bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
-                       cpu_to_le16(b->written);
-
        if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
                bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
                        cpu_to_le16(b->written);
@@ -1973,14 +2028,10 @@ do_write:
        return;
 err:
        set_btree_node_noevict(b);
-       if (!b->written &&
-           b->key.k.type == KEY_TYPE_btree_ptr_v2)
-               bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
-                       cpu_to_le16(sectors_to_write);
        b->written += sectors_to_write;
 nowrite:
        btree_bounce_free(c, bytes, used_mempool, data);
-       btree_node_write_done(c, b);
+       __btree_node_write_done(c, b);
 }
 
 /*
@@ -2043,12 +2094,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
  * Use this one if the node is intent locked:
  */
 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
-                          enum six_lock_type lock_type_held)
+                          enum six_lock_type lock_type_held,
+                          unsigned flags)
 {
        if (lock_type_held == SIX_LOCK_intent ||
            (lock_type_held == SIX_LOCK_read &&
             six_lock_tryupgrade(&b->c.lock))) {
-               __bch2_btree_node_write(c, b, false);
+               __bch2_btree_node_write(c, b, flags);
 
                /* don't cycle lock unnecessarily: */
                if (btree_node_just_written(b) &&
@@ -2060,64 +2112,40 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (lock_type_held == SIX_LOCK_read)
                        six_lock_downgrade(&b->c.lock);
        } else {
-               __bch2_btree_node_write(c, b, false);
+               __bch2_btree_node_write(c, b, flags);
                if (lock_type_held == SIX_LOCK_write &&
                    btree_node_just_written(b))
                        bch2_btree_post_write_cleanup(c, b);
        }
 }
 
-static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
 {
        struct bucket_table *tbl;
        struct rhash_head *pos;
        struct btree *b;
        unsigned i;
+       bool ret = false;
 restart:
        rcu_read_lock();
        for_each_cached_btree(b, c, tbl, i, pos)
                if (test_bit(flag, &b->flags)) {
                        rcu_read_unlock();
                        wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+                       ret = true;
                        goto restart;
-
                }
        rcu_read_unlock();
-}
 
-void bch2_btree_flush_all_reads(struct bch_fs *c)
-{
-       __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+       return ret;
 }
 
-void bch2_btree_flush_all_writes(struct bch_fs *c)
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
 {
-       __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+       return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
 }
 
-void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
 {
-       struct bucket_table *tbl;
-       struct rhash_head *pos;
-       struct btree *b;
-       unsigned i;
-
-       rcu_read_lock();
-       for_each_cached_btree(b, c, tbl, i, pos) {
-               unsigned long flags = READ_ONCE(b->flags);
-
-               if (!(flags & (1 << BTREE_NODE_dirty)))
-                       continue;
-
-               pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
-                      b,
-                      (flags & (1 << BTREE_NODE_dirty)) != 0,
-                      (flags & (1 << BTREE_NODE_need_write)) != 0,
-                      b->c.level,
-                      b->written,
-                      !list_empty_careful(&b->write_blocked),
-                      b->will_make_reachable != 0,
-                      b->will_make_reachable & 1);
-       }
-       rcu_read_unlock();
+       return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
 }
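
A pattern that recurs throughout the btree_io.c changes above: fixed-size on-stack buffers written through PBUF() are replaced with heap-backed printbufs that grow as needed and must be freed explicitly. A minimal sketch of the new usage, using only helpers that appear in the hunks above (c and b as in those hunks):

	struct printbuf buf = PRINTBUF;		/* empty; allocates on first append */

	btree_pos_to_text(&buf, c, b);		/* *_to_text helpers append to the printbuf */
	prt_printf(&buf, " level %u", b->c.level);
	bch_err(c, "%s", buf.buf);

	printbuf_reset(&buf);			/* clear and reuse the same buffer */
	prt_printf(&buf, "second message");
	bch_err(c, "%s", buf.buf);

	printbuf_exit(&buf);			/* must be freed on every exit path */
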
index 0f20224e2a77cec3070850226ea52cd45ebb3695..8af853642123df33276aad4cf1bad547001e7e6a 100644 (file)
@@ -15,18 +15,13 @@ struct btree;
 struct btree_iter;
 struct btree_node_read_all;
 
-static inline bool btree_node_dirty(struct btree *b)
-{
-       return test_bit(BTREE_NODE_dirty, &b->flags);
-}
-
-static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
 {
        if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
                atomic_inc(&c->btree_cache.dirty);
 }
 
-static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
 {
        if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
                atomic_dec(&c->btree_cache.dirty);
@@ -67,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *);
 void bch2_btree_node_wait_on_read(struct btree *);
 void bch2_btree_node_wait_on_write(struct btree *);
 
-static inline bool btree_node_may_write(struct btree *b)
-{
-       return list_empty_careful(&b->write_blocked) &&
-               (!b->written || !b->will_make_reachable);
-}
-
 enum compact_mode {
        COMPACT_LAZY,
        COMPACT_ALL,
@@ -111,22 +100,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
        }};
 }
 
-static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
 {
        struct nonce nonce = btree_nonce(i, offset);
+       int ret;
 
        if (!offset) {
                struct btree_node *bn = container_of(i, struct btree_node, keys);
                unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
 
-               bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
-                            bytes);
+               ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+                                  &bn->flags, bytes);
+               if (ret)
+                       return ret;
 
                nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
        }
 
-       bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
-                    vstruct_end(i) - (void *) i->_data);
+       return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+                           vstruct_end(i) - (void *) i->_data);
 }
 
 void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
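
bset_encrypt() serves both directions (bch2_encrypt() applies a ChaCha20-based stream cipher, so encrypt and decrypt are the same operation) and now returns the error from bch2_encrypt() instead of swallowing it. The callers in btree_io.c above treat a non-zero return as fatal; a sketch of that caller pattern, with names as in the read/write paths above:

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"error encrypting btree node: %i\n", ret))
		goto err;
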
@@ -145,41 +137,23 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
 
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
+#define BTREE_WRITE_ONLY_IF_NEED       (1U << 0)
+#define BTREE_WRITE_ALREADY_STARTED    (1U << 1)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
-                         enum six_lock_type);
+                          enum six_lock_type, unsigned);
 
 static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
                                            enum six_lock_type lock_held)
 {
-       if (b->written &&
-           btree_node_need_write(b) &&
-           btree_node_may_write(b) &&
-           !btree_node_write_in_flight(b))
-               bch2_btree_node_write(c, b, lock_held);
+       bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
 }
 
-#define bch2_btree_node_write_cond(_c, _b, cond)                       \
-do {                                                                   \
-       unsigned long old, new, v = READ_ONCE((_b)->flags);             \
-                                                                       \
-       do {                                                            \
-               old = new = v;                                          \
-                                                                       \
-               if (!(old & (1 << BTREE_NODE_dirty)) || !(cond))        \
-                       break;                                          \
-                                                                       \
-               new |= (1 << BTREE_NODE_need_write);                    \
-       } while ((v = cmpxchg(&(_b)->flags, old, new)) != old);         \
-                                                                       \
-       btree_node_write_if_need(_c, _b, SIX_LOCK_read);                \
-} while (0)
-
-void bch2_btree_flush_all_reads(struct bch_fs *);
-void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
                                  unsigned version, unsigned big_endian,
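
The boolean already_started parameter of __bch2_btree_node_write() becomes a small flags word: BTREE_WRITE_ONLY_IF_NEED makes the write a no-op unless the node is marked need_write, and BTREE_WRITE_ALREADY_STARTED is passed when finishing a write that the completion path itself restarted. A short example of how callers now look, mirroring btree_node_write_if_need() above:

	/* write the node out only if it actually needs it */
	bch2_btree_node_write(c, b, SIX_LOCK_read, BTREE_WRITE_ONLY_IF_NEED);

	/* unconditional write, when the caller has already decided to write */
	bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);

The flush helpers bch2_btree_flush_all_reads()/bch2_btree_flush_all_writes() now also return a bool indicating whether they actually had to wait on any in-flight node.
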
index ae63ecbc19548a66f8fd9396eca41e650dc02ec5..d18346a5d58d066bafd86509c2aeeb61d6ab41ed 100644 (file)
 #include "replicas.h"
 #include "subvolume.h"
 
+#include <linux/prandom.h>
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
 static void btree_trans_verify_sorted(struct btree_trans *);
-static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *,
+                                                  struct btree_path *, int);
 
 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
@@ -46,7 +49,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans)
        if (need_resched() || race_fault()) {
                bch2_trans_unlock(trans);
                schedule();
-               return bch2_trans_relock(trans) ? 0 : -EINTR;
+               return bch2_trans_relock(trans);
        } else {
                return 0;
        }
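
Error handling in the iterator code moves away from overloading -EINTR: transaction restarts get dedicated private error codes, and callers test them with bch2_err_matches() rather than comparing against a single errno. A hedged sketch of the convention as it appears in the hunks below (bch2_trans_relock() is assumed to return 0 or such a restart error in this release):

	if (need_resched()) {
		bch2_trans_unlock(trans);
		schedule();
		ret = bch2_trans_relock(trans);	/* 0 or a transaction-restart error */
	}

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;			/* propagate; the caller restarts the trans */
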
@@ -99,12 +102,6 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos
        return p;
 }
 
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
-       return l < BTREE_MAX_DEPTH &&
-               (unsigned long) path->l[l].b >= 128;
-}
-
 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
 {
        struct bpos pos = iter->pos;
@@ -135,432 +132,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
                !btree_path_pos_after_node(path, b);
 }
 
-/* Btree node locking: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
-                       struct btree_path *path, struct btree *b)
-{
-       bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
-{
-       struct btree_path *linked;
-       unsigned readers = 0;
-
-       trans_for_each_path(trans, linked)
-               if (linked->l[b->c.level].b == b &&
-                   btree_node_read_locked(linked, b->c.level))
-                       readers++;
-
-       /*
-        * Must drop our read locks before calling six_lock_write() -
-        * six_unlock() won't do wakeups until the reader count
-        * goes to 0, and it's safe because we have the node intent
-        * locked:
-        */
-       if (!b->c.lock.readers)
-               atomic64_sub(__SIX_VAL(read_lock, readers),
-                            &b->c.lock.state.counter);
-       else
-               this_cpu_sub(*b->c.lock.readers, readers);
-
-       six_lock_write(&b->c.lock, NULL, NULL);
-
-       if (!b->c.lock.readers)
-               atomic64_add(__SIX_VAL(read_lock, readers),
-                            &b->c.lock.state.counter);
-       else
-               this_cpu_add(*b->c.lock.readers, readers);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
-                             struct btree_path *path, unsigned level)
-{
-       struct btree *b = btree_path_node(path, level);
-       int want = __btree_lock_want(path, level);
-
-       if (!is_btree_node(path, level))
-               goto fail;
-
-       if (race_fault())
-               goto fail;
-
-       if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
-           (btree_node_lock_seq_matches(path, b, level) &&
-            btree_node_lock_increment(trans, b, level, want))) {
-               mark_btree_node_locked(path, level, want);
-               return true;
-       }
-fail:
-       trace_btree_node_relock_fail(trans->fn, _RET_IP_,
-                                    path->btree_id,
-                                    &path->pos,
-                                    (unsigned long) b,
-                                    path->l[level].lock_seq,
-                                    is_btree_node(path, level) ? b->c.lock.state.seq : 0);
-       return false;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
-                            struct btree_path *path, unsigned level)
-{
-       struct btree *b = path->l[level].b;
-
-       if (!is_btree_node(path, level))
-               return false;
-
-       switch (btree_lock_want(path, level)) {
-       case BTREE_NODE_UNLOCKED:
-               BUG_ON(btree_node_locked(path, level));
-               return true;
-       case BTREE_NODE_READ_LOCKED:
-               BUG_ON(btree_node_intent_locked(path, level));
-               return bch2_btree_node_relock(trans, path, level);
-       case BTREE_NODE_INTENT_LOCKED:
-               break;
-       }
-
-       if (btree_node_intent_locked(path, level))
-               return true;
-
-       if (race_fault())
-               return false;
-
-       if (btree_node_locked(path, level)
-           ? six_lock_tryupgrade(&b->c.lock)
-           : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
-               goto success;
-
-       if (btree_node_lock_seq_matches(path, b, level) &&
-           btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
-               btree_node_unlock(path, level);
-               goto success;
-       }
-
-       return false;
-success:
-       mark_btree_node_intent_locked(path, level);
-       return true;
-}
-
-static inline bool btree_path_get_locks(struct btree_trans *trans,
-                                       struct btree_path *path,
-                                       bool upgrade)
-{
-       unsigned l = path->level;
-       int fail_idx = -1;
-
-       do {
-               if (!btree_path_node(path, l))
-                       break;
-
-               if (!(upgrade
-                     ? bch2_btree_node_upgrade(trans, path, l)
-                     : bch2_btree_node_relock(trans, path, l)))
-                       fail_idx = l;
-
-               l++;
-       } while (l < path->locks_want);
-
-       /*
-        * When we fail to get a lock, we have to ensure that any child nodes
-        * can't be relocked so bch2_btree_path_traverse has to walk back up to
-        * the node that we failed to relock:
-        */
-       if (fail_idx >= 0) {
-               __bch2_btree_path_unlock(path);
-               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
-               do {
-                       path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
-                       --fail_idx;
-               } while (fail_idx >= 0);
-       }
-
-       if (path->uptodate == BTREE_ITER_NEED_RELOCK)
-               path->uptodate = BTREE_ITER_UPTODATE;
-
-       bch2_trans_verify_locks(trans);
-
-       return path->uptodate < BTREE_ITER_NEED_RELOCK;
-}
-
-static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
-                                 bool cached)
-{
-       return !cached
-               ? container_of(_b, struct btree, c)->key.k.p
-               : container_of(_b, struct bkey_cached, c)->key.pos;
-}
-
-/* Slowpath: */
-bool __bch2_btree_node_lock(struct btree_trans *trans,
-                           struct btree_path *path,
-                           struct btree *b,
-                           struct bpos pos, unsigned level,
-                           enum six_lock_type type,
-                           six_lock_should_sleep_fn should_sleep_fn, void *p,
-                           unsigned long ip)
-{
-       struct btree_path *linked;
-       unsigned reason;
-
-       /* Check if it's safe to block: */
-       trans_for_each_path(trans, linked) {
-               if (!linked->nodes_locked)
-                       continue;
-
-               /*
-                * Can't block taking an intent lock if we have _any_ nodes read
-                * locked:
-                *
-                * - Our read lock blocks another thread with an intent lock on
-                *   the same node from getting a write lock, and thus from
-                *   dropping its intent lock
-                *
-                * - And the other thread may have multiple nodes intent locked:
-                *   both the node we want to intent lock, and the node we
-                *   already have read locked - deadlock:
-                */
-               if (type == SIX_LOCK_intent &&
-                   linked->nodes_locked != linked->nodes_intent_locked) {
-                       reason = 1;
-                       goto deadlock;
-               }
-
-               if (linked->btree_id != path->btree_id) {
-                       if (linked->btree_id < path->btree_id)
-                               continue;
-
-                       reason = 3;
-                       goto deadlock;
-               }
-
-               /*
-                * Within the same btree, non-cached paths come before cached
-                * paths:
-                */
-               if (linked->cached != path->cached) {
-                       if (!linked->cached)
-                               continue;
-
-                       reason = 4;
-                       goto deadlock;
-               }
-
-               /*
-                * Interior nodes must be locked before their descendants: if
-                * another path has possible descendants locked of the node
-                * we're about to lock, it must have the ancestors locked too:
-                */
-               if (level > __fls(linked->nodes_locked)) {
-                       reason = 5;
-                       goto deadlock;
-               }
-
-               /* Must lock btree nodes in key order: */
-               if (btree_node_locked(linked, level) &&
-                   bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
-                                                linked->cached)) <= 0) {
-                       BUG_ON(trans->in_traverse_all);
-                       reason = 7;
-                       goto deadlock;
-               }
-       }
-
-       return btree_node_lock_type(trans, path, b, pos, level,
-                                   type, should_sleep_fn, p);
-deadlock:
-       trace_trans_restart_would_deadlock(trans->fn, ip,
-                       trans->in_traverse_all, reason,
-                       linked->btree_id,
-                       linked->cached,
-                       &linked->pos,
-                       path->btree_id,
-                       path->cached,
-                       &pos);
-       btree_trans_restart(trans);
-       return false;
-}
-
-/* Btree iterator locking: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_locks(struct btree_path *path)
-{
-       unsigned l;
-
-       if (!path->nodes_locked) {
-               BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
-                      btree_path_node(path, path->level));
-               return;
-       }
-
-       for (l = 0; btree_path_node(path, l); l++)
-               BUG_ON(btree_lock_want(path, l) !=
-                      btree_node_locked_type(path, l));
-}
-
-void bch2_trans_verify_locks(struct btree_trans *trans)
-{
-       struct btree_path *path;
-
-       trans_for_each_path(trans, path)
-               bch2_btree_path_verify_locks(path);
-}
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-#endif
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-bool bch2_btree_path_relock_intent(struct btree_trans *trans,
-                                  struct btree_path *path)
-{
-       unsigned l;
-
-       for (l = path->level;
-            l < path->locks_want && btree_path_node(path, l);
-            l++) {
-               if (!bch2_btree_node_relock(trans, path, l)) {
-                       __bch2_btree_path_unlock(path);
-                       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-                       trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
-                                                  path->btree_id, &path->pos);
-                       btree_trans_restart(trans);
-                       return false;
-               }
-       }
-
-       return true;
-}
-
-__flatten
-static bool bch2_btree_path_relock(struct btree_trans *trans,
-                       struct btree_path *path, unsigned long trace_ip)
-{
-       bool ret = btree_path_get_locks(trans, path, false);
-
-       if (!ret) {
-               trace_trans_restart_relock_path(trans->fn, trace_ip,
-                                               path->btree_id, &path->pos);
-               btree_trans_restart(trans);
-       }
-       return ret;
-}
-
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
-                              struct btree_path *path,
-                              unsigned new_locks_want)
-{
-       struct btree_path *linked;
-
-       EBUG_ON(path->locks_want >= new_locks_want);
-
-       path->locks_want = new_locks_want;
-
-       if (btree_path_get_locks(trans, path, true))
-               return true;
-
-       /*
-        * XXX: this is ugly - we'd prefer to not be mucking with other
-        * iterators in the btree_trans here.
-        *
-        * On failure to upgrade the iterator, setting iter->locks_want and
-        * calling get_locks() is sufficient to make bch2_btree_path_traverse()
-        * get the locks we want on transaction restart.
-        *
-        * But if this iterator was a clone, on transaction restart what we did
-        * to this iterator isn't going to be preserved.
-        *
-        * Possibly we could add an iterator field for the parent iterator when
-        * an iterator is a copy - for now, we'll just upgrade any other
-        * iterators with the same btree id.
-        *
-        * The code below used to be needed to ensure ancestor nodes get locked
-        * before interior nodes - now that's handled by
-        * bch2_btree_path_traverse_all().
-        */
-       trans_for_each_path(trans, linked)
-               if (linked != path &&
-                   linked->cached == path->cached &&
-                   linked->btree_id == path->btree_id &&
-                   linked->locks_want < new_locks_want) {
-                       linked->locks_want = new_locks_want;
-                       btree_path_get_locks(trans, linked, true);
-               }
-
-       return false;
-}
-
-void __bch2_btree_path_downgrade(struct btree_path *path,
-                                unsigned new_locks_want)
-{
-       unsigned l;
-
-       EBUG_ON(path->locks_want < new_locks_want);
-
-       path->locks_want = new_locks_want;
-
-       while (path->nodes_locked &&
-              (l = __fls(path->nodes_locked)) >= path->locks_want) {
-               if (l > path->level) {
-                       btree_node_unlock(path, l);
-               } else {
-                       if (btree_node_intent_locked(path, l)) {
-                               six_lock_downgrade(&path->l[l].b->c.lock);
-                               path->nodes_intent_locked ^= 1 << l;
-                       }
-                       break;
-               }
-       }
-
-       bch2_btree_path_verify_locks(path);
-}
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
-       struct btree_path *path;
-
-       trans_for_each_path(trans, path)
-               bch2_btree_path_downgrade(path);
-}
-
-/* Btree transaction locking: */
-
-bool bch2_trans_relock(struct btree_trans *trans)
-{
-       struct btree_path *path;
-
-       if (unlikely(trans->restarted))
-               return false;
-
-       trans_for_each_path(trans, path)
-               if (path->should_be_locked &&
-                   !bch2_btree_path_relock(trans, path, _RET_IP_)) {
-                       trace_trans_restart_relock(trans->fn, _RET_IP_,
-                                       path->btree_id, &path->pos);
-                       BUG_ON(!trans->restarted);
-                       return false;
-               }
-       return true;
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
-       struct btree_path *path;
-
-       trans_for_each_path(trans, path)
-               __bch2_btree_path_unlock(path);
-
-       BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
-}
-
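
The node- and path-locking primitives removed above are not dropped from the filesystem: later hunks call the same helpers with the btree_trans threaded through explicitly (for instance btree_node_unlock(trans, path, level) and mark_btree_node_locked(trans, path, level, type)), consistent with this code having been split out into a dedicated btree_locking unit in this release. A hedged fragment showing only the new call shape, modelled on the prefetch paths further down:

	bool was_locked = btree_node_locked(path, level);

	if (!was_locked && !bch2_btree_node_relock(trans, path, level))
		return;				/* caller decides how to restart */

	/* ... use path->l[level].b ... */

	if (!was_locked)
		btree_node_unlock(trans, path, level);	/* unlock now takes the trans too */
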
 /* Btree iterator: */
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -579,7 +150,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
               bkey_cmp(ck->key.pos, path->pos));
 
        if (!locked)
-               btree_node_unlock(path, 0);
+               btree_node_unlock(trans, path, 0);
 }
 
 static void bch2_btree_path_verify_level(struct btree_trans *trans,
@@ -589,7 +160,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
        struct btree_node_iter tmp;
        bool locked;
        struct bkey_packed *p, *k;
-       char buf1[100], buf2[100], buf3[100];
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
+       struct printbuf buf3 = PRINTBUF;
        const char *msg;
 
        if (!bch2_debug_check_iterators)
@@ -608,7 +181,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
        if (!btree_path_node(path, level))
                return;
 
-       if (!bch2_btree_node_relock(trans, path, level))
+       if (!bch2_btree_node_relock_notrace(trans, path, level))
                return;
 
        BUG_ON(!btree_path_pos_in_node(path, l->b));
@@ -634,29 +207,30 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
        }
 
        if (!locked)
-               btree_node_unlock(path, level);
+               btree_node_unlock(trans, path, level);
        return;
 err:
-       strcpy(buf2, "(none)");
-       strcpy(buf3, "(none)");
-
-       bch2_bpos_to_text(&PBUF(buf1), path->pos);
+       bch2_bpos_to_text(&buf1, path->pos);
 
        if (p) {
                struct bkey uk = bkey_unpack_key(l->b, p);
-               bch2_bkey_to_text(&PBUF(buf2), &uk);
+               bch2_bkey_to_text(&buf2, &uk);
+       } else {
+               prt_printf(&buf2, "(none)");
        }
 
        if (k) {
                struct bkey uk = bkey_unpack_key(l->b, k);
-               bch2_bkey_to_text(&PBUF(buf3), &uk);
+               bch2_bkey_to_text(&buf3, &uk);
+       } else {
+               prt_printf(&buf3, "(none)");
        }
 
        panic("path should be %s key at level %u:\n"
              "path pos %s\n"
              "prev key %s\n"
              "cur  key %s\n",
-             msg, level, buf1, buf2, buf3);
+             msg, level, buf1.buf, buf2.buf, buf3.buf);
 }
 
 static void bch2_btree_path_verify(struct btree_trans *trans,
@@ -754,16 +328,16 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
        if (!bkey_cmp(prev.k->p, k.k->p) &&
            bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
                                      prev.k->p.snapshot) > 0) {
-               char buf1[100], buf2[200];
+               struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
 
-               bch2_bkey_to_text(&PBUF(buf1), k.k);
-               bch2_bkey_to_text(&PBUF(buf2), prev.k);
+               bch2_bkey_to_text(&buf1, k.k);
+               bch2_bkey_to_text(&buf2, prev.k);
 
                panic("iter snap %u\n"
                      "k    %s\n"
                      "prev %s\n",
                      iter->snapshot,
-                     buf1, buf2);
+                     buf1.buf, buf2.buf);
        }
 out:
        bch2_trans_iter_exit(trans, &copy);
@@ -775,7 +349,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
 {
        struct btree_path *path;
        unsigned idx;
-       char buf[100];
+       struct printbuf buf = PRINTBUF;
 
        trans_for_each_path_inorder(trans, path, idx) {
                int cmp = cmp_int(path->btree_id, id) ?:
@@ -786,7 +360,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
                if (cmp < 0)
                        continue;
 
-               if (!(path->nodes_locked & 1) ||
+               if (!btree_node_locked(path, 0) ||
                    !path->should_be_locked)
                        continue;
 
@@ -801,9 +375,10 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
        }
 
        bch2_dump_trans_paths_updates(trans);
+       bch2_bpos_to_text(&buf, pos);
+
        panic("not locked: %s %s%s\n",
-             bch2_btree_ids[id],
-             (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+             bch2_btree_ids[id], buf.buf,
              key_cache ? " cached" : "");
 }
 
@@ -1009,27 +584,29 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
                        bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
                                                    struct btree_path *path,
                                                    struct btree_path_level *l,
                                                    struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(c, l, u,
+       struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
        path->pos = k.k ? k.k->p : l->b->key.k.p;
+       bch2_btree_path_verify_level(trans, path, l - path->l);
        return k;
 }
 
-static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
                                                    struct btree_path *path,
                                                    struct btree_path_level *l,
                                                    struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(c, l, u,
+       struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
                        bch2_btree_node_iter_prev(&l->iter, l->b));
 
        path->pos = k.k ? k.k->p : l->b->data->min_key;
+       bch2_btree_path_verify_level(trans, path, l - path->l);
        return k;
 }
 
@@ -1052,61 +629,6 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
        return true;
 }
 
-/*
- * Verify that iterator for parent node points to child node:
- */
-static void btree_path_verify_new_node(struct btree_trans *trans,
-                                      struct btree_path *path, struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_path_level *l;
-       unsigned plevel;
-       bool parent_locked;
-       struct bkey_packed *k;
-
-       if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
-               return;
-
-       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
-               return;
-
-       plevel = b->c.level + 1;
-       if (!btree_path_node(path, plevel))
-               return;
-
-       parent_locked = btree_node_locked(path, plevel);
-
-       if (!bch2_btree_node_relock(trans, path, plevel))
-               return;
-
-       l = &path->l[plevel];
-       k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-       if (!k ||
-           bkey_deleted(k) ||
-           bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
-               char buf1[100];
-               char buf2[100];
-               char buf3[100];
-               char buf4[100];
-               struct bkey uk = bkey_unpack_key(b, k);
-
-               bch2_dump_btree_node(c, l->b);
-               bch2_bpos_to_text(&PBUF(buf1), path->pos);
-               bch2_bkey_to_text(&PBUF(buf2), &uk);
-               bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
-               bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
-               panic("parent iter doesn't point to new node:\n"
-                     "iter pos %s %s\n"
-                     "iter key %s\n"
-                     "new node %s-%s\n",
-                     bch2_btree_ids[path->btree_id], buf1,
-                     buf2, buf3, buf4);
-       }
-
-       if (!parent_locked)
-               btree_node_unlock(path, plevel);
-}
-
 static inline void __btree_path_level_init(struct btree_path *path,
                                           unsigned level)
 {
@@ -1122,14 +644,12 @@ static inline void __btree_path_level_init(struct btree_path *path,
                bch2_btree_node_iter_peek(&l->iter, l->b);
 }
 
-static inline void btree_path_level_init(struct btree_trans *trans,
-                                        struct btree_path *path,
-                                        struct btree *b)
+inline void bch2_btree_path_level_init(struct btree_trans *trans,
+                                      struct btree_path *path,
+                                      struct btree *b)
 {
        BUG_ON(path->cached);
 
-       btree_path_verify_new_node(trans, path, b);
-
        EBUG_ON(!btree_path_pos_in_node(path, b));
        EBUG_ON(b->c.lock.state.seq & 1);
 
@@ -1149,19 +669,19 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
        struct btree_path *path;
 
        trans_for_each_path(trans, path)
-               if (!path->cached &&
+               if (path->uptodate == BTREE_ITER_UPTODATE &&
+                   !path->cached &&
                    btree_path_pos_in_node(path, b)) {
                        enum btree_node_locked_type t =
                                btree_lock_want(path, b->c.level);
 
-                       if (path->nodes_locked &&
-                           t != BTREE_NODE_UNLOCKED) {
-                               btree_node_unlock(path, b->c.level);
+                       if (t != BTREE_NODE_UNLOCKED) {
+                               btree_node_unlock(trans, path, b->c.level);
                                six_lock_increment(&b->c.lock, t);
-                               mark_btree_node_locked(path, b->c.level, t);
+                               mark_btree_node_locked(trans, path, b->c.level, t);
                        }
 
-                       btree_path_level_init(trans, path, b);
+                       bch2_btree_path_level_init(trans, path, b);
                }
 }
 
@@ -1179,14 +699,6 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
 
 /* Btree path: traverse, set_pos: */
 
-static int lock_root_check_fn(struct six_lock *lock, void *p)
-{
-       struct btree *b = container_of(lock, struct btree, c.lock);
-       struct btree **rootp = p;
-
-       return b == *rootp ? 0 : -1;
-}
-
 static inline int btree_path_lock_root(struct btree_trans *trans,
                                       struct btree_path *path,
                                       unsigned depth_want,
@@ -1196,6 +708,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
        struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
        enum six_lock_type lock_type;
        unsigned i;
+       int ret;
 
        EBUG_ON(path->nodes_locked);
 
@@ -1217,26 +730,27 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
                }
 
                lock_type = __btree_lock_want(path, path->level);
-               if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
-                                             path->level, lock_type,
-                                             lock_root_check_fn, rootp,
-                                             trace_ip))) {
-                       if (trans->restarted)
-                               return -EINTR;
-                       continue;
+               ret = btree_node_lock(trans, path, &b->c,
+                                     path->level, lock_type, trace_ip);
+               if (unlikely(ret)) {
+                       if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
+                               continue;
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                               return ret;
+                       BUG();
                }
 
                if (likely(b == READ_ONCE(*rootp) &&
                           b->c.level == path->level &&
                           !race_fault())) {
                        for (i = 0; i < path->level; i++)
-                               path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+                               path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
                        path->l[path->level].b = b;
                        for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
                                path->l[i].b = NULL;
 
-                       mark_btree_node_locked(path, path->level, lock_type);
-                       btree_path_level_init(trans, path, b);
+                       mark_btree_node_locked(trans, path, path->level, lock_type);
+                       bch2_btree_path_level_init(trans, path, b);
                        return 0;
                }
 
@@ -1260,7 +774,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
 
        bch2_bkey_buf_init(&tmp);
 
-       while (nr && !ret) {
+       while (nr-- && !ret) {
                if (!bch2_btree_node_relock(trans, path, path->level))
                        break;
 
@@ -1275,7 +789,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
        }
 
        if (!was_locked)
-               btree_node_unlock(path, path->level);
+               btree_node_unlock(trans, path, path->level);
 
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
@@ -1295,7 +809,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
 
        bch2_bkey_buf_init(&tmp);
 
-       while (nr && !ret) {
+       while (nr-- && !ret) {
                if (!bch2_btree_node_relock(trans, path, path->level))
                        break;
 
@@ -1310,7 +824,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
        }
 
        if (!was_locked)
-               btree_node_unlock(path, path->level);
+               btree_node_unlock(trans, path, path->level);
 
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
@@ -1335,7 +849,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
        bp->mem_ptr = (unsigned long)b;
 
        if (!locked)
-               btree_node_unlock(path, plevel);
+               btree_node_unlock(trans, path, plevel);
 }
 
 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
@@ -1400,16 +914,16 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        if (unlikely(ret))
                goto err;
 
-       mark_btree_node_locked(path, level, lock_type);
-       btree_path_level_init(trans, path, b);
-
        if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
            unlikely(b != btree_node_mem_ptr(tmp.k)))
                btree_node_mem_ptr_set(trans, path, level + 1, b);
 
        if (btree_node_read_locked(path, level + 1))
-               btree_node_unlock(path, level + 1);
+               btree_node_unlock(trans, path, level + 1);
+
+       mark_btree_node_locked(trans, path, level, lock_type);
        path->level = level;
+       bch2_btree_path_level_init(trans, path, b);
 
        bch2_btree_path_verify_locks(path);
 err:
@@ -1420,40 +934,30 @@ err:
 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
                                   unsigned, unsigned long);
 
-static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
-                                    unsigned long trace_ip)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
        struct btree_path *path;
-       int i;
+       unsigned long trace_ip = _RET_IP_;
+       int ret = 0;
 
        if (trans->in_traverse_all)
-               return -EINTR;
+               return -BCH_ERR_transaction_restart_in_traverse_all;
 
        trans->in_traverse_all = true;
 retry_all:
-       trans->restarted = false;
+       trans->restarted = 0;
+       trans->traverse_all_idx = U8_MAX;
 
        trans_for_each_path(trans, path)
                path->should_be_locked = false;
 
        btree_trans_verify_sorted(trans);
 
-       for (i = trans->nr_sorted - 2; i >= 0; --i) {
-               struct btree_path *path1 = trans->paths + trans->sorted[i];
-               struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
-
-               if (path1->btree_id == path2->btree_id &&
-                   path1->locks_want < path2->locks_want)
-                       __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
-               else if (!path1->locks_want && path2->locks_want)
-                       __bch2_btree_path_upgrade(trans, path1, 1);
-       }
-
        bch2_trans_unlock(trans);
        cond_resched();
 
-       if (unlikely(ret == -ENOMEM)) {
+       if (unlikely(trans->memory_allocation_failure)) {
                struct closure cl;
 
                closure_init_stack(&cl);
@@ -1464,15 +968,10 @@ retry_all:
                } while (ret);
        }
 
-       if (unlikely(ret == -EIO))
-               goto out;
-
-       BUG_ON(ret && ret != -EINTR);
-
        /* Now, redo traversals in correct order: */
-       i = 0;
-       while (i < trans->nr_sorted) {
-               path = trans->paths + trans->sorted[i];
+       trans->traverse_all_idx = 0;
+       while (trans->traverse_all_idx < trans->nr_sorted) {
+               path = trans->paths + trans->sorted[trans->traverse_all_idx];
 
                /*
                 * Traversing a path can cause another path to be added at about
@@ -1480,10 +979,14 @@ retry_all:
                 */
                if (path->uptodate) {
                        ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
-                       if (ret)
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+                           ret == -ENOMEM)
                                goto retry_all;
+                       if (ret)
+                               goto err;
+                       BUG_ON(path->uptodate);
                } else {
-                       i++;
+                       trans->traverse_all_idx++;
                }
        }
 
@@ -1494,62 +997,83 @@ retry_all:
         */
        trans_for_each_path(trans, path)
                BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
-out:
+err:
        bch2_btree_cache_cannibalize_unlock(c);
 
        trans->in_traverse_all = false;
 
-       trace_trans_traverse_all(trans->fn, trace_ip);
+       trace_and_count(c, trans_traverse_all, trans, trace_ip);
        return ret;
 }
 
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+                                               unsigned l, int check_pos)
 {
-       return __btree_path_traverse_all(trans, 0, _RET_IP_);
+       if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
+               return false;
+       if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
+               return false;
+       return true;
 }
 
 static inline bool btree_path_good_node(struct btree_trans *trans,
                                        struct btree_path *path,
                                        unsigned l, int check_pos)
 {
-       if (!is_btree_node(path, l) ||
-           !bch2_btree_node_relock(trans, path, l))
-               return false;
+       return is_btree_node(path, l) &&
+               bch2_btree_node_relock(trans, path, l) &&
+               btree_path_check_pos_in_node(path, l, check_pos);
+}
 
-       if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
-               return false;
-       if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
-               return false;
-       return true;
+static void btree_path_set_level_down(struct btree_trans *trans,
+                                     struct btree_path *path,
+                                     unsigned new_level)
+{
+       unsigned l;
+
+       path->level = new_level;
+
+       for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+               if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+                       btree_node_unlock(trans, path, l);
+
+       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+       bch2_btree_path_verify(trans, path);
 }
 
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
-                                                    struct btree_path *path,
-                                                    int check_pos)
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+                                                        struct btree_path *path,
+                                                        int check_pos)
 {
        unsigned i, l = path->level;
-
+again:
        while (btree_path_node(path, l) &&
-              !btree_path_good_node(trans, path, l, check_pos)) {
-               btree_node_unlock(path, l);
-               path->l[l].b = BTREE_ITER_NO_NODE_UP;
-               l++;
-       }
+              !btree_path_good_node(trans, path, l, check_pos))
+               __btree_path_set_level_up(trans, path, l++);
 
        /* If we need intent locks, take them too: */
        for (i = l + 1;
             i < path->locks_want && btree_path_node(path, i);
             i++)
-               if (!bch2_btree_node_relock(trans, path, i))
-                       while (l <= i) {
-                               btree_node_unlock(path, l);
-                               path->l[l].b = BTREE_ITER_NO_NODE_UP;
-                               l++;
-                       }
+               if (!bch2_btree_node_relock(trans, path, i)) {
+                       while (l <= i)
+                               __btree_path_set_level_up(trans, path, l++);
+                       goto again;
+               }
 
        return l;
 }
 
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+                                                    struct btree_path *path,
+                                                    int check_pos)
+{
+       return likely(btree_node_locked(path, path->level) &&
+                     btree_path_check_pos_in_node(path, path->level, check_pos))
+               ? path->level
+               : __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
 /*
  * This is the main state machine for walking down the btree - walks down to a
  * specified depth
@@ -1565,19 +1089,17 @@ static int btree_path_traverse_one(struct btree_trans *trans,
                                   unsigned long trace_ip)
 {
        unsigned depth_want = path->level;
-       int ret = 0;
+       int ret = trans->restarted;
 
-       if (unlikely(trans->restarted)) {
-               ret = -EINTR;
+       if (unlikely(ret))
                goto out;
-       }
 
        /*
         * Ensure we obey path->should_be_locked: if it's set, we can't unlock
         * and re-traverse the path without a transaction restart:
         */
        if (path->should_be_locked) {
-               ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
+               ret = bch2_btree_path_relock(trans, path, trace_ip);
                goto out;
        }
 
@@ -1591,6 +1113,9 @@ static int btree_path_traverse_one(struct btree_trans *trans,
 
        path->level = btree_path_up_until_good_node(trans, path, 0);
 
+       EBUG_ON(btree_path_node(path, path->level) &&
+               !btree_node_locked(path, path->level));
+
        /*
         * Note: path->nodes[path->level] may be temporarily NULL here - that
         * would indicate to other code that we got to the end of the btree,
@@ -1611,31 +1136,33 @@ static int btree_path_traverse_one(struct btree_trans *trans,
                                goto out;
                        }
 
-                       __bch2_btree_path_unlock(path);
+                       __bch2_btree_path_unlock(trans, path);
                        path->level = depth_want;
-
-                       if (ret == -EIO)
-                               path->l[path->level].b =
-                                       BTREE_ITER_NO_NODE_ERROR;
-                       else
-                               path->l[path->level].b =
-                                       BTREE_ITER_NO_NODE_DOWN;
+                       path->l[path->level].b = ERR_PTR(ret);
                        goto out;
                }
        }
 
        path->uptodate = BTREE_ITER_UPTODATE;
 out:
-       BUG_ON((ret == -EINTR) != !!trans->restarted);
+       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
        bch2_btree_path_verify(trans, path);
        return ret;
 }
 
-static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
-
 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
                                          struct btree_path *path, unsigned flags)
 {
+       if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
+               u64 mask = ~(~0ULL << restart_probability_bits);
+
+               if ((prandom_u32() & mask) == mask) {
+                       trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
+                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
+               }
+       }
+
        if (path->uptodate < BTREE_ITER_NEED_RELOCK)
                return 0;
 
@@ -1646,17 +1173,22 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
 static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
                            struct btree_path *src)
 {
-       unsigned i;
+       unsigned i, offset = offsetof(struct btree_path, pos);
+       int cmp = btree_path_cmp(dst, src);
 
-       memcpy(&dst->pos, &src->pos,
-              sizeof(struct btree_path) - offsetof(struct btree_path, pos));
+       memcpy((void *) dst + offset,
+              (void *) src + offset,
+              sizeof(struct btree_path) - offset);
 
-       for (i = 0; i < BTREE_MAX_DEPTH; i++)
-               if (btree_node_locked(dst, i))
-                       six_lock_increment(&dst->l[i].b->c.lock,
-                                          __btree_lock_want(dst, i));
+       for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+               unsigned t = btree_node_locked_type(dst, i);
 
-       btree_path_check_sort(trans, dst, 0);
+               if (t != BTREE_NODE_UNLOCKED)
+                       six_lock_increment(&dst->l[i].b->c.lock, t);
+       }
+
+       if (cmp)
+               bch2_btree_path_check_sort_fast(trans, dst, cmp);
 }
 
 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
@@ -1669,8 +1201,7 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr
        return new;
 }
 
-inline struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
                         struct btree_path *path, bool intent,
                         unsigned long ip)
 {
@@ -1684,6 +1215,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans,
                btree_trans_verify_sorted(trans);
        }
 
+       path->should_be_locked = false;
        return path;
 }
 
@@ -1703,14 +1235,13 @@ bch2_btree_path_set_pos(struct btree_trans *trans,
 
        path = bch2_btree_path_make_mut(trans, path, intent, ip);
 
-       path->pos               = new_pos;
-       path->should_be_locked  = false;
+       path->pos = new_pos;
 
-       btree_path_check_sort(trans, path, cmp);
+       bch2_btree_path_check_sort_fast(trans, path, cmp);
 
        if (unlikely(path->cached)) {
-               btree_node_unlock(path, 0);
-               path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+               btree_node_unlock(trans, path, 0);
+               path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
                btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
                goto out;
        }
@@ -1718,6 +1249,7 @@ bch2_btree_path_set_pos(struct btree_trans *trans,
        l = btree_path_up_until_good_node(trans, path, cmp);
 
        if (btree_path_node(path, l)) {
+               BUG_ON(!btree_node_locked(path, l));
                /*
                 * We might have to skip over many keys, or just a few: try
                 * advancing the node iterator, and if we have to skip over too
@@ -1729,9 +1261,9 @@ bch2_btree_path_set_pos(struct btree_trans *trans,
                        __btree_path_level_init(path, l);
        }
 
-       if (l != path->level) {
+       if (unlikely(l != path->level)) {
                btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-               __bch2_btree_path_unlock(path);
+               __bch2_btree_path_unlock(trans, path);
        }
 out:
        bch2_btree_path_verify(trans, path);
@@ -1742,37 +1274,37 @@ out:
 
 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
 {
-       struct btree_path *next;
+       struct btree_path *sib;
 
-       next = prev_btree_path(trans, path);
-       if (next && !btree_path_cmp(next, path))
-               return next;
+       sib = prev_btree_path(trans, path);
+       if (sib && !btree_path_cmp(sib, path))
+               return sib;
 
-       next = next_btree_path(trans, path);
-       if (next && !btree_path_cmp(next, path))
-               return next;
+       sib = next_btree_path(trans, path);
+       if (sib && !btree_path_cmp(sib, path))
+               return sib;
 
        return NULL;
 }
 
 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
 {
-       struct btree_path *next;
+       struct btree_path *sib;
 
-       next = prev_btree_path(trans, path);
-       if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
-               return next;
+       sib = prev_btree_path(trans, path);
+       if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+               return sib;
 
-       next = next_btree_path(trans, path);
-       if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
-               return next;
+       sib = next_btree_path(trans, path);
+       if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+               return sib;
 
        return NULL;
 }
 
 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
 {
-       __bch2_btree_path_unlock(path);
+       __bch2_btree_path_unlock(trans, path);
        btree_path_list_remove(trans, path);
        trans->paths_allocated &= ~(1ULL << path->idx);
 }
@@ -1787,88 +1319,165 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
        if (!__btree_path_put(path, intent))
                return;
 
-       /*
-        * Perhaps instead we should check for duplicate paths in traverse_all:
-        */
-       if (path->preserve &&
-           (dup = have_path_at_pos(trans, path))) {
-               dup->preserve = true;
-               path->preserve = false;
-               goto free;
-       }
+       dup = path->preserve
+               ? have_path_at_pos(trans, path)
+               : have_node_at_pos(trans, path);
+
+       if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+               return;
 
-       if (!path->preserve &&
-           (dup = have_node_at_pos(trans, path)))
-               goto free;
-       return;
-free:
        if (path->should_be_locked &&
-           !btree_node_locked(dup, path->level))
+           !trans->restarted &&
+           (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
                return;
 
-       dup->should_be_locked |= path->should_be_locked;
-       __bch2_path_free(trans, path);
+       if (dup) {
+               dup->preserve           |= path->preserve;
+               dup->should_be_locked   |= path->should_be_locked;
+       }
+
+       __bch2_path_free(trans, path);
+}
+
+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+                                bool intent)
+{
+       EBUG_ON(trans->paths + path->idx != path);
+       EBUG_ON(!path->ref);
+
+       if (!__btree_path_put(path, intent))
+               return;
+
+       __bch2_path_free(trans, path);
+}
+
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       prt_printf(buf, "transaction updates for %s journal seq %llu",
+              trans->fn, trans->journal_res.seq);
+       prt_newline(buf);
+       printbuf_indent_add(buf, 2);
+
+       trans_for_each_update(trans, i) {
+               struct bkey_s_c old = { &i->old_k, i->old_v };
+
+               prt_printf(buf, "update: btree=%s cached=%u %pS",
+                      bch2_btree_ids[i->btree_id],
+                      i->cached,
+                      (void *) i->ip_allocated);
+               prt_newline(buf);
+
+               prt_printf(buf, "  old ");
+               bch2_bkey_val_to_text(buf, trans->c, old);
+               prt_newline(buf);
+
+               prt_printf(buf, "  new ");
+               bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+               prt_newline(buf);
+       }
+
+       printbuf_indent_sub(buf, 2);
+}
+
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+       struct printbuf buf = PRINTBUF;
+
+       bch2_trans_updates_to_text(&buf, trans);
+       bch2_print_string_as_lines(KERN_ERR, buf.buf);
+       printbuf_exit(&buf);
+}
+
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+{
+       prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+                  path->idx, path->ref, path->intent_ref,
+                  path->preserve ? 'P' : ' ',
+                  path->should_be_locked ? 'S' : ' ',
+                  bch2_btree_ids[path->btree_id],
+                  path->level);
+       bch2_bpos_to_text(out, path->pos);
+
+       prt_printf(out, " locks %u", path->nodes_locked);
+#ifdef CONFIG_BCACHEFS_DEBUG
+       prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+       prt_newline(out);
+}
+
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+       struct btree_path *path;
+       unsigned idx;
+
+       trans_for_each_path_inorder(trans, path, idx)
+               bch2_btree_path_to_text(out, path);
 }
 
 noinline __cold
 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
 {
-       struct btree_path *path;
-       struct btree_insert_entry *i;
-       unsigned idx;
-       char buf1[300], buf2[300];
+       struct printbuf buf = PRINTBUF;
 
-       btree_trans_verify_sorted(trans);
+       bch2_trans_paths_to_text(&buf, trans);
+       bch2_trans_updates_to_text(&buf, trans);
 
-       trans_for_each_path_inorder(trans, path, idx)
-               printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
-                      path->idx, path->ref, path->intent_ref,
-                      path->should_be_locked ? " S" : "",
-                      path->preserve ? " P" : "",
-                      bch2_btree_ids[path->btree_id],
-                      (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
-                      path->nodes_locked,
-#ifdef CONFIG_BCACHEFS_DEBUG
-                      (void *) path->ip_allocated
-#else
-                      NULL
-#endif
-                      );
+       bch2_print_string_as_lines(KERN_ERR, buf.buf);
+       printbuf_exit(&buf);
+}
 
-       trans_for_each_update(trans, i) {
-               struct bkey u;
-               struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
+noinline
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+       struct btree_transaction_stats *s = btree_trans_stats(trans);
+       struct printbuf buf = PRINTBUF;
 
-               printk(KERN_ERR "update: btree %s %pS\n  old %s\n  new %s",
-                      bch2_btree_ids[i->btree_id],
-                      (void *) i->ip_allocated,
-                      (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
-                      (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
+       bch2_trans_paths_to_text(&buf, trans);
+
+       if (!buf.allocation_failure) {
+               mutex_lock(&s->lock);
+               if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
+                       s->nr_max_paths = trans->nr_max_paths =
+                               hweight64(trans->paths_allocated);
+                       swap(s->max_paths_text, buf.buf);
+               }
+               mutex_unlock(&s->lock);
        }
+
+       printbuf_exit(&buf);
+}
+
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+       bch2_dump_trans_paths_updates(trans);
+       panic("trans path oveflow\n");
 }
 
-static struct btree_path *btree_path_alloc(struct btree_trans *trans,
-                                          struct btree_path *pos)
+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
+                                                 struct btree_path *pos)
 {
        struct btree_path *path;
        unsigned idx;
 
        if (unlikely(trans->paths_allocated ==
-                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
-               bch2_dump_trans_paths_updates(trans);
-               panic("trans path oveflow\n");
-       }
+                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+               btree_path_overflow(trans);
 
        idx = __ffs64(~trans->paths_allocated);
        trans->paths_allocated |= 1ULL << idx;
 
+       if (unlikely(idx > trans->nr_max_paths))
+               bch2_trans_update_max_paths(trans);
+
        path = &trans->paths[idx];
 
        path->idx               = idx;
        path->ref               = 0;
        path->intent_ref        = 0;
        path->nodes_locked      = 0;
-       path->nodes_intent_locked = 0;
 
        btree_path_list_add(trans, pos, path);
        return path;
@@ -1885,6 +1494,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
        int i;
 
        BUG_ON(trans->restarted);
+       btree_trans_verify_sorted(trans);
+       bch2_trans_verify_locks(trans);
 
        trans_for_each_path_inorder(trans, path, i) {
                if (__btree_path_cmp(path,
@@ -1916,9 +1527,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
                path->level                     = level;
                path->locks_want                = locks_want;
                path->nodes_locked              = 0;
-               path->nodes_intent_locked       = 0;
                for (i = 0; i < ARRAY_SIZE(path->l); i++)
-                       path->l[i].b            = BTREE_ITER_NO_NODE_INIT;
+                       path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
 #ifdef CONFIG_BCACHEFS_DEBUG
                path->ip_allocated              = ip;
 #endif
@@ -1940,10 +1550,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
         */
 
        locks_want = min(locks_want, BTREE_MAX_DEPTH);
-       if (locks_want > path->locks_want) {
-               path->locks_want = locks_want;
-               btree_path_get_locks(trans, path, true);
-       }
+       if (locks_want > path->locks_want)
+               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
 
        return path;
 }
@@ -1951,14 +1559,17 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
 inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
 {
 
+       struct btree_path_level *l = path_l(path);
+       struct bkey_packed *_k;
        struct bkey_s_c k;
 
-       if (!path->cached) {
-               struct btree_path_level *l = path_l(path);
-               struct bkey_packed *_k;
+       if (unlikely(!l->b))
+               return bkey_s_c_null;
 
-               EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+       EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+       EBUG_ON(!btree_node_locked(path, path->level));
 
+       if (!path->cached) {
                _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
                k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
 
@@ -1972,13 +1583,9 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
                EBUG_ON(ck &&
                        (path->btree_id != ck->key.btree_id ||
                         bkey_cmp(path->pos, ck->key.pos)));
+               EBUG_ON(!ck || !ck->valid);
 
-               /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
-               if (unlikely(!ck || !ck->valid))
-                       return bkey_s_c_null;
-
-               EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-
+               *u = ck->k->k;
                k = bkey_i_to_s_c(ck->k);
        }
 
@@ -2011,7 +1618,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
        if (ret)
                return ret;
 
-       iter->path->should_be_locked = true;
+       btree_path_set_should_be_locked(iter->path);
        return 0;
 }
 
@@ -2042,8 +1649,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
-       iter->path->should_be_locked = true;
-       BUG_ON(iter->path->uptodate);
+       btree_path_set_should_be_locked(iter->path);
 out:
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
@@ -2059,7 +1665,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        struct btree_trans *trans = iter->trans;
        struct btree_path *path = iter->path;
        struct btree *b = NULL;
-       unsigned l;
        int ret;
 
        BUG_ON(trans->restarted);
@@ -2072,29 +1677,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
        /* got to end? */
        if (!btree_path_node(path, path->level + 1)) {
-               btree_node_unlock(path, path->level);
-               path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
-               path->level++;
+               btree_path_set_level_up(trans, path);
                return NULL;
        }
 
        if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
-               __bch2_btree_path_unlock(path);
-               path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
-               path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
-               trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
-                                          path->btree_id, &path->pos);
-               btree_trans_restart(trans);
-               ret = -EINTR;
+               __bch2_btree_path_unlock(trans, path);
+               path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+               path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
                goto err;
        }
 
        b = btree_path_node(path, path->level + 1);
 
        if (!bpos_cmp(iter->pos, b->key.k.p)) {
-               btree_node_unlock(path, path->level);
-               path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
-               path->level++;
+               __btree_path_set_level_up(trans, path, path->level++);
        } else {
                /*
                 * Haven't gotten to the end of the parent node: go back down to
@@ -2105,14 +1705,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
                                           iter->flags & BTREE_ITER_INTENT,
                                           btree_iter_ip_allocated(iter));
 
-               path->level = iter->min_depth;
-
-               for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
-                       if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
-                               btree_node_unlock(path, l);
-
-               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-               bch2_btree_iter_verify(iter);
+               btree_path_set_level_down(trans, path, iter->min_depth);
 
                ret = bch2_btree_path_traverse(trans, path, iter->flags);
                if (ret)
@@ -2127,7 +1720,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
-       iter->path->should_be_locked = true;
+       btree_path_set_should_be_locked(iter->path);
        BUG_ON(iter->path->uptodate);
 out:
        bch2_btree_iter_verify_entry_exit(iter);
@@ -2143,15 +1736,23 @@ err:
 
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
-       struct bpos pos = iter->k.p;
-       bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
-                   ? bpos_cmp(pos, SPOS_MAX)
-                   : bkey_cmp(pos, SPOS_MAX)) != 0;
+       if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
+               struct bpos pos = iter->k.p;
+               bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+                           ? bpos_cmp(pos, SPOS_MAX)
+                           : bkey_cmp(pos, SPOS_MAX)) != 0;
 
-       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_successor(iter, pos);
-       bch2_btree_iter_set_pos(iter, pos);
-       return ret;
+               if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+                       pos = bkey_successor(iter, pos);
+               bch2_btree_iter_set_pos(iter, pos);
+               return ret;
+       } else {
+               if (!btree_path_node(iter->path, iter->path->level))
+                       return true;
+
+               iter->advanced = true;
+               return false;
+       }
 }
 
 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -2172,34 +1773,47 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
                                                      struct bpos pos)
 {
        struct btree_insert_entry *i;
+       struct bkey_i *ret = NULL;
 
-       trans_for_each_update(trans, i)
-               if ((cmp_int(btree_id,  i->btree_id) ?:
-                    bpos_cmp(pos,      i->k->k.p)) <= 0) {
-                       if (btree_id == i->btree_id)
-                               return i->k;
+       trans_for_each_update(trans, i) {
+               if (i->btree_id < btree_id)
+                       continue;
+               if (i->btree_id > btree_id)
                        break;
-               }
+               if (bpos_cmp(i->k->k.p, pos) < 0)
+                       continue;
+               if (i->key_cache_already_flushed)
+                       continue;
+               if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
+                       ret = i->k;
+       }
 
-       return NULL;
+       return ret;
 }
 
-static noinline
-struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
-                                         struct btree_path *path)
+struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+                                      struct btree_iter *iter,
+                                      struct bpos start_pos,
+                                      struct bpos end_pos)
 {
-       struct journal_keys *keys = &trans->c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, path->btree_id,
-                                            path->level, path->pos);
+       struct bkey_i *k;
+
+       if (bpos_cmp(start_pos, iter->journal_pos) < 0)
+               iter->journal_idx = 0;
+
+       k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0,
+                                       start_pos, end_pos,
+                                       &iter->journal_idx);
 
-       while (idx < keys->nr && keys->d[idx].overwritten)
-               idx++;
+       iter->journal_pos = k ? k->k.p : end_pos;
+       return k;
+}
 
-       return (idx < keys->nr &&
-               keys->d[idx].btree_id   == path->btree_id &&
-               keys->d[idx].level      == path->level)
-               ? keys->d[idx].k
-               : NULL;
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
+                                           struct btree_iter *iter,
+                                           struct bpos pos)
+{
+       return bch2_btree_journal_peek(trans, iter, pos, pos);
 }
 
 static noinline
@@ -2208,11 +1822,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
                                         struct bkey_s_c k)
 {
        struct bkey_i *next_journal =
-               __btree_trans_peek_journal(trans, iter->path);
+               bch2_btree_journal_peek(trans, iter, iter->path->pos,
+                               k.k ? k.k->p : iter->path->l[0].b->key.k.p);
 
-       if (next_journal &&
-           bpos_cmp(next_journal->k.p,
-                    k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+       if (next_journal) {
                iter->k = next_journal->k;
                k = bkey_i_to_s_c(next_journal);
        }
@@ -2225,7 +1838,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
  * bkey_s_c_null:
  */
 static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
 {
        struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
@@ -2249,11 +1862,20 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
-       iter->key_cache_path->should_be_locked = true;
+       btree_path_set_should_be_locked(iter->key_cache_path);
 
        return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
 }
 
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+       struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
+       int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
+
+       return err ? bkey_s_c_err(err) : ret;
+}
+
 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
 {
        struct btree_trans *trans = iter->trans;
@@ -2261,10 +1883,12 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
        struct bkey_s_c k, k2;
        int ret;
 
-       EBUG_ON(iter->path->cached || iter->path->level);
+       EBUG_ON(iter->path->cached);
        bch2_btree_iter_verify(iter);
 
        while (1) {
+               struct btree_path_level *l;
+
                iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
@@ -2277,22 +1901,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
                        goto out;
                }
 
-               iter->path->should_be_locked = true;
+               l = path_l(iter->path);
+
+               if (unlikely(!l->b)) {
+                       /* No btree nodes at requested level: */
+                       bch2_btree_iter_set_pos(iter, SPOS_MAX);
+                       k = bkey_s_c_null;
+                       goto out;
+               }
+
+               btree_path_set_should_be_locked(iter->path);
 
-               k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+               k = btree_path_level_peek_all(trans->c, l, &iter->k);
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
                    k.k &&
                    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
-                       ret = bkey_err(k2);
+                       k = k2;
+                       ret = bkey_err(k);
                        if (ret) {
-                               k = k2;
                                bch2_btree_iter_set_pos(iter, iter->pos);
                                goto out;
                        }
-
-                       k = k2;
-                       iter->k = *k.k;
                }
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
@@ -2303,7 +1933,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
                        : NULL;
                if (next_update &&
                    bpos_cmp(next_update->k.p,
-                            k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+                            k.k ? k.k->p : l->b->key.k.p) <= 0) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
                }
@@ -2324,9 +1954,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
 
                if (likely(k.k)) {
                        break;
-               } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+               } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
                        /* Advance to next leaf node: */
-                       search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+                       search_key = bpos_successor(l->b->key.k.p);
                } else {
                        /* End of btree: */
                        bch2_btree_iter_set_pos(iter, SPOS_MAX);
@@ -2344,16 +1974,19 @@ out:
  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
  * current position
  */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
 {
        struct btree_trans *trans = iter->trans;
        struct bpos search_key = btree_iter_search_key(iter);
        struct bkey_s_c k;
+       struct bpos iter_pos;
        int ret;
 
+       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+
        if (iter->update_path) {
-               bch2_path_put(trans, iter->update_path,
-                             iter->flags & BTREE_ITER_INTENT);
+               bch2_path_put_nokeep(trans, iter->update_path,
+                                    iter->flags & BTREE_ITER_INTENT);
                iter->update_path = NULL;
        }
 
@@ -2362,12 +1995,30 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        while (1) {
                k = __bch2_btree_iter_peek(iter, search_key);
                if (!k.k || bkey_err(k))
-                       goto out;
+                       goto out_no_locked;
+
+               /*
+                * iter->pos should be monotonically increasing, and always be
+                * equal to the key we just returned - except extents can
+                * straddle iter->pos:
+                */
+               if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+                       iter_pos = k.k->p;
+               else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+                       iter_pos = bkey_start_pos(k.k);
+               else
+                       iter_pos = iter->pos;
+
+               if (bkey_cmp(iter_pos, end) > 0) {
+                       bch2_btree_iter_set_pos(iter, end);
+                       k = bkey_s_c_null;
+                       goto out_no_locked;
+               }
 
                if (iter->update_path &&
                    bkey_cmp(iter->update_path->pos, k.k->p)) {
-                       bch2_path_put(trans, iter->update_path,
-                                     iter->flags & BTREE_ITER_INTENT);
+                       bch2_path_put_nokeep(trans, iter->update_path,
+                                            iter->flags & BTREE_ITER_INTENT);
                        iter->update_path = NULL;
                }
 
@@ -2394,10 +2045,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                        iter->update_path = bch2_btree_path_set_pos(trans,
                                                iter->update_path, pos,
                                                iter->flags & BTREE_ITER_INTENT,
-                                               btree_iter_ip_allocated(iter));
-
-                       BUG_ON(!(iter->update_path->nodes_locked & 1));
-                       iter->update_path->should_be_locked = true;
+                                               _THIS_IP_);
                }
 
                /*
@@ -2421,25 +2069,21 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                break;
        }
 
-       /*
-        * iter->pos should be mononotically increasing, and always be equal to
-        * the key we just returned - except extents can straddle iter->pos:
-        */
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
-               iter->pos = k.k->p;
-       else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
-               iter->pos = bkey_start_pos(k.k);
+       iter->pos = iter_pos;
 
        iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
                                iter->flags & BTREE_ITER_INTENT,
                                btree_iter_ip_allocated(iter));
-       BUG_ON(!iter->path->nodes_locked);
-out:
+
+       btree_path_set_should_be_locked(iter->path);
+out_no_locked:
        if (iter->update_path) {
-               BUG_ON(!(iter->update_path->nodes_locked & 1));
-               iter->update_path->should_be_locked = true;
+               if (iter->update_path->uptodate &&
+                   (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
+                       k = bkey_s_c_err(ret);
+               else
+                       btree_path_set_should_be_locked(iter->update_path);
        }
-       iter->path->should_be_locked = true;
 
        if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
                iter->pos.snapshot = iter->snapshot;
@@ -2455,6 +2099,100 @@ out:
        return k;
 }
 
+/**
+ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
+ * to iterator's current position, returning keys from every level of the btree.
+ * For keys at different levels of the btree that compare equal, the key from
+ * the lower level (leaf) is returned first.
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bkey_s_c k;
+       int ret;
+
+       EBUG_ON(iter->path->cached);
+       bch2_btree_iter_verify(iter);
+       BUG_ON(iter->path->level < iter->min_depth);
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+       EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+       while (1) {
+               iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+
+               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+               if (unlikely(ret)) {
+                       /* ensure that iter->k is consistent with iter->pos: */
+                       bch2_btree_iter_set_pos(iter, iter->pos);
+                       k = bkey_s_c_err(ret);
+                       goto out_no_locked;
+               }
+
+               /* Already at end? */
+               if (!btree_path_node(iter->path, iter->path->level)) {
+                       k = bkey_s_c_null;
+                       goto out_no_locked;
+               }
+
+               k = btree_path_level_peek_all(trans->c,
+                               &iter->path->l[iter->path->level], &iter->k);
+
+               /* Check if we should go up to the parent node: */
+               if (!k.k ||
+                   (iter->advanced &&
+                    !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) {
+                       iter->pos = path_l(iter->path)->b->key.k.p;
+                       btree_path_set_level_up(trans, iter->path);
+                       iter->advanced = false;
+                       continue;
+               }
+
+               /*
+                * Check if we should go back down to a leaf:
+                * If we're not in a leaf node, we only return the current key
+                * if it exactly matches iter->pos - otherwise we first have to
+                * go back to the leaf:
+                */
+               if (iter->path->level != iter->min_depth &&
+                   (iter->advanced ||
+                    !k.k ||
+                    bpos_cmp(iter->pos, k.k->p))) {
+                       btree_path_set_level_down(trans, iter->path, iter->min_depth);
+                       iter->pos = bpos_successor(iter->pos);
+                       iter->advanced = false;
+                       continue;
+               }
+
+               /* Check if we should go to the next key: */
+               if (iter->path->level == iter->min_depth &&
+                   iter->advanced &&
+                   k.k &&
+                   !bpos_cmp(iter->pos, k.k->p)) {
+                       iter->pos = bpos_successor(iter->pos);
+                       iter->advanced = false;
+                       continue;
+               }
+
+               if (iter->advanced &&
+                   iter->path->level == iter->min_depth &&
+                   bpos_cmp(k.k->p, iter->pos))
+                       iter->advanced = false;
+
+               BUG_ON(iter->advanced);
+               BUG_ON(!k.k);
+               break;
+       }
+
+       iter->pos = k.k->p;
+       btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+       bch2_btree_iter_verify(iter);
+
+       return k;
+}
+
 /**
  * bch2_btree_iter_next: returns first key greater than iterator's current
  * position
@@ -2503,19 +2241,19 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                        /* ensure that iter->k is consistent with iter->pos: */
                        bch2_btree_iter_set_pos(iter, iter->pos);
                        k = bkey_s_c_err(ret);
-                       goto out;
+                       goto out_no_locked;
                }
 
-               k = btree_path_level_peek(trans->c, iter->path,
+               k = btree_path_level_peek(trans, iter->path,
                                          &iter->path->l[0], &iter->k);
                if (!k.k ||
                    ((iter->flags & BTREE_ITER_IS_EXTENTS)
                     ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
                     : bpos_cmp(k.k->p, search_key) > 0))
-                       k = btree_path_level_prev(trans->c, iter->path,
+                       k = btree_path_level_prev(trans, iter->path,
                                                  &iter->path->l[0], &iter->k);
 
-               btree_path_check_sort(trans, iter->path, 0);
+               bch2_btree_path_check_sort(trans, iter->path, 0);
 
                if (likely(k.k)) {
                        if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
@@ -2528,7 +2266,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                                 * that candidate
                                 */
                                if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
-                                       bch2_path_put(trans, iter->path,
+                                       bch2_path_put_nokeep(trans, iter->path,
                                                      iter->flags & BTREE_ITER_INTENT);
                                        iter->path = saved_path;
                                        saved_path = NULL;
@@ -2541,7 +2279,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                                                              iter->snapshot,
                                                              k.k->p.snapshot)) {
                                        if (saved_path)
-                                               bch2_path_put(trans, saved_path,
+                                               bch2_path_put_nokeep(trans, saved_path,
                                                      iter->flags & BTREE_ITER_INTENT);
                                        saved_path = btree_path_clone(trans, iter->path,
                                                                iter->flags & BTREE_ITER_INTENT);
@@ -2569,7 +2307,7 @@ got_key:
                        /* Start of btree: */
                        bch2_btree_iter_set_pos(iter, POS_MIN);
                        k = bkey_s_c_null;
-                       goto out;
+                       goto out_no_locked;
                }
        }
 
@@ -2581,10 +2319,11 @@ got_key:
 
        if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
                iter->pos.snapshot = iter->snapshot;
-out:
+
+       btree_path_set_should_be_locked(iter->path);
+out_no_locked:
        if (saved_path)
-               bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
-       iter->path->should_be_locked = true;
+               bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
 
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
@@ -2611,9 +2350,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        struct bkey_s_c k;
        int ret;
 
-       EBUG_ON(iter->path->level);
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
+       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+       EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
 
        /* extents can't span inode numbers: */
        if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
@@ -2630,8 +2370,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                                        btree_iter_ip_allocated(iter));
 
        ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-       if (unlikely(ret))
-               return bkey_s_c_err(ret);
+       if (unlikely(ret)) {
+               k = bkey_s_c_err(ret);
+               goto out_no_locked;
+       }
 
        if ((iter->flags & BTREE_ITER_CACHED) ||
            !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
@@ -2647,29 +2389,38 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                }
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
-                   (next_update = __btree_trans_peek_journal(trans, iter->path)) &&
-                   !bpos_cmp(next_update->k.p, iter->pos)) {
+                   (next_update = bch2_btree_journal_peek_slot(trans,
+                                       iter, iter->pos))) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
                        goto out;
                }
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
-                   (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+                   (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
                        if (!bkey_err(k))
                                iter->k = *k.k;
-                       goto out;
+                       /* We're not returning a key from iter->path: */
+                       goto out_no_locked;
                }
 
                k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+               if (unlikely(!k.k))
+                       goto out_no_locked;
        } else {
                struct bpos next;
 
+               EBUG_ON(iter->path->level);
+
                if (iter->flags & BTREE_ITER_INTENT) {
                        struct btree_iter iter2;
+                       struct bpos end = iter->pos;
+
+                       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+                               end.offset = U64_MAX;
 
                        bch2_trans_copy_iter(&iter2, iter);
-                       k = bch2_btree_iter_peek(&iter2);
+                       k = bch2_btree_iter_peek_upto(&iter2, end);
 
                        if (k.k && !bkey_err(k)) {
                                iter->k = iter2.k;
@@ -2680,11 +2431,14 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                        struct bpos pos = iter->pos;
 
                        k = bch2_btree_iter_peek(iter);
-                       iter->pos = pos;
+                       if (unlikely(bkey_err(k)))
+                               bch2_btree_iter_set_pos(iter, pos);
+                       else
+                               iter->pos = pos;
                }
 
                if (unlikely(bkey_err(k)))
-                       return k;
+                       goto out_no_locked;
 
                next = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
@@ -2706,8 +2460,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                }
        }
 out:
-       iter->path->should_be_locked = true;
-
+       btree_path_set_should_be_locked(iter->path);
+out_no_locked:
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
        ret = bch2_btree_iter_verify_ret(iter, k);
@@ -2759,8 +2513,14 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
        struct btree_path *path, *prev = NULL;
        unsigned i;
 
+       if (!bch2_debug_check_iterators)
+               return;
+
        trans_for_each_path_inorder(trans, path, i) {
-               BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+               if (prev && btree_path_cmp(prev, path) > 0) {
+                       bch2_dump_trans_paths_updates(trans);
+                       panic("trans paths out of order!\n");
+               }
                prev = path;
        }
 #endif
@@ -2777,8 +2537,27 @@ static inline void btree_path_swap(struct btree_trans *trans,
        btree_path_verify_sorted_ref(trans, r);
 }
 
-static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
-                                 int cmp)
+static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
+                                                  struct btree_path *path,
+                                                  int cmp)
+{
+       struct btree_path *n;
+       int cmp2;
+
+       EBUG_ON(!cmp);
+
+       while ((n = cmp < 0
+               ? prev_btree_path(trans, path)
+               : next_btree_path(trans, path)) &&
+              (cmp2 = btree_path_cmp(n, path)) &&
+              cmp2 != cmp)
+               btree_path_swap(trans, n, path);
+
+       btree_trans_verify_sorted(trans);
+}
+
+inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+                                      int cmp)
 {
        struct btree_path *n;
 
@@ -2834,6 +2613,11 @@ static inline void btree_path_list_add(struct btree_trans *trans,
 
        path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
 
+       if (trans->in_traverse_all &&
+           trans->traverse_all_idx != U8_MAX &&
+           trans->traverse_all_idx >= path->sorted_idx)
+               trans->traverse_all_idx++;
+
        array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
 
        for (i = path->sorted_idx; i < trans->nr_sorted; i++)
@@ -2848,7 +2632,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
                bch2_path_put(trans, iter->path,
                              iter->flags & BTREE_ITER_INTENT);
        if (iter->update_path)
-               bch2_path_put(trans, iter->update_path,
+               bch2_path_put_nokeep(trans, iter->update_path,
                              iter->flags & BTREE_ITER_INTENT);
        if (iter->key_cache_path)
                bch2_path_put(trans, iter->key_cache_path,
@@ -2858,15 +2642,21 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
        iter->key_cache_path = NULL;
 }
 
-static void __bch2_trans_iter_init(struct btree_trans *trans,
-                                  struct btree_iter *iter,
-                                  unsigned btree_id, struct bpos pos,
-                                  unsigned locks_want,
-                                  unsigned depth,
-                                  unsigned flags,
-                                  unsigned long ip)
+static inline void __bch2_trans_iter_init(struct btree_trans *trans,
+                                         struct btree_iter *iter,
+                                         unsigned btree_id, struct bpos pos,
+                                         unsigned locks_want,
+                                         unsigned depth,
+                                         unsigned flags,
+                                         unsigned long ip)
 {
-       EBUG_ON(trans->restarted);
+       if (unlikely(trans->restarted))
+               panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n",
+                     bch2_err_str(trans->restarted),
+                     (void *) trans->last_restarted_ip);
+
+       if (flags & BTREE_ITER_ALL_LEVELS)
+               flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
 
        if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
            btree_node_type_is_extents(btree_id))
@@ -2880,15 +2670,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
            btree_type_has_snapshots(btree_id))
                flags |= BTREE_ITER_FILTER_SNAPSHOTS;
 
-       if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
+       if (trans->journal_replay_not_finished)
                flags |= BTREE_ITER_WITH_JOURNAL;
 
-       if (!btree_id_cached(trans->c, btree_id)) {
-               flags &= ~BTREE_ITER_CACHED;
-               flags &= ~BTREE_ITER_WITH_KEY_CACHE;
-       } else if (!(flags & BTREE_ITER_CACHED))
-               flags |= BTREE_ITER_WITH_KEY_CACHE;
-
        iter->trans     = trans;
        iter->path      = NULL;
        iter->update_path = NULL;
@@ -2901,6 +2685,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
        iter->k.type    = KEY_TYPE_deleted;
        iter->k.p       = pos;
        iter->k.size    = 0;
+       iter->journal_idx = 0;
+       iter->journal_pos = POS_MIN;
 #ifdef CONFIG_BCACHEFS_DEBUG
        iter->ip_allocated = ip;
 #endif
@@ -2914,6 +2700,12 @@ void bch2_trans_iter_init(struct btree_trans *trans,
                          unsigned btree_id, struct bpos pos,
                          unsigned flags)
 {
+       if (!btree_id_cached(trans->c, btree_id)) {
+               flags &= ~BTREE_ITER_CACHED;
+               flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+       } else if (!(flags & BTREE_ITER_CACHED))
+               flags |= BTREE_ITER_WITH_KEY_CACHE;
+
        __bch2_trans_iter_init(trans, iter, btree_id, pos,
                               0, 0, flags, _RET_IP_);
 }
@@ -2946,36 +2738,34 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
        dst->key_cache_path = NULL;
 }
 
-void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 {
-       size_t new_top = trans->mem_top + size;
+       unsigned new_top = trans->mem_top + size;
+       size_t old_bytes = trans->mem_bytes;
+       size_t new_bytes = roundup_pow_of_two(new_top);
+       void *new_mem;
        void *p;
 
-       if (new_top > trans->mem_bytes) {
-               size_t old_bytes = trans->mem_bytes;
-               size_t new_bytes = roundup_pow_of_two(new_top);
-               void *new_mem;
+       trans->mem_max = max(trans->mem_max, new_top);
 
-               WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+       WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
 
-               new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
-               if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-                       new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
-                       new_bytes = BTREE_TRANS_MEM_MAX;
-                       kfree(trans->mem);
-               }
+       new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+       if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+               new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+               new_bytes = BTREE_TRANS_MEM_MAX;
+               kfree(trans->mem);
+       }
 
-               if (!new_mem)
-                       return ERR_PTR(-ENOMEM);
+       if (!new_mem)
+               return ERR_PTR(-ENOMEM);
 
-               trans->mem = new_mem;
-               trans->mem_bytes = new_bytes;
+       trans->mem = new_mem;
+       trans->mem_bytes = new_bytes;
 
-               if (old_bytes) {
-                       trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
-                       btree_trans_restart(trans);
-                       return ERR_PTR(-EINTR);
-               }
+       if (old_bytes) {
+               trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
        }
 
        p = trans->mem + trans->mem_top;
@@ -2988,30 +2778,23 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
  * bch2_trans_begin() - reset a transaction after an interrupted attempt
  * @trans: transaction to reset
  *
- * While iterating over nodes or updating nodes a attempt to lock a btree
- * node may return EINTR when the trylock fails. When this occurs
- * bch2_trans_begin() should be called and the transaction retried.
+ * While iterating over nodes or updating nodes, an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs, bch2_trans_begin() should be called and the transaction retried.
  */
-void bch2_trans_begin(struct btree_trans *trans)
+u32 bch2_trans_begin(struct btree_trans *trans)
 {
-       struct btree_insert_entry *i;
        struct btree_path *path;
 
-       trans_for_each_update(trans, i)
-               __btree_path_put(i->path, true);
+       bch2_trans_reset_updates(trans);
 
-       memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-       trans->extra_journal_res        = 0;
-       trans->nr_updates               = 0;
+       trans->restart_count++;
        trans->mem_top                  = 0;
 
-       trans->hooks                    = NULL;
-       trans->extra_journal_entries    = NULL;
-       trans->extra_journal_entry_u64s = 0;
-
        if (trans->fs_usage_deltas) {
                trans->fs_usage_deltas->used = 0;
-               memset(&trans->fs_usage_deltas->memset_start, 0,
+               memset((void *) trans->fs_usage_deltas +
+                      offsetof(struct replicas_delta_list, memset_start), 0,
                       (void *) &trans->fs_usage_deltas->memset_end -
                       (void *) &trans->fs_usage_deltas->memset_start);
        }
@@ -3019,6 +2802,14 @@ void bch2_trans_begin(struct btree_trans *trans)
        trans_for_each_path(trans, path) {
                path->should_be_locked = false;
 
+               /*
+                * If the transaction wasn't restarted, we're presuming to be
+                * doing something new: don't keep iterators except the ones that
+                * are in use - except for the subvolumes btree:
+                */
+               if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+                       path->preserve = false;
+
                /*
                 * XXX: we probably shouldn't be doing this if the transaction
                 * was restarted, but currently we still overflow transaction
@@ -3026,16 +2817,32 @@ void bch2_trans_begin(struct btree_trans *trans)
                 */
                if (!path->ref && !path->preserve)
                        __bch2_path_free(trans, path);
-               else if (!path->ref)
+               else
                        path->preserve = false;
        }
 
-       bch2_trans_cond_resched(trans);
+       if (!trans->restarted &&
+           (need_resched() ||
+            local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+               bch2_trans_unlock(trans);
+               cond_resched();
+               bch2_trans_relock(trans);
+       }
 
+       trans->last_restarted_ip = _RET_IP_;
        if (trans->restarted)
                bch2_btree_path_traverse_all(trans);
 
-       trans->restarted = false;
+       trans->last_begin_time = local_clock();
+       return trans->restart_count;
+}
+
+void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
+{
+       if (trans_was_restarted(trans, restart_count))
+               panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+                     trans->restart_count, restart_count,
+                     (void *) trans->last_restarted_ip);
 }
 
 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
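A minimal caller sketch of the retry pattern described in the bch2_trans_begin() comment above (do_work() is a placeholder, not from this patch); the lockrestart_do() macro added to btree_iter.h wraps the same loop:

        u32 restart_count;
        int ret;

        do {
                restart_count = bch2_trans_begin(trans);
                ret = do_work(trans);           /* any transactional helper */
        } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

        if (!ret)
                bch2_trans_verify_not_restarted(trans, restart_count);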
@@ -3047,7 +2854,7 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
        BUG_ON(trans->used_mempool);
 
 #ifdef __KERNEL__
-       p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
+       p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
 #endif
        if (!p)
                p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
@@ -3056,35 +2863,71 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
        trans->updates          = p; p += updates_bytes;
 }
 
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
-                      unsigned expected_nr_iters,
-                      size_t expected_mem_bytes,
-                      const char *fn)
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+               if (!bch2_btree_transaction_fns[i] ||
+                   bch2_btree_transaction_fns[i] == fn) {
+                       bch2_btree_transaction_fns[i] = fn;
+                       return i;
+               }
+
+       pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+       return i;
+}
+
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
        __acquires(&c->btree_trans_barrier)
 {
+       struct btree_transaction_stats *s;
+       struct btree_trans *pos;
+
        BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
 
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
-       trans->fn               = fn;
+       trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
+               ? bch2_btree_transaction_fns[fn_idx] : NULL;
+       trans->last_begin_time  = local_clock();
+       trans->fn_idx           = fn_idx;
+       trans->locking_wait.task = current;
+       trans->journal_replay_not_finished =
+               !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+       closure_init_stack(&trans->ref);
 
        bch2_trans_alloc_paths(trans, c);
 
-       if (expected_mem_bytes) {
-               trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
-               trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+       s = btree_trans_stats(trans);
+       if (s) {
+               unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+               trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
 
                if (!unlikely(trans->mem)) {
                        trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
                        trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+               } else {
+                       trans->mem_bytes = expected_mem_bytes;
                }
+
+               trans->nr_max_paths = s->nr_max_paths;
        }
 
        trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
-       trans->pid = current->pid;
        mutex_lock(&c->btree_trans_lock);
-       list_add(&trans->list, &c->btree_trans_list);
+       list_for_each_entry(pos, &c->btree_trans_list, list) {
+               if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+                       list_add_tail(&trans->list, &pos->list);
+                       goto list_add_done;
+               }
+       }
+       list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
        mutex_unlock(&c->btree_trans_lock);
 }
 
@@ -3115,9 +2958,15 @@ void bch2_trans_exit(struct btree_trans *trans)
 {
        struct btree_insert_entry *i;
        struct bch_fs *c = trans->c;
+       struct btree_transaction_stats *s = btree_trans_stats(trans);
 
        bch2_trans_unlock(trans);
 
+       closure_sync(&trans->ref);
+
+       if (s)
+               s->max_mem = max(s->max_mem, trans->mem_max);
+
        trans_for_each_update(trans, i)
                __btree_path_put(i->path, true);
        trans->nr_updates               = 0;
@@ -3132,6 +2981,8 @@ void bch2_trans_exit(struct btree_trans *trans)
 
        bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
+       kfree(trans->extra_journal_entries.data);
+
        if (trans->fs_usage_deltas) {
                if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
                    REPLICAS_DELTA_LIST_MAX)
@@ -3161,86 +3012,84 @@ void bch2_trans_exit(struct btree_trans *trans)
 }
 
 static void __maybe_unused
-bch2_btree_path_node_to_text(struct printbuf *out,
-                            struct btree_bkey_cached_common *_b,
-                            bool cached)
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+                                     struct btree_bkey_cached_common *b)
 {
-       pr_buf(out, "    l=%u %s:",
-              _b->level, bch2_btree_ids[_b->btree_id]);
-       bch2_bpos_to_text(out, btree_node_pos(_b, cached));
-}
+       struct six_lock_count c = six_lock_counts(&b->lock);
+       struct task_struct *owner;
+       pid_t pid;
 
-static bool trans_has_locks(struct btree_trans *trans)
-{
-       struct btree_path *path;
+       rcu_read_lock();
+       owner = READ_ONCE(b->lock.owner);
+       pid = owner ? owner->pid : 0;
+       rcu_read_unlock();
 
-       trans_for_each_path(trans, path)
-               if (path->nodes_locked)
-                       return true;
-       return false;
+       prt_tab(out);
+       prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+                  b->level, bch2_btree_ids[b->btree_id]);
+       bch2_bpos_to_text(out, btree_node_pos(b));
+
+       prt_tab(out);
+       prt_printf(out, " locks %u:%u:%u held by pid %u",
+                  c.n[0], c.n[1], c.n[2], pid);
 }
 
-void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 {
-       struct btree_trans *trans;
        struct btree_path *path;
-       struct btree *b;
+       struct btree_bkey_cached_common *b;
        static char lock_types[] = { 'r', 'i', 'w' };
        unsigned l;
 
-       mutex_lock(&c->btree_trans_lock);
-       list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if (!trans_has_locks(trans))
-                       continue;
+       if (!out->nr_tabstops) {
+               printbuf_tabstop_push(out, 16);
+               printbuf_tabstop_push(out, 32);
+       }
 
-               pr_buf(out, "%i %s\n", trans->pid, trans->fn);
+       prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
 
-               trans_for_each_path(trans, path) {
-                       if (!path->nodes_locked)
-                               continue;
+       trans_for_each_path(trans, path) {
+               if (!path->nodes_locked)
+                       continue;
 
-                       pr_buf(out, "  path %u %c l=%u %s:",
-                              path->idx,
-                              path->cached ? 'c' : 'b',
-                              path->level,
-                              bch2_btree_ids[path->btree_id]);
-                       bch2_bpos_to_text(out, path->pos);
-                       pr_buf(out, "\n");
-
-                       for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-                               if (btree_node_locked(path, l)) {
-                                       pr_buf(out, "    %s l=%u ",
-                                              btree_node_intent_locked(path, l) ? "i" : "r", l);
-                                       bch2_btree_path_node_to_text(out,
-                                                       (void *) path->l[l].b,
-                                                       path->cached);
-                                       pr_buf(out, "\n");
-                               }
+               prt_printf(out, "  path %u %c l=%u %s:",
+                      path->idx,
+                      path->cached ? 'c' : 'b',
+                      path->level,
+                      bch2_btree_ids[path->btree_id]);
+               bch2_bpos_to_text(out, path->pos);
+               prt_newline(out);
+
+               for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+                       if (btree_node_locked(path, l) &&
+                           !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+                               prt_printf(out, "    %c l=%u ",
+                                          lock_types[btree_node_locked_type(path, l)], l);
+                               bch2_btree_bkey_cached_common_to_text(out, b);
+                               prt_newline(out);
                        }
                }
+       }
 
-               b = READ_ONCE(trans->locking);
-               if (b) {
-                       path = &trans->paths[trans->locking_path_idx];
-                       pr_buf(out, "  locking path %u %c l=%u %c %s:",
-                              trans->locking_path_idx,
-                              path->cached ? 'c' : 'b',
-                              trans->locking_level,
-                              lock_types[trans->locking_lock_type],
-                              bch2_btree_ids[trans->locking_btree_id]);
-                       bch2_bpos_to_text(out, trans->locking_pos);
-
-                       pr_buf(out, " node ");
-                       bch2_btree_path_node_to_text(out,
-                                       (void *) b, path->cached);
-                       pr_buf(out, "\n");
-               }
+       b = READ_ONCE(trans->locking);
+       if (b) {
+               prt_str(out, "  want");
+               prt_newline(out);
+               prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
+               bch2_btree_bkey_cached_common_to_text(out, b);
+               prt_newline(out);
        }
-       mutex_unlock(&c->btree_trans_lock);
 }
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
+       struct btree_transaction_stats *s;
+
+       for (s = c->btree_transaction_stats;
+            s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+            s++)
+               kfree(s->max_paths_text);
+
        if (c->btree_trans_barrier_initialized)
                cleanup_srcu_struct(&c->btree_trans_barrier);
        mempool_exit(&c->btree_trans_mem_pool);
@@ -3249,9 +3098,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
-       unsigned nr = BTREE_ITER_MAX;
+       unsigned i, nr = BTREE_ITER_MAX;
        int ret;
 
+       for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
+               mutex_init(&c->btree_transaction_stats[i].lock);
+
        INIT_LIST_HEAD(&c->btree_trans_list);
        mutex_init(&c->btree_trans_lock);
 
index 759c7b52f4a24f34ddf061735341c27aee49bee1..0775cfa2be9a37a0a042bd56f52217fc3e5d0ffc 100644 (file)
@@ -5,6 +5,8 @@
 #include "bset.h"
 #include "btree_types.h"
 
+#include <trace/events/bcachefs.h>
+
 static inline void __btree_path_get(struct btree_path *path, bool intent)
 {
        path->ref++;
@@ -70,11 +72,16 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
        return &trans->paths[idx];
 }
 
-#define trans_for_each_path(_trans, _path)                             \
-       for (_path = __trans_next_path((_trans), 0);                    \
+void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+
+#define trans_for_each_path_from(_trans, _path, _start)                        \
+       for (_path = __trans_next_path((_trans), _start);               \
             (_path);                                                   \
             _path = __trans_next_path((_trans), (_path)->idx + 1))
 
+#define trans_for_each_path(_trans, _path)                             \
+       trans_for_each_path_from(_trans, _path, 0)
+
 static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
        unsigned idx = path ? path->sorted_idx + 1 : 0;
@@ -124,9 +131,20 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
             _path = __trans_next_path_with_node((_trans), (_b),        \
                                                 (_path)->idx + 1))
 
-struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
                         bool, unsigned long);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+                        struct btree_path *path, bool intent,
+                        unsigned long ip)
+{
+       if (path->ref > 1 || path->preserve)
+               path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+       path->should_be_locked = false;
+       return path;
+}
+
 struct btree_path * __must_check
 bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
                        struct bpos, bool, unsigned long);
@@ -136,14 +154,18 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo
                                 unsigned, unsigned, unsigned, unsigned long);
 inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
 
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+                                       struct btree_iter *, struct bpos);
+
+inline void bch2_btree_path_level_init(struct btree_trans *,
+                                      struct btree_path *, struct btree *);
+
 #ifdef CONFIG_BCACHEFS_DEBUG
 void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_trans_verify_locks(struct btree_trans *);
 void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
                            struct bpos, bool);
 #else
 static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
 static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
                                          struct bpos pos, bool key_cache) {}
 #endif
@@ -154,46 +176,50 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
                              struct btree *, struct btree_node_iter *,
                              struct bkey_packed *, unsigned, unsigned);
 
-bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
 
 void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
 
-bool bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
+bool bch2_trans_locked(struct btree_trans *);
 
-__always_inline
-static inline int btree_trans_restart(struct btree_trans *trans)
+static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
 {
-       trans->restarted = true;
-       bch2_trans_unlock(trans);
-       return -EINTR;
+       return restart_count != trans->restart_count;
 }
 
-bool bch2_btree_node_upgrade(struct btree_trans *,
-                            struct btree_path *, unsigned);
-
-bool __bch2_btree_path_upgrade(struct btree_trans *,
-                              struct btree_path *, unsigned);
+void bch2_trans_verify_not_restarted(struct btree_trans *, u32);
 
-static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
-                                          struct btree_path *path,
-                                          unsigned new_locks_want)
+__always_inline
+static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
 {
-       new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+       BUG_ON(err <= 0);
+       BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart));
 
-       return path->locks_want < new_locks_want
-               ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
-               : path->uptodate == BTREE_ITER_UPTODATE;
+       trans->restarted = err;
+       return -err;
 }
 
-void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
+__always_inline
+static inline int btree_trans_restart(struct btree_trans *trans, int err)
+{
+       btree_trans_restart_nounlock(trans, err);
+       return -err;
+}
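Restart errors are now specific BCH_ERR codes stored in trans->restarted rather than a bare -EINTR; callers match them with bch2_err_matches(). A sketch, assuming a retry label in the caller (purely illustrative):

        ret = bch2_btree_iter_traverse(&iter);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;     /* back to bch2_trans_begin() */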
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+                            struct btree_path *, unsigned);
+
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
 
-static inline void bch2_btree_path_downgrade(struct btree_path *path)
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+                                            struct btree_path *path)
 {
        unsigned new_locks_want = path->level + !!path->intent_ref;
 
        if (path->locks_want > new_locks_want)
-               __bch2_btree_path_downgrade(path, new_locks_want);
+               __bch2_btree_path_downgrade(trans, path, new_locks_want);
 }
 
 void bch2_trans_downgrade(struct btree_trans *);
@@ -207,9 +233,16 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *);
 struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
 struct btree *bch2_btree_iter_next_node(struct btree_iter *);
 
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
 
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+       return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
 
@@ -267,11 +300,28 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
 
 static inline void set_btree_iter_dontneed(struct btree_iter *iter)
 {
-       iter->path->preserve = false;
+       if (!iter->trans->restarted)
+               iter->path->preserve = false;
 }
 
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
-void bch2_trans_begin(struct btree_trans *);
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+       unsigned new_top = trans->mem_top + size;
+       void *p = trans->mem + trans->mem_top;
+
+       if (likely(new_top <= trans->mem_bytes)) {
+               trans->mem_top += size;
+               memset(p, 0, size);
+               return p;
+       } else {
+               return __bch2_trans_kmalloc(trans, size);
+       }
+}
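The inline fast path above only bumps mem_top while the transaction's preallocated buffer still has room; __bch2_trans_kmalloc() grows the buffer and may restart the transaction. Caller sketch (struct foo is illustrative, not from the patch):

        struct foo *p = bch2_trans_kmalloc(trans, sizeof(*p));
        if (IS_ERR(p))
                return PTR_ERR(p);      /* -ENOMEM or a transaction restart */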
+
+u32 bch2_trans_begin(struct btree_trans *);
 
 static inline struct btree *
 __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter)
@@ -279,7 +329,7 @@ __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter
        struct btree *b;
 
        while (b = bch2_btree_iter_peek_node(iter),
-              PTR_ERR_OR_ZERO(b) == -EINTR)
+              bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
                bch2_trans_begin(trans);
 
        return b;
@@ -303,18 +353,44 @@ static inline int bkey_err(struct bkey_s_c k)
        return PTR_ERR_OR_ZERO(k.k);
 }
 
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+                                                            unsigned flags)
+{
+       BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
+
+       return  flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
+                                               bch2_btree_iter_peek_prev(iter);
+}
+
 static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
-                                                    unsigned flags)
+                                                       unsigned flags)
 {
-       return flags & BTREE_ITER_SLOTS
-               ? bch2_btree_iter_peek_slot(iter)
-               : bch2_btree_iter_peek(iter);
+       return  flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+               flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
+                                               bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+                                                            struct bpos end,
+                                                            unsigned flags)
+{
+       if (!(flags & BTREE_ITER_SLOTS))
+               return bch2_btree_iter_peek_upto(iter, end);
+
+       if (bkey_cmp(iter->pos, end) > 0)
+               return bkey_s_c_null;
+
+       return bch2_btree_iter_peek_slot(iter);
 }
 
 static inline int btree_trans_too_many_iters(struct btree_trans *trans)
 {
-       return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
-               ? -EINTR : 0;
+       if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+               trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+       }
+
+       return 0;
 }
 
 static inline struct bkey_s_c
@@ -325,12 +401,124 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 
        while (btree_trans_too_many_iters(trans) ||
               (k = bch2_btree_iter_peek_type(iter, flags),
-               bkey_err(k) == -EINTR))
+               bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
                bch2_trans_begin(trans);
 
        return k;
 }
 
+#define lockrestart_do(_trans, _do)                                    \
+({                                                                     \
+       u32 _restart_count;                                             \
+       int _ret;                                                       \
+                                                                       \
+       do {                                                            \
+               _restart_count = bch2_trans_begin(_trans);              \
+               _ret = (_do);                                           \
+       } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));  \
+                                                                       \
+       if (!_ret)                                                      \
+               bch2_trans_verify_not_restarted(_trans, _restart_count);\
+                                                                       \
+       _ret;                                                           \
+})
+
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ *  - We don't call bch2_trans_begin() unless we had a transaction restart
+ *  - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ *  transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do)                             \
+({                                                                     \
+       u32 _restart_count, _orig_restart_count;                        \
+       int _ret;                                                       \
+                                                                       \
+       _restart_count = _orig_restart_count = (_trans)->restart_count; \
+                                                                       \
+       while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+               _restart_count = bch2_trans_begin(_trans);              \
+                                                                       \
+       if (!_ret)                                                      \
+               bch2_trans_verify_not_restarted(_trans, _restart_count);\
+                                                                       \
+       if (!_ret && trans_was_restarted(_trans, _orig_restart_count))  \
+               _ret = -BCH_ERR_transaction_restart_nested;             \
+                                                                       \
+       _ret;                                                           \
+})
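Usage sketch for the two macros above (do_update() stands in for any helper returning an int error code; not from the patch):

        int ret = lockrestart_do(trans, do_update(trans));

        /*
         * Inside an outer transaction context, the nested variant skips the
         * up-front bch2_trans_begin() and reports a restart to the caller so
         * it knows its own iterators may have been invalidated:
         */
        ret = nested_lockrestart_do(trans, do_update(trans));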
+
+#define for_each_btree_key2(_trans, _iter, _btree_id,                  \
+                           _start, _flags, _k, _do)                    \
+({                                                                     \
+       int _ret = 0;                                                   \
+                                                                       \
+       bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
+                            (_start), (_flags));                       \
+                                                                       \
+       while (1) {                                                     \
+               u32 _restart_count = bch2_trans_begin(_trans);          \
+               (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags));   \
+               if (!(_k).k) {                                          \
+                       _ret = 0;                                       \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               _ret = bkey_err(_k) ?: (_do);                           \
+               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+                       continue;                                       \
+               if (_ret)                                               \
+                       break;                                          \
+               bch2_trans_verify_not_restarted(_trans, _restart_count);\
+               if (!bch2_btree_iter_advance(&(_iter)))                 \
+                       break;                                          \
+       }                                                               \
+                                                                       \
+       bch2_trans_iter_exit((_trans), &(_iter));                       \
+       _ret;                                                           \
+})
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id,           \
+                                  _start, _flags, _k, _do)             \
+({                                                                     \
+       int _ret = 0;                                                   \
+                                                                       \
+       bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
+                            (_start), (_flags));                       \
+                                                                       \
+       while (1) {                                                     \
+               u32 _restart_count = bch2_trans_begin(_trans);          \
+               (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
+               if (!(_k).k) {                                          \
+                       _ret = 0;                                       \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               _ret = bkey_err(_k) ?: (_do);                           \
+               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+                       continue;                                       \
+               if (_ret)                                               \
+                       break;                                          \
+               bch2_trans_verify_not_restarted(_trans, _restart_count);\
+               if (!bch2_btree_iter_rewind(&(_iter)))                  \
+                       break;                                          \
+       }                                                               \
+                                                                       \
+       bch2_trans_iter_exit((_trans), &(_iter));                       \
+       _ret;                                                           \
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id,            \
+                                 _start, _iter_flags, _k,              \
+                                 _disk_res, _journal_seq, _commit_flags,\
+                                 _do)                                  \
+       for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+                           (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+                                       (_journal_seq), (_commit_flags)))
+
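Illustrative use of for_each_btree_key_commit() (check_key() is a placeholder; the btree id, iterator flags, and commit flags are only examples):

        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_NOFAIL,
                check_key(trans, &iter, k));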
 #define for_each_btree_key(_trans, _iter, _btree_id,                   \
                           _start, _flags, _k, _ret)                    \
        for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
@@ -347,6 +535,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
             bch2_btree_iter_advance(&(_iter)))
 
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id,    \
+                          _start, _end, _flags, _k, _ret)              \
+       for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
+                                 (_start), (_flags));                  \
+            (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+            !((_ret) = bkey_err(_k)) && (_k).k;                        \
+            bch2_btree_iter_advance(&(_iter)))
+
 #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret)   \
        for (;                                                          \
             (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
@@ -361,14 +557,28 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 
 /* new multiple iterator interface: */
 
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
 void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
-                      unsigned, size_t, const char *);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
 void bch2_trans_exit(struct btree_trans *);
 
-#define bch2_trans_init(...)   __bch2_trans_init(__VA_ARGS__, __func__)
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
+#define bch2_trans_init(_trans, _c, _nr_iters, _mem)                   \
+do {                                                                   \
+       static unsigned trans_fn_idx;                                   \
+                                                                       \
+       if (unlikely(!trans_fn_idx))                                    \
+               trans_fn_idx = bch2_trans_get_fn_idx(__func__);         \
+                                                                       \
+       __bch2_trans_init(_trans, _c, trans_fn_idx);                    \
+} while (0)
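The static per-callsite index interns __func__ once into bch2_btree_transaction_fns, letting __bch2_trans_init() size the transaction from per-function stats; the _nr_iters and _mem arguments are no longer used by the macro body. Typical call site (do_work() is illustrative):

        struct btree_trans trans;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
        ret = lockrestart_do(&trans, do_work(&trans));
        bch2_trans_exit(&trans);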
 
-void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
 
 void bch2_fs_btree_iter_exit(struct bch_fs *);
 int bch2_fs_btree_iter_init(struct bch_fs *);
index 928aab61bcf6c25877700e61ec1e088523945653..cd52dd5a2890e44263f7f2925bddb8951cbcba8b 100644 (file)
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
 #include "btree_cache.h"
@@ -5,6 +6,7 @@
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update.h"
+#include "errcode.h"
 #include "error.h"
 #include "journal.h"
 #include "journal_reclaim.h"
 #include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+       return id == BTREE_ID_subvolumes;
+}
+
 static struct kmem_cache *bch2_key_cache;
 
 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -83,26 +90,185 @@ static void bkey_cached_free(struct btree_key_cache *bc,
        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
-       list_move_tail(&ck->list, &bc->freed);
-       bc->nr_freed++;
+       if (ck->c.lock.readers)
+               list_move_tail(&ck->list, &bc->freed_pcpu);
+       else
+               list_move_tail(&ck->list, &bc->freed_nonpcpu);
+       atomic_long_inc(&bc->nr_freed);
+
+       kfree(ck->k);
+       ck->k           = NULL;
+       ck->u64s        = 0;
+
+       six_unlock_write(&ck->c.lock);
+       six_unlock_intent(&ck->c.lock);
+}
+
+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
+                                                  struct bkey_cached *ck)
+{
+       struct bkey_cached *pos;
+
+       list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
+               if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
+                                pos->btree_trans_barrier_seq)) {
+                       list_move(&ck->list, &pos->list);
+                       return;
+               }
+       }
+
+       list_move(&ck->list, &bc->freed_nonpcpu);
+}
+
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+                                        struct bkey_cached *ck)
+{
+       struct btree_key_cache_freelist *f;
+       bool freed = false;
+
+       BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+       if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
+               preempt_disable();
+               f = this_cpu_ptr(bc->pcpu_freed);
+
+               if (f->nr < ARRAY_SIZE(f->objs)) {
+                       f->objs[f->nr++] = ck;
+                       freed = true;
+               }
+               preempt_enable();
+
+               if (!freed) {
+                       mutex_lock(&bc->lock);
+                       preempt_disable();
+                       f = this_cpu_ptr(bc->pcpu_freed);
+
+                       while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+                               struct bkey_cached *ck2 = f->objs[--f->nr];
+
+                               __bkey_cached_move_to_freelist_ordered(bc, ck2);
+                       }
+                       preempt_enable();
+
+                       __bkey_cached_move_to_freelist_ordered(bc, ck);
+                       mutex_unlock(&bc->lock);
+               }
+#else
+               mutex_lock(&bc->lock);
+               list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               mutex_unlock(&bc->lock);
+#endif
+       } else {
+               mutex_lock(&bc->lock);
+               list_move_tail(&ck->list, &bc->freed_pcpu);
+               mutex_unlock(&bc->lock);
+       }
+}
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+                                 struct bkey_cached *ck)
+{
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+       ck->btree_trans_barrier_seq =
+               start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+       list_del_init(&ck->list);
+       atomic_long_inc(&bc->nr_freed);
 
        kfree(ck->k);
        ck->k           = NULL;
        ck->u64s        = 0;
 
+       bkey_cached_move_to_freelist(bc, ck);
+
        six_unlock_write(&ck->c.lock);
        six_unlock_intent(&ck->c.lock);
 }
 
 static struct bkey_cached *
-bkey_cached_alloc(struct btree_key_cache *c)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
 {
-       struct bkey_cached *ck;
+       struct bch_fs *c = trans->c;
+       struct btree_key_cache *bc = &c->btree_key_cache;
+       struct bkey_cached *ck = NULL;
+       struct btree_key_cache_freelist *f;
+       bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+
+       if (!pcpu_readers) {
+#ifdef __KERNEL__
+               preempt_disable();
+               f = this_cpu_ptr(bc->pcpu_freed);
+               if (f->nr)
+                       ck = f->objs[--f->nr];
+               preempt_enable();
+
+               if (!ck) {
+                       mutex_lock(&bc->lock);
+                       preempt_disable();
+                       f = this_cpu_ptr(bc->pcpu_freed);
+
+                       while (!list_empty(&bc->freed_nonpcpu) &&
+                              f->nr < ARRAY_SIZE(f->objs) / 2) {
+                               ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+                               list_del_init(&ck->list);
+                               f->objs[f->nr++] = ck;
+                       }
 
+                       ck = f->nr ? f->objs[--f->nr] : NULL;
+                       preempt_enable();
+                       mutex_unlock(&bc->lock);
+               }
+#else
+               mutex_lock(&bc->lock);
+               if (!list_empty(&bc->freed_nonpcpu)) {
+                       ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+                       list_del_init(&ck->list);
+               }
+               mutex_unlock(&bc->lock);
+#endif
+       } else {
+               mutex_lock(&bc->lock);
+               if (!list_empty(&bc->freed_pcpu)) {
+                       ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
+                       list_del_init(&ck->list);
+               }
+               mutex_unlock(&bc->lock);
+       }
+
+       if (ck) {
+               int ret;
+
+               ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent);
+               if (unlikely(ret)) {
+                       bkey_cached_move_to_freelist(bc, ck);
+                       return ERR_PTR(ret);
+               }
+
+               path->l[0].b = (void *) ck;
+               path->l[0].lock_seq = ck->c.lock.state.seq;
+               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+
+               ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+               if (unlikely(ret)) {
+                       btree_node_unlock(trans, path, 0);
+                       bkey_cached_move_to_freelist(bc, ck);
+                       return ERR_PTR(ret);
+               }
+
+               return ck;
+       }
+
+       /* GFP_NOFS because we're holding btree locks: */
        ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
        if (likely(ck)) {
                INIT_LIST_HEAD(&ck->list);
-               six_lock_init(&ck->c.lock);
+               __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+               if (pcpu_readers)
+                       six_lock_pcpu_alloc(&ck->c.lock);
+
+               ck->c.cached = true;
                BUG_ON(!six_trylock_intent(&ck->c.lock));
                BUG_ON(!six_trylock_write(&ck->c.lock));
                return ck;
@@ -120,15 +286,6 @@ bkey_cached_reuse(struct btree_key_cache *c)
        unsigned i;
 
        mutex_lock(&c->lock);
-       list_for_each_entry_reverse(ck, &c->freed, list)
-               if (bkey_cached_lock_for_evict(ck)) {
-                       c->nr_freed--;
-                       list_del(&ck->list);
-                       mutex_unlock(&c->lock);
-                       return ck;
-               }
-       mutex_unlock(&c->lock);
-
        rcu_read_lock();
        tbl = rht_dereference_rcu(c->table.tbl, &c->table);
        for (i = 0; i < tbl->size; i++)
@@ -136,46 +293,47 @@ bkey_cached_reuse(struct btree_key_cache *c)
                        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
                            bkey_cached_lock_for_evict(ck)) {
                                bkey_cached_evict(c, ck);
-                               rcu_read_unlock();
-                               return ck;
+                               goto out;
                        }
                }
+       ck = NULL;
+out:
        rcu_read_unlock();
-
-       return NULL;
+       mutex_unlock(&c->lock);
+       return ck;
 }
 
 static struct bkey_cached *
-btree_key_cache_create(struct bch_fs *c,
-                      enum btree_id btree_id,
-                      struct bpos pos)
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
 {
+       struct bch_fs *c = trans->c;
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bkey_cached *ck;
        bool was_new = true;
 
-       ck = bkey_cached_alloc(bc);
+       ck = bkey_cached_alloc(trans, path);
+       if (IS_ERR(ck))
+               return ck;
 
        if (unlikely(!ck)) {
                ck = bkey_cached_reuse(bc);
                if (unlikely(!ck)) {
                        bch_err(c, "error allocating memory for key cache item, btree %s",
-                               bch2_btree_ids[btree_id]);
+                               bch2_btree_ids[path->btree_id]);
                        return ERR_PTR(-ENOMEM);
                }
 
+               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
                was_new = false;
+       } else {
+               if (path->btree_id == BTREE_ID_subvolumes)
+                       six_lock_pcpu_alloc(&ck->c.lock);
        }
 
-       if (btree_id == BTREE_ID_subvolumes)
-               six_lock_pcpu_alloc(&ck->c.lock);
-       else
-               six_lock_pcpu_free(&ck->c.lock);
-
        ck->c.level             = 0;
-       ck->c.btree_id          = btree_id;
-       ck->key.btree_id        = btree_id;
-       ck->key.pos             = pos;
+       ck->c.btree_id          = path->btree_id;
+       ck->key.btree_id        = path->btree_id;
+       ck->key.pos             = path->pos;
        ck->valid               = false;
        ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 
@@ -187,11 +345,10 @@ btree_key_cache_create(struct bch_fs *c,
                if (likely(was_new)) {
                        six_unlock_write(&ck->c.lock);
                        six_unlock_intent(&ck->c.lock);
+                       mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
                        kfree(ck);
                } else {
-                       mutex_lock(&bc->lock);
-                       bkey_cached_free(bc, ck);
-                       mutex_unlock(&bc->lock);
+                       bkey_cached_free_fast(bc, ck);
                }
 
                return NULL;
@@ -224,9 +381,8 @@ static int btree_key_cache_fill(struct btree_trans *trans,
        k = bch2_btree_path_peek_slot(path, &u);
 
        if (!bch2_btree_node_relock(trans, ck_path, 0)) {
-               trace_trans_restart_relock_key_cache_fill(trans->fn,
-                               _THIS_IP_, ck_path->btree_id, &ck_path->pos);
-               ret = btree_trans_restart(trans);
+               trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
                goto err;
        }
 
@@ -236,6 +392,13 @@ static int btree_key_cache_fill(struct btree_trans *trans,
         */
        new_u64s = k.k->u64s + 1;
 
+       /*
+        * Allocate some extra space so that the transaction commit path is less
+        * likely to have to reallocate, since that requires a transaction
+        * restart:
+        */
+       new_u64s = min(256U, (new_u64s * 3) / 2);
+
        if (new_u64s > ck->u64s) {
                new_u64s = roundup_pow_of_two(new_u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
@@ -247,11 +410,12 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                }
        }
 
-       /*
-        * XXX: not allowed to be holding read locks when we take a write lock,
-        * currently
-        */
-       bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
+       ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+       if (ret) {
+               kfree(new_k);
+               goto err;
+       }
+
        if (new_k) {
                kfree(ck->k);
                ck->u64s = new_u64s;
@@ -269,18 +433,9 @@ err:
        return ret;
 }
 
-static int bkey_cached_check_fn(struct six_lock *lock, void *p)
-{
-       struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
-       const struct btree_path *path = p;
-
-       return ck->key.btree_id == path->btree_id &&
-               !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
-}
-
-__flatten
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
-                                   unsigned flags)
+static noinline int
+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
+                                        unsigned flags)
 {
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck;
@@ -297,32 +452,24 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path
 retry:
        ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
        if (!ck) {
-               if (flags & BTREE_ITER_CACHED_NOCREATE) {
-                       path->l[0].b = NULL;
-                       return 0;
-               }
-
-               ck = btree_key_cache_create(c, path->btree_id, path->pos);
+               ck = btree_key_cache_create(trans, path);
                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
                path->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);
 
-               if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
-                                    lock_want,
-                                    bkey_cached_check_fn, path, _THIS_IP_)) {
-                       if (!trans->restarted)
-                               goto retry;
-
-                       ret = -EINTR;
+               ret = btree_node_lock(trans, path, (void *) ck, 0,
+                                     lock_want, _THIS_IP_);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto err;
-               }
+
+               BUG_ON(ret);
 
                if (ck->key.btree_id != path->btree_id ||
                    bpos_cmp(ck->key.pos, path->pos)) {
@@ -330,17 +477,21 @@ retry:
                        goto retry;
                }
 
-               mark_btree_node_locked(path, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0, lock_want);
        }
 
        path->l[0].lock_seq     = ck->c.lock.state.seq;
        path->l[0].b            = (void *) ck;
 fill:
-       if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+       if (!ck->valid) {
+               /*
+                * Using the underscore version because we haven't set
+                * path->uptodate yet:
+                */
                if (!path->locks_want &&
                    !__bch2_btree_path_upgrade(trans, path, 1)) {
-                       trace_transaction_restart_ip(trans->fn, _THIS_IP_);
-                       ret = btree_trans_restart(trans);
+                       trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
+                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
                        goto err;
                }
 
@@ -353,17 +504,72 @@ fill:
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 
        path->uptodate = BTREE_ITER_UPTODATE;
+       BUG_ON(!ck->valid);
        BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 
        return ret;
 err:
-       if (ret != -EINTR) {
-               btree_node_unlock(path, 0);
-               path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+       if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+               btree_node_unlock(trans, path, 0);
+               path->l[0].b = ERR_PTR(ret);
        }
        return ret;
 }
 
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+                                   unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_cached *ck;
+       int ret = 0;
+
+       EBUG_ON(path->level);
+
+       path->l[1].b = NULL;
+
+       if (bch2_btree_node_relock(trans, path, 0)) {
+               ck = (void *) path->l[0].b;
+               goto fill;
+       }
+retry:
+       ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+       if (!ck) {
+               return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+       } else {
+               enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+               ret = btree_node_lock(trans, path, (void *) ck, 0,
+                                     lock_want, _THIS_IP_);
+               EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+               if (ret)
+                       return ret;
+
+               if (ck->key.btree_id != path->btree_id ||
+                   bpos_cmp(ck->key.pos, path->pos)) {
+                       six_unlock_type(&ck->c.lock, lock_want);
+                       goto retry;
+               }
+
+               mark_btree_node_locked(trans, path, 0, lock_want);
+       }
+
+       path->l[0].lock_seq     = ck->c.lock.state.seq;
+       path->l[0].b            = (void *) ck;
+fill:
+       if (!ck->valid)
+               return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+
+       if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+               set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+       path->uptodate = BTREE_ITER_UPTODATE;
+       EBUG_ON(!ck->valid);
+       EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+
+       return ret;
+}
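
bch2_btree_path_traverse_cached() is now a thin fast path: if the cached entry exists, can be locked and is valid, it finishes cheaply, and everything else falls through to the noinline slowpath above. A toy single-slot model of that split, with hypothetical names standing in for the real hash lookup and locking:

        #include <stdio.h>

        struct cache_entry {
                int key;
                int valid;
                int value;
        };

        /* Toy "cache": one slot, standing in for the key cache hash table. */
        static struct cache_entry slot = { .key = -1 };

        /* Slow path: create and/or fill the entry; may be expensive. */
        static int traverse_slowpath(int key, int *out)
        {
                slot.key   = key;
                slot.value = key * 10;  /* pretend this came from the btree */
                slot.valid = 1;
                *out = slot.value;
                return 0;
        }

        /* Fast path: only handles a present, valid entry; punts otherwise. */
        static int traverse(int key, int *out)
        {
                if (slot.key != key || !slot.valid)
                        return traverse_slowpath(key, out);

                *out = slot.value;
                return 0;
        }

        int main(void)
        {
                int v;

                traverse(7, &v);        /* first access goes through the slow path */
                traverse(7, &v);        /* second access is served by the fast path */
                printf("%d\n", v);
                return 0;
        }
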
+
 static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                     struct bkey_cached_key key,
                                     u64 journal_seq,
@@ -382,8 +588,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                             BTREE_ITER_ALL_SNAPSHOTS);
        bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
                             BTREE_ITER_CACHED|
-                            BTREE_ITER_CACHED_NOFILL|
-                            BTREE_ITER_CACHED_NOCREATE|
                             BTREE_ITER_INTENT);
        b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
 
@@ -410,7 +614,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
         * Since journal reclaim depends on us making progress here, and the
         * allocator/copygc depend on journal reclaim making progress, we need
         * to be using alloc reserves:
-        * */
+        */
        ret   = bch2_btree_iter_traverse(&b_iter) ?:
                bch2_trans_update(trans, &b_iter, ck->k,
                                  BTREE_UPDATE_KEY_CACHE_RECLAIM|
@@ -421,16 +625,17 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_USE_RESERVE|
                                  (ck->journal.seq == journal_last_seq(j)
-                                  ? BTREE_INSERT_JOURNAL_RESERVED
+                                  ? JOURNAL_WATERMARK_reserved
                                   : 0)|
                                  commit_flags);
-       if (ret) {
-               bch2_fs_fatal_err_on(ret != -EINTR &&
-                                    ret != -EAGAIN &&
-                                    !bch2_journal_error(j), c,
-                       "error flushing key cache: %i", ret);
+
+       bch2_fs_fatal_err_on(ret &&
+                            !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+                            !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+                            !bch2_journal_error(j), c,
+                            "error flushing key cache: %s", bch2_err_str(ret));
+       if (ret)
                goto out;
-       }
 
        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);
@@ -443,24 +648,22 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                        atomic_long_dec(&c->btree_key_cache.nr_dirty);
                }
        } else {
+               struct btree_path *path2;
 evict:
-               BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
+               trans_for_each_path(trans, path2)
+                       if (path2 != c_iter.path)
+                               __bch2_btree_path_unlock(trans, path2);
 
-               mark_btree_node_unlocked(c_iter.path, 0);
-               c_iter.path->l[0].b = NULL;
-
-               six_lock_write(&ck->c.lock, NULL, NULL);
+               bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
 
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
                        atomic_long_dec(&c->btree_key_cache.nr_dirty);
                }
 
+               mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
                bkey_cached_evict(&c->btree_key_cache, ck);
-
-               mutex_lock(&c->btree_key_cache.lock);
-               bkey_cached_free(&c->btree_key_cache, ck);
-               mutex_unlock(&c->btree_key_cache.lock);
+               bkey_cached_free_fast(&c->btree_key_cache, ck);
        }
 out:
        bch2_trans_iter_exit(trans, &b_iter);
@@ -475,11 +678,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
+       struct btree_trans trans;
+       int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        int ret = 0;
 
-       int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+       bch2_trans_init(&trans, c, 0, 0);
 
-       six_lock_read(&ck->c.lock, NULL, NULL);
+       btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
        key = ck->key;
 
        if (ck->journal.seq != seq ||
@@ -489,12 +694,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
        }
        six_unlock_read(&ck->c.lock);
 
-       ret = bch2_trans_do(c, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                btree_key_cache_flush_pos(&trans, key, seq,
                                BTREE_INSERT_JOURNAL_RECLAIM, false));
 unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 
+       bch2_trans_exit(&trans);
        return ret;
 }
 
@@ -555,13 +761,26 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
        return true;
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
-                              enum btree_id id, struct bpos pos)
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+                              struct btree_path *path)
 {
-       BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
+       struct bch_fs *c = trans->c;
+       struct bkey_cached *ck = (void *) path->l[0].b;
+
+       BUG_ON(!ck->valid);
+
+       /*
+        * We just did an update to the btree, bypassing the key cache: the key
+        * cache key is now stale and must be dropped, even if dirty:
+        */
+       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+               clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+               atomic_long_dec(&c->btree_key_cache.nr_dirty);
+               bch2_journal_pin_drop(&c->journal, &ck->journal);
+       }
+
+       ck->valid = false;
 }
-#endif
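
bch2_btree_key_cache_drop() encodes one rule: when the backing btree is updated directly, the cached copy must be invalidated, and its dirty state and journal pin dropped rather than written back. A small standalone model of that rule, with invented names and an int standing in for the journal pin:

        #include <stdbool.h>
        #include <stdio.h>

        struct cached_key {
                bool valid;
                bool dirty;
                int  pin;       /* stands in for the journal pin */
        };

        /*
         * The backing tree was updated directly, bypassing the cache: whatever
         * the cache holds is now stale and must not be written back, even if
         * dirty.
         */
        static void cache_drop(struct cached_key *ck, int *nr_dirty)
        {
                if (ck->dirty) {
                        ck->dirty = false;
                        (*nr_dirty)--;
                        ck->pin = 0;    /* nothing left to flush for the journal */
                }
                ck->valid = false;
        }

        int main(void)
        {
                struct cached_key ck = { .valid = true, .dirty = true, .pin = 1 };
                int nr_dirty = 1;

                cache_drop(&ck, &nr_dirty);
                printf("valid=%d dirty=%d nr_dirty=%d\n", ck.valid, ck.dirty, nr_dirty);
                return 0;
        }
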
 
 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
@@ -575,12 +794,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
        unsigned start, flags;
        int srcu_idx;
 
-       /* Return -1 if we can't do anything right now */
-       if (sc->gfp_mask & __GFP_FS)
-               mutex_lock(&bc->lock);
-       else if (!mutex_trylock(&bc->lock))
-               return -1;
-
+       mutex_lock(&bc->lock);
        srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        flags = memalloc_nofs_save();
 
@@ -588,14 +802,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
         * Newest freed entries are at the end of the list - once we hit one
         * that's too new to be freed, we can bail out:
         */
-       list_for_each_entry_safe(ck, t, &bc->freed, list) {
+       list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
                        break;
 
                list_del(&ck->list);
+               six_lock_pcpu_free(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
-               bc->nr_freed--;
+               atomic_long_dec(&bc->nr_freed);
+               scanned++;
+               freed++;
+       }
+
+       if (scanned >= nr)
+               goto out;
+
+       list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+               if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+                                                ck->btree_trans_barrier_seq))
+                       break;
+
+               list_del(&ck->list);
+               six_lock_pcpu_free(&ck->c.lock);
+               kmem_cache_free(bch2_key_cache, ck);
+               atomic_long_dec(&bc->nr_freed);
                scanned++;
                freed++;
        }
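
The scan above frees only entries whose SRCU grace period has elapsed, and because entries were queued oldest-first it can stop at the first one that is still too new. A standalone sketch of that ordering argument, with a plain sequence number standing in for poll_state_synchronize_srcu():

        #include <stdio.h>

        #define N 5

        struct freed_obj {
                unsigned long barrier_seq;      /* grace period this object waits for */
                int freed;
        };

        /*
         * Objects were queued oldest-first, so barrier_seq values are
         * non-decreasing: once one is too new to free, the rest are too.
         */
        static int scan_freed(struct freed_obj *list, int n, unsigned long completed_seq)
        {
                int freed = 0;

                for (int i = 0; i < n; i++) {
                        if (list[i].barrier_seq > completed_seq)
                                break;          /* too new: bail out */
                        list[i].freed = 1;
                        freed++;
                }
                return freed;
        }

        int main(void)
        {
                struct freed_obj list[N] = { {1}, {2}, {4}, {7}, {9} };

                printf("freed %d of %d\n", scan_freed(list, N, 5), N);  /* frees 3 */
                return 0;
        }
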
@@ -668,23 +899,45 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
        struct bkey_cached *ck, *n;
        struct rhash_head *pos;
        unsigned i;
+#ifdef __KERNEL__
+       int cpu;
+#endif
 
        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);
 
        mutex_lock(&bc->lock);
 
-       rcu_read_lock();
-       tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-       if (tbl)
-               for (i = 0; i < tbl->size; i++)
-                       rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-                               bkey_cached_evict(bc, ck);
-                               list_add(&ck->list, &bc->freed);
-                       }
-       rcu_read_unlock();
+       /*
+        * The loop is needed to guard against racing with rehash:
+        */
+       while (atomic_long_read(&bc->nr_keys)) {
+               rcu_read_lock();
+               tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+               if (tbl)
+                       for (i = 0; i < tbl->size; i++)
+                               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+                                       bkey_cached_evict(bc, ck);
+                                       list_add(&ck->list, &bc->freed_nonpcpu);
+                               }
+               rcu_read_unlock();
+       }
+
+#ifdef __KERNEL__
+       for_each_possible_cpu(cpu) {
+               struct btree_key_cache_freelist *f =
+                       per_cpu_ptr(bc->pcpu_freed, cpu);
 
-       list_for_each_entry_safe(ck, n, &bc->freed, list) {
+               for (i = 0; i < f->nr; i++) {
+                       ck = f->objs[i];
+                       list_add(&ck->list, &bc->freed_nonpcpu);
+               }
+       }
+#endif
+
+       list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+       list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) {
                cond_resched();
 
                bch2_journal_pin_drop(&c->journal, &ck->journal);
@@ -692,53 +945,80 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 
                list_del(&ck->list);
                kfree(ck->k);
+               six_lock_pcpu_free(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
        }
 
-       BUG_ON(atomic_long_read(&bc->nr_dirty) &&
-              !bch2_journal_error(&c->journal) &&
-              test_bit(BCH_FS_WAS_RW, &c->flags));
-       BUG_ON(atomic_long_read(&bc->nr_keys));
+       if (atomic_long_read(&bc->nr_dirty) &&
+           !bch2_journal_error(&c->journal) &&
+           test_bit(BCH_FS_WAS_RW, &c->flags))
+               panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+                     atomic_long_read(&bc->nr_dirty));
+
+       if (atomic_long_read(&bc->nr_keys))
+               panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+                     atomic_long_read(&bc->nr_keys));
 
        mutex_unlock(&bc->lock);
 
        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
+
+       free_percpu(bc->pcpu_freed);
 }
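
The exit path drains the per-CPU freelists (and the freed_pcpu list) back into freed_nonpcpu before tearing everything down; bkey_cached_free_fast() is the producer side of those lists. A minimal single-threaded model of a per-CPU freelist that spills to and is drained into a global list, with arrays standing in for real per-CPU data:

        #include <stdio.h>

        #define NR_CPUS         4
        #define PCPU_BATCH      2

        struct freelist {
                int objs[PCPU_BATCH];
                int nr;
        };

        static struct freelist pcpu[NR_CPUS];
        static int global_list[64];
        static int global_nr;

        /* Free to this CPU's small list; spill one object to the global list when full. */
        static void free_fast(int cpu, int obj)
        {
                struct freelist *f = &pcpu[cpu];

                if (f->nr == PCPU_BATCH)
                        global_list[global_nr++] = f->objs[--f->nr];
                f->objs[f->nr++] = obj;
        }

        /* On shutdown, everything left on the per-CPU lists moves to the global list. */
        static void drain_all(void)
        {
                for (int cpu = 0; cpu < NR_CPUS; cpu++)
                        while (pcpu[cpu].nr)
                                global_list[global_nr++] = pcpu[cpu].objs[--pcpu[cpu].nr];
        }

        int main(void)
        {
                for (int i = 0; i < 6; i++)
                        free_fast(i % NR_CPUS, i);
                drain_all();
                printf("global list holds %d objects\n", global_nr);
                return 0;
        }
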
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
        mutex_init(&c->lock);
-       INIT_LIST_HEAD(&c->freed);
+       INIT_LIST_HEAD(&c->freed_pcpu);
+       INIT_LIST_HEAD(&c->freed_nonpcpu);
 }
 
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
+static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
 {
+       struct btree_key_cache *bc =
+               container_of(shrink, struct btree_key_cache, shrink);
+
+       bch2_btree_key_cache_to_text(out, bc);
+}
+
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
        int ret;
 
-       ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+#ifdef __KERNEL__
+       bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+       if (!bc->pcpu_freed)
+               return -ENOMEM;
+#endif
+
+       ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params);
        if (ret)
                return ret;
 
-       c->table_init_done = true;
+       bc->table_init_done = true;
 
-       c->shrink.seeks                 = 1;
-       c->shrink.count_objects         = bch2_btree_key_cache_count;
-       c->shrink.scan_objects          = bch2_btree_key_cache_scan;
-       return register_shrinker(&c->shrink);
+       bc->shrink.seeks                = 0;
+       bc->shrink.count_objects        = bch2_btree_key_cache_count;
+       bc->shrink.scan_objects         = bch2_btree_key_cache_scan;
+       bc->shrink.to_text              = bch2_btree_key_cache_shrinker_to_text;
+       return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name);
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-       pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
-       pr_buf(out, "nr_keys:\t%zu\n",  atomic_long_read(&c->nr_keys));
-       pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
+       prt_printf(out, "nr_freed:\t%zu",       atomic_long_read(&c->nr_freed));
+       prt_newline(out);
+       prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
+       prt_newline(out);
+       prt_printf(out, "nr_dirty:\t%lu",       atomic_long_read(&c->nr_dirty));
+       prt_newline(out);
 }
 
 void bch2_btree_key_cache_exit(void)
 {
-       if (bch2_key_cache)
-               kmem_cache_destroy(bch2_key_cache);
+       kmem_cache_destroy(bch2_key_cache);
 }
 
 int __init bch2_btree_key_cache_init(void)
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index b3d241b134539e545a44557afd7fb16ebe87f4cf..670746e72dabae9cb3d56a5cbe69360ca4fdbe7d 100644 (file)

@@ -32,14 +32,8 @@ bool bch2_btree_insert_key_cached(struct btree_trans *,
                        struct btree_path *, struct bkey_i *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
                               enum btree_id, struct bpos);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *,
-                               enum btree_id, struct bpos);
-#else
-static inline void
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
-                               enum btree_id id, struct bpos pos) {}
-#endif
+void bch2_btree_key_cache_drop(struct btree_trans *,
+                              struct btree_path *);
 
 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
new file mode 100644 (file)
index 0000000..9d09043
--- /dev/null
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+struct lock_class_key bch2_btree_node_lock_key;
+
+/* Btree node locking: */
+
+static inline void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+       if (lock->readers)
+               this_cpu_add(*lock->readers, nr);
+       else if (nr > 0)
+               atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
+       else
+               atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
+}
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+                                                 struct btree_path *skip,
+                                                 struct btree_bkey_cached_common *b,
+                                                 unsigned level)
+{
+       struct btree_path *path;
+       struct six_lock_count ret;
+
+       memset(&ret, 0, sizeof(ret));
+
+       if (IS_ERR_OR_NULL(b))
+               return ret;
+
+       trans_for_each_path(trans, path)
+               if (path != skip && &path->l[level].b->c == b) {
+                       int t = btree_node_locked_type(path, level);
+
+                       if (t != BTREE_NODE_UNLOCKED)
+                               ret.n[t]++;
+               }
+
+       return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+                       struct btree_path *path, struct btree *b)
+{
+       bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+       struct btree_trans              *trans;
+       struct btree_bkey_cached_common *node_want;
+       enum six_lock_type              lock_want;
+
+       /* for iterating over held locks: */
+       u8                              path_idx;
+       u8                              level;
+       u64                             lock_start_time;
+};
+
+struct lock_graph {
+       struct trans_waiting_for_lock   g[8];
+       unsigned                        nr;
+};
+
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       prt_printf(out, "Found lock cycle (%u entries):", g->nr);
+       prt_newline(out);
+
+       for (i = g->g; i < g->g + g->nr; i++)
+               bch2_btree_trans_to_text(out, i->trans);
+}
+
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       for (i = g->g; i != g->g + g->nr; i++) {
+               if (i != g->g)
+                       prt_str(out, "<- ");
+               prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+       }
+       prt_newline(out);
+}
+
+static void lock_graph_up(struct lock_graph *g)
+{
+       closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+       closure_get(&trans->ref);
+
+       g->g[g->nr++] = (struct trans_waiting_for_lock) {
+               .trans          = trans,
+               .node_want      = trans->locking,
+               .lock_want      = trans->locking_wait.lock_want,
+       };
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       for (i = g->g + 1; i < g->g + g->nr; i++)
+               if (i->trans->locking != i->node_want ||
+                   i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+                       while (g->g + g->nr > i)
+                               lock_graph_up(g);
+                       return true;
+               }
+
+       return false;
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+       if (i == g->g) {
+               trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+               return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+       } else {
+               i->trans->lock_must_abort = true;
+               wake_up_process(i->trans->locking_wait.task);
+               return 0;
+       }
+}
+
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+       if (trans->lock_may_not_fail)
+               return 0;
+       if (trans->locking_wait.lock_want == SIX_LOCK_write)
+               return 1;
+       if (!trans->in_traverse_all)
+               return 2;
+       return 3;
+}
+
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+       struct trans_waiting_for_lock *i, *abort = NULL;
+       unsigned best = 0, pref;
+       int ret;
+
+       if (lock_graph_remove_non_waiters(g))
+               return 0;
+
+       /* Only checking, for debugfs: */
+       if (cycle) {
+               print_cycle(cycle, g);
+               ret = -1;
+               goto out;
+       }
+
+       for (i = g->g; i < g->g + g->nr; i++) {
+               pref = btree_trans_abort_preference(i->trans);
+               if (pref > best) {
+                       abort = i;
+                       best = pref;
+               }
+       }
+
+       if (unlikely(!best)) {
+               struct bch_fs *c = g->g->trans->c;
+               struct printbuf buf = PRINTBUF;
+
+               bch_err(c, "cycle of nofail locks");
+
+               for (i = g->g; i < g->g + g->nr; i++) {
+                       struct btree_trans *trans = i->trans;
+
+                       bch2_btree_trans_to_text(&buf, trans);
+
+                       prt_printf(&buf, "backtrace:");
+                       prt_newline(&buf);
+                       printbuf_indent_add(&buf, 2);
+                       bch2_prt_backtrace(&buf, trans->locking_wait.task);
+                       printbuf_indent_sub(&buf, 2);
+                       prt_newline(&buf);
+               }
+
+               bch2_print_string_as_lines(KERN_ERR, buf.buf);
+               printbuf_exit(&buf);
+               BUG();
+       }
+
+       ret = abort_lock(g, abort);
+out:
+       if (ret)
+               while (g->nr)
+                       lock_graph_up(g);
+       return ret;
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+                             struct printbuf *cycle)
+{
+       struct btree_trans *orig_trans = g->g->trans;
+       struct trans_waiting_for_lock *i;
+
+       for (i = g->g; i < g->g + g->nr; i++)
+               if (i->trans == trans)
+                       return break_cycle(g, cycle);
+
+       if (g->nr == ARRAY_SIZE(g->g)) {
+               if (orig_trans->lock_may_not_fail)
+                       return 0;
+
+               while (g->nr)
+                       lock_graph_up(g);
+               trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+               return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+       }
+
+       lock_graph_down(g, trans);
+       return 0;
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+       return t1 + t2 > 1;
+}
+
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+       struct lock_graph g;
+       struct trans_waiting_for_lock *top;
+       struct btree_bkey_cached_common *b;
+       struct btree_path *path;
+       int ret;
+
+       if (trans->lock_must_abort) {
+               trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+       }
+
+       g.nr = 0;
+       lock_graph_down(&g, trans);
+next:
+       if (!g.nr)
+               return 0;
+
+       top = &g.g[g.nr - 1];
+
+       trans_for_each_path_from(top->trans, path, top->path_idx) {
+               if (!path->nodes_locked)
+                       continue;
+
+               if (top->path_idx != path->idx) {
+                       top->path_idx           = path->idx;
+                       top->level              = 0;
+                       top->lock_start_time    = 0;
+               }
+
+               for (;
+                    top->level < BTREE_MAX_DEPTH;
+                    top->level++, top->lock_start_time = 0) {
+                       int lock_held = btree_node_locked_type(path, top->level);
+
+                       if (lock_held == BTREE_NODE_UNLOCKED)
+                               continue;
+
+                       b = &READ_ONCE(path->l[top->level].b)->c;
+
+                       if (IS_ERR_OR_NULL(b)) {
+                               BUG_ON(!lock_graph_remove_non_waiters(&g));
+                               goto next;
+                       }
+
+                       if (list_empty_careful(&b->lock.wait_list))
+                               continue;
+
+                       raw_spin_lock(&b->lock.wait_lock);
+                       list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+                               BUG_ON(b != trans->locking);
+
+                               if (top->lock_start_time &&
+                                   time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+                                       continue;
+
+                               top->lock_start_time = trans->locking_wait.start_time;
+
+                               /* Don't check for self deadlock: */
+                               if (trans == top->trans ||
+                                   !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+                                       continue;
+
+                               ret = lock_graph_descend(&g, trans, cycle);
+                               raw_spin_unlock(&b->lock.wait_lock);
+
+                               if (ret)
+                                       return ret;
+                               goto next;
+
+                       }
+                       raw_spin_unlock(&b->lock.wait_lock);
+               }
+       }
+
+       if (g.nr > 1 && cycle)
+               print_chain(cycle, &g);
+       lock_graph_up(&g);
+       goto next;
+}
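
bch2_check_for_deadlock() walks the wait-for graph: follow "waits on a lock held by" edges from the current transaction, and if the walk reaches a transaction already on the path there is a cycle and a victim is chosen (see btree_trans_abort_preference() above). A compact standalone DFS over such a graph; integers stand in for transactions, and each blocked transaction is given a single outgoing edge here, which the real code does not assume:

        #include <stdio.h>

        #define NR_TRANS 4

        /* waits_on[t] = transaction that t is blocked on, or -1 if t is running. */
        static int waits_on[NR_TRANS] = { 1, 2, 0, -1 };        /* 0 -> 1 -> 2 -> 0: a cycle */

        /* Returns a transaction on a cycle reachable from @start, or -1 if none. */
        static int find_cycle(int start)
        {
                int on_path[NR_TRANS] = { 0 };
                int t = start;

                while (t != -1) {
                        if (on_path[t])
                                return t;       /* reached a transaction already on our path */
                        on_path[t] = 1;
                        t = waits_on[t];
                }
                return -1;
        }

        int main(void)
        {
                int victim = find_cycle(0);

                if (victim != -1)
                        printf("deadlock: abort transaction %d\n", victim);
                return 0;
        }
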
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+       struct btree_trans *trans = p;
+
+       return bch2_check_for_deadlock(trans, NULL);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+                                struct btree_bkey_cached_common *b,
+                                bool lock_may_not_fail)
+{
+       int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+       int ret;
+
+       /*
+        * Must drop our read locks before calling six_lock_write() -
+        * six_unlock() won't do wakeups until the reader count
+        * goes to 0, and it's safe because we have the node intent
+        * locked:
+        */
+       six_lock_readers_add(&b->lock, -readers);
+       ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
+       six_lock_readers_add(&b->lock, readers);
+
+       if (ret)
+               mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+       return ret;
+}
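
__bch2_btree_node_lock_write() subtracts its own read-lock count before blocking for the write lock and adds it back afterwards: a writer waits for the reader count to drain to zero, so a thread that kept counting its own reads would wait on itself. A tiny illustration with an atomic reader count; the real six lock is considerably more involved than this:

        #include <stdatomic.h>
        #include <stdio.h>

        struct toy_lock {
                atomic_int readers;
        };

        /* "Block" (here: fail) unless the reader count is zero. */
        static int try_write_lock(struct toy_lock *l)
        {
                return atomic_load(&l->readers) == 0 ? 0 : -1;
        }

        /*
         * Take the write lock while we ourselves hold @own_readers read locks:
         * temporarily uncount our own reads so we don't wait for ourselves.
         */
        static int write_lock_holding_reads(struct toy_lock *l, int own_readers)
        {
                atomic_fetch_sub(&l->readers, own_readers);
                int ret = try_write_lock(l);
                atomic_fetch_add(&l->readers, own_readers);
                return ret;
        }

        int main(void)
        {
                struct toy_lock l = { .readers = 2 };   /* both read locks are ours */

                printf("naive attempt:  %d\n", try_write_lock(&l));               /* -1: would deadlock */
                printf("subtract first: %d\n", write_lock_holding_reads(&l, 2));  /*  0: succeeds */
                return 0;
        }
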
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+                                       struct btree_path *path,
+                                       bool upgrade)
+{
+       unsigned l = path->level;
+       int fail_idx = -1;
+
+       do {
+               if (!btree_path_node(path, l))
+                       break;
+
+               if (!(upgrade
+                     ? bch2_btree_node_upgrade(trans, path, l)
+                     : bch2_btree_node_relock(trans, path, l)))
+                       fail_idx = l;
+
+               l++;
+       } while (l < path->locks_want);
+
+       /*
+        * When we fail to get a lock, we have to ensure that any child nodes
+        * can't be relocked so bch2_btree_path_traverse has to walk back up to
+        * the node that we failed to relock:
+        */
+       if (fail_idx >= 0) {
+               __bch2_btree_path_unlock(trans, path);
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+               do {
+                       path->l[fail_idx].b = upgrade
+                               ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+                               : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+                       --fail_idx;
+               } while (fail_idx >= 0);
+       }
+
+       if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+               path->uptodate = BTREE_ITER_UPTODATE;
+
+       bch2_trans_verify_locks(trans);
+
+       return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+                             struct btree_path *path, unsigned level,
+                             bool trace)
+{
+       struct btree *b = btree_path_node(path, level);
+       int want = __btree_lock_want(path, level);
+
+       if (race_fault())
+               goto fail;
+
+       if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+           (btree_node_lock_seq_matches(path, b, level) &&
+            btree_node_lock_increment(trans, &b->c, level, want))) {
+               mark_btree_node_locked(trans, path, level, want);
+               return true;
+       }
+fail:
+       if (trace)
+               trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+       return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+                            struct btree_path *path, unsigned level)
+{
+       struct btree *b = path->l[level].b;
+       struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+       if (!is_btree_node(path, level))
+               return false;
+
+       switch (btree_lock_want(path, level)) {
+       case BTREE_NODE_UNLOCKED:
+               BUG_ON(btree_node_locked(path, level));
+               return true;
+       case BTREE_NODE_READ_LOCKED:
+               BUG_ON(btree_node_intent_locked(path, level));
+               return bch2_btree_node_relock(trans, path, level);
+       case BTREE_NODE_INTENT_LOCKED:
+               break;
+       case BTREE_NODE_WRITE_LOCKED:
+               BUG();
+       }
+
+       if (btree_node_intent_locked(path, level))
+               return true;
+
+       if (race_fault())
+               return false;
+
+       if (btree_node_locked(path, level)) {
+               bool ret;
+
+               six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+               ret = six_lock_tryupgrade(&b->c.lock);
+               six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+               if (ret)
+                       goto success;
+       } else {
+               if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+                       goto success;
+       }
+
+       /*
+        * Do we already have an intent lock via another path? If so, just bump
+        * lock count:
+        */
+       if (btree_node_lock_seq_matches(path, b, level) &&
+           btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+               btree_node_unlock(trans, path, level);
+               goto success;
+       }
+
+       trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+       return false;
+success:
+       mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
+       return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+                                 struct btree_path *path)
+{
+       unsigned l;
+
+       for (l = path->level;
+            l < path->locks_want && btree_path_node(path, l);
+            l++) {
+               if (!bch2_btree_node_relock(trans, path, l)) {
+                       __bch2_btree_path_unlock(trans, path);
+                       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+                       trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+               }
+       }
+
+       return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+                       struct btree_path *path, unsigned long trace_ip)
+{
+       return btree_path_get_locks(trans, path, false);
+}
+
+__flatten
+bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+                       struct btree_path *path, unsigned long trace_ip)
+{
+       return btree_path_get_locks(trans, path, true);
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+                              struct btree_path *path,
+                              unsigned new_locks_want)
+{
+       EBUG_ON(path->locks_want >= new_locks_want);
+
+       path->locks_want = new_locks_want;
+
+       return btree_path_get_locks(trans, path, true);
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+                              struct btree_path *path,
+                              unsigned new_locks_want)
+{
+       struct btree_path *linked;
+
+       if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+               return true;
+
+       /*
+        * XXX: this is ugly - we'd prefer to not be mucking with other
+        * iterators in the btree_trans here.
+        *
+        * On failure to upgrade the iterator, setting iter->locks_want and
+        * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+        * get the locks we want on transaction restart.
+        *
+        * But if this iterator was a clone, on transaction restart what we did
+        * to this iterator isn't going to be preserved.
+        *
+        * Possibly we could add an iterator field for the parent iterator when
+        * an iterator is a copy - for now, we'll just upgrade any other
+        * iterators with the same btree id.
+        *
+        * The code below used to be needed to ensure ancestor nodes get locked
+        * before interior nodes - now that's handled by
+        * bch2_btree_path_traverse_all().
+        */
+       if (!path->cached && !trans->in_traverse_all)
+               trans_for_each_path(trans, linked)
+                       if (linked != path &&
+                           linked->cached == path->cached &&
+                           linked->btree_id == path->btree_id &&
+                           linked->locks_want < new_locks_want) {
+                               linked->locks_want = new_locks_want;
+                               btree_path_get_locks(trans, linked, true);
+                       }
+
+       return false;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+                                struct btree_path *path,
+                                unsigned new_locks_want)
+{
+       unsigned l;
+
+       EBUG_ON(path->locks_want < new_locks_want);
+
+       path->locks_want = new_locks_want;
+
+       while (path->nodes_locked &&
+              (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+               if (l > path->level) {
+                       btree_node_unlock(trans, path, l);
+               } else {
+                       if (btree_node_intent_locked(path, l)) {
+                               six_lock_downgrade(&path->l[l].b->c.lock);
+                               mark_btree_node_locked_noreset(path, l, SIX_LOCK_read);
+                       }
+                       break;
+               }
+       }
+
+       bch2_btree_path_verify_locks(path);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               bch2_btree_path_downgrade(trans, path);
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       if (unlikely(trans->restarted))
+               return -((int) trans->restarted);
+
+       trans_for_each_path(trans, path)
+               if (path->should_be_locked &&
+                   !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+                       trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+               }
+       return 0;
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               __bch2_btree_path_unlock(trans, path);
+
+       /*
+        * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
+        * btree nodes, it implements its own walking:
+        */
+       EBUG_ON(!trans->is_initial_gc &&
+               lock_class_is_held(&bch2_btree_node_lock_key));
+}
+
+bool bch2_trans_locked(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               if (path->nodes_locked)
+                       return true;
+       return false;
+}
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+       unsigned l;
+
+       if (!path->nodes_locked) {
+               BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+                      btree_path_node(path, path->level));
+               return;
+       }
+
+       for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+               int want = btree_lock_want(path, l);
+               int have = btree_node_locked_type(path, l);
+
+               BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+               BUG_ON(is_btree_node(path, l) &&
+                      (want == BTREE_NODE_UNLOCKED ||
+                       have != BTREE_NODE_WRITE_LOCKED) &&
+                      want != have);
+       }
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index b4434eca0746c7635c10b291534f526661dc4cf6..bf8d1880673b224a09390e911dc50dfff03bb3b5 100644 (file)
 
 #include "btree_iter.h"
 
+extern struct lock_class_key bch2_btree_node_lock_key;
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{
+       return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+       return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+               ? &trans->c->btree_transaction_stats[trans->fn_idx]
+               : NULL;
+}
+
 /* matches six lock types */
 enum btree_node_locked_type {
        BTREE_NODE_UNLOCKED             = -1,
        BTREE_NODE_READ_LOCKED          = SIX_LOCK_read,
        BTREE_NODE_INTENT_LOCKED        = SIX_LOCK_intent,
+       BTREE_NODE_WRITE_LOCKED         = SIX_LOCK_write,
 };
 
 static inline int btree_node_locked_type(struct btree_path *path,
                                         unsigned level)
 {
-       /*
-        * We're relying on the fact that if nodes_intent_locked is set
-        * nodes_locked must be set as well, so that we can compute without
-        * branches:
-        */
-       return BTREE_NODE_UNLOCKED +
-               ((path->nodes_locked >> level) & 1) +
-               ((path->nodes_intent_locked >> level) & 1);
+       return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
 }
 
-static inline bool btree_node_intent_locked(struct btree_path *path,
-                                           unsigned level)
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
 {
-       return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
+       return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
 }
 
-static inline bool btree_node_read_locked(struct btree_path *path,
-                                         unsigned level)
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
 {
-       return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
+       return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
 }
 
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
 {
-       return path->nodes_locked & (1 << level);
+       return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
 }
 
-static inline void mark_btree_node_unlocked(struct btree_path *path,
-                                           unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
 {
-       path->nodes_locked &= ~(1 << level);
-       path->nodes_intent_locked &= ~(1 << level);
+       return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
 }
 
-static inline void mark_btree_node_locked(struct btree_path *path,
-                                         unsigned level,
-                                         enum six_lock_type type)
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+                                                 unsigned level,
+                                                 enum btree_node_locked_type type)
 {
        /* relying on this to avoid a branch */
        BUILD_BUG_ON(SIX_LOCK_read   != 0);
        BUILD_BUG_ON(SIX_LOCK_intent != 1);
 
-       path->nodes_locked |= 1 << level;
-       path->nodes_intent_locked |= type << level;
+       path->nodes_locked &= ~(3U << (level << 1));
+       path->nodes_locked |= (type + 1) << (level << 1);
 }
 
-static inline void mark_btree_node_intent_locked(struct btree_path *path,
-                                                unsigned level)
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+                                           unsigned level)
+{
+       EBUG_ON(btree_node_write_locked(path, level));
+       mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
+}
+
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+                                         struct btree_path *path,
+                                         unsigned level,
+                                         enum six_lock_type type)
 {
-       mark_btree_node_locked(path, level, SIX_LOCK_intent);
+       mark_btree_node_locked_noreset(path, level, type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+       path->l[level].lock_taken_time = local_clock();
+#endif
 }
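
nodes_locked is now two bits per btree level (unlocked/read/intent/write) rather than two separate one-bit-per-level masks. A standalone demonstration of the same packing, including ffs/fls-style helpers like the lowest/highest-level-locked ones defined further down; the names are illustrative only:

        #include <stdio.h>

        enum lock_type { UNLOCKED = -1, READ = 0, INTENT = 1, WRITE = 2 };

        static unsigned nodes_locked;   /* 2 bits per level, as in struct btree_path */

        static int locked_type(unsigned level)
        {
                return UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
        }

        static void set_locked(unsigned level, enum lock_type t)
        {
                nodes_locked &= ~(3U << (level << 1));
                nodes_locked |= (t + 1) << (level << 1);
        }

        /* Both helpers assume nodes_locked != 0, like the kernel's __ffs()/__fls(). */
        static int lowest_level_locked(void)
        {
                return (__builtin_ffs(nodes_locked) - 1) >> 1;
        }

        static int highest_level_locked(void)
        {
                return (31 - __builtin_clz(nodes_locked)) >> 1;
        }

        int main(void)
        {
                set_locked(0, INTENT);
                set_locked(2, READ);

                printf("level 0: %d, level 1: %d, level 2: %d\n",
                       locked_type(0), locked_type(1), locked_type(2));  /* 1 -1 0 */
                printf("lowest %d, highest %d\n",
                       lowest_level_locked(), highest_level_locked());   /* 0 2 */
                return 0;
        }
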
 
 static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
@@ -95,161 +111,308 @@ btree_lock_want(struct btree_path *path, int level)
        return BTREE_NODE_UNLOCKED;
 }
 
-static inline void btree_node_unlock(struct btree_path *path, unsigned level)
+static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+                                             struct btree_path *path, unsigned level)
+{
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+       struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+       if (s)
+               __bch2_time_stats_update(&s->lock_hold_times,
+                                        path->l[level].lock_taken_time,
+                                        local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+                                    struct btree_path *path, unsigned level)
 {
        int lock_type = btree_node_locked_type(path, level);
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
-       if (lock_type != BTREE_NODE_UNLOCKED)
+       if (lock_type != BTREE_NODE_UNLOCKED) {
                six_unlock_type(&path->l[level].b->c.lock, lock_type);
+               btree_trans_lock_hold_time_update(trans, path, level);
+       }
        mark_btree_node_unlocked(path, level);
 }
 
-static inline void __bch2_btree_path_unlock(struct btree_path *path)
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
+{
+       return __ffs(path->nodes_locked) >> 1;
+}
+
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+       return __fls(path->nodes_locked) >> 1;
+}
+
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+                                           struct btree_path *path)
 {
        btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
 
        while (path->nodes_locked)
-               btree_node_unlock(path, __ffs(path->nodes_locked));
+               btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
 }
 
-static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+                                    struct btree *b)
 {
-       switch (type) {
-       case SIX_LOCK_read:
-               return BCH_TIME_btree_lock_contended_read;
-       case SIX_LOCK_intent:
-               return BCH_TIME_btree_lock_contended_intent;
-       case SIX_LOCK_write:
-               return BCH_TIME_btree_lock_contended_write;
-       default:
-               BUG();
-       }
+       struct btree_path *linked;
+
+       EBUG_ON(path->l[b->c.level].b != b);
+       EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+       EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+       mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
+
+       trans_for_each_path_with_node(trans, b, linked)
+               linked->l[b->c.level].lock_seq += 2;
+
+       six_unlock_write(&b->c.lock);
 }
 
-static inline bool btree_node_lock_type(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      struct btree *b,
-                                      struct bpos pos, unsigned level,
-                                      enum six_lock_type type,
-                                      six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       struct bch_fs *c = trans->c;
-       u64 start_time;
-       bool ret;
+void bch2_btree_node_unlock_write(struct btree_trans *,
+                       struct btree_path *, struct btree *);
 
-       if (six_trylock_type(&b->c.lock, type))
-               return true;
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
 
-       start_time = local_clock();
+/* lock: */
 
-       trans->locking_path_idx = path->idx;
-       trans->locking_pos      = pos;
-       trans->locking_btree_id = path->btree_id;
-       trans->locking_level    = level;
-       trans->locking_lock_type = type;
-       trans->locking          = b;
-       ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
-       trans->locking = NULL;
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+                                        struct btree_bkey_cached_common *b,
+                                        enum six_lock_type type,
+                                        bool lock_may_not_fail)
+{
+       int ret;
 
-       if (ret)
-               bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+       trans->lock_may_not_fail = lock_may_not_fail;
+       trans->lock_must_abort  = false;
+       trans->locking          = b;
 
+       ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+                                  bch2_six_check_for_deadlock, trans);
+       WRITE_ONCE(trans->locking, NULL);
+       WRITE_ONCE(trans->locking_wait.start_time, 0);
        return ret;
 }
 
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+                      struct btree_bkey_cached_common *b,
+                      enum six_lock_type type)
+{
+       return __btree_node_lock_nopath(trans, b, type, false);
+}
+
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+                                        struct btree_bkey_cached_common *b,
+                                        enum six_lock_type type)
+{
+       int ret = __btree_node_lock_nopath(trans, b, type, true);
+
+       BUG_ON(ret);
+}
+
 /*
  * Lock a btree node if we already have it locked on one of our linked
  * iterators:
  */
 static inline bool btree_node_lock_increment(struct btree_trans *trans,
-                                            struct btree *b, unsigned level,
+                                            struct btree_bkey_cached_common *b,
+                                            unsigned level,
                                             enum btree_node_locked_type want)
 {
        struct btree_path *path;
 
        trans_for_each_path(trans, path)
-               if (path->l[level].b == b &&
+               if (&path->l[level].b->c == b &&
                    btree_node_locked_type(path, level) >= want) {
-                       six_lock_increment(&b->c.lock, want);
+                       six_lock_increment(&b->lock, want);
                        return true;
                }
 
        return false;
 }
 
-bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
-                           struct btree *, struct bpos, unsigned,
-                           enum six_lock_type,
-                           six_lock_should_sleep_fn, void *,
-                           unsigned long);
-
-static inline bool btree_node_lock(struct btree_trans *trans,
+static inline int btree_node_lock(struct btree_trans *trans,
                        struct btree_path *path,
-                       struct btree *b, struct bpos pos, unsigned level,
+                       struct btree_bkey_cached_common *b,
+                       unsigned level,
                        enum six_lock_type type,
-                       six_lock_should_sleep_fn should_sleep_fn, void *p,
                        unsigned long ip)
 {
+       int ret = 0;
+
        EBUG_ON(level >= BTREE_MAX_DEPTH);
        EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
 
-       return likely(six_trylock_type(&b->c.lock, type)) ||
-               btree_node_lock_increment(trans, b, level, type) ||
-               __bch2_btree_node_lock(trans, path, b, pos, level, type,
-                                      should_sleep_fn, p, ip);
+       if (likely(six_trylock_type(&b->lock, type)) ||
+           btree_node_lock_increment(trans, b, level, type) ||
+           !(ret = btree_node_lock_nopath(trans, b, type))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+               path->l[b->level].lock_taken_time = local_clock();
+#endif
+       }
+
+       return ret;
 }
 
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+                                struct btree_bkey_cached_common *b, bool);
+
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+                                         struct btree_path *path,
+                                         struct btree_bkey_cached_common *b,
+                                         bool lock_may_not_fail)
+{
+       EBUG_ON(&path->l[b->level].b->c != b);
+       EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
+       EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+       /*
+        * six locks are unfair, and read locks block while a thread wants a
+        * write lock: thus, we need to tell the cycle detector we have a write
+        * lock _before_ taking the lock:
+        */
+       mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
+
+       return likely(six_trylock_write(&b->lock))
+               ? 0
+               : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+                                             struct btree_path *path,
+                                             struct btree_bkey_cached_common *b)
+{
+       int ret = __btree_node_lock_write(trans, path, b, true);
+       BUG_ON(ret);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+                          struct btree_path *path,
+                          struct btree_bkey_cached_common *b)
+{
+       return __btree_node_lock_write(trans, path, b, false);
+}
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+                                     struct btree_path *, unsigned long);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
 
 static inline bool bch2_btree_node_relock(struct btree_trans *trans,
                                          struct btree_path *path, unsigned level)
 {
        EBUG_ON(btree_node_locked(path, level) &&
-               btree_node_locked_type(path, level) !=
-               __btree_lock_want(path, level));
+               !btree_node_write_locked(path, level) &&
+               btree_node_locked_type(path, level) != __btree_lock_want(path, level));
 
        return likely(btree_node_locked(path, level)) ||
-               __bch2_btree_node_relock(trans, path, level);
+               (!IS_ERR_OR_NULL(path->l[level].b) &&
+                __bch2_btree_node_relock(trans, path, level, true));
 }
 
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
-                                    struct btree *b)
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+                                                 struct btree_path *path, unsigned level)
 {
-       struct btree_path *linked;
+       EBUG_ON(btree_node_locked(path, level) &&
+               !btree_node_write_locked(path, level) &&
+               btree_node_locked_type(path, level) != __btree_lock_want(path, level));
 
-       EBUG_ON(path->l[b->c.level].b != b);
-       EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+       return likely(btree_node_locked(path, level)) ||
+               (!IS_ERR_OR_NULL(path->l[level].b) &&
+                __bch2_btree_node_relock(trans, path, level, false));
+}
 
-       trans_for_each_path_with_node(trans, b, linked)
-               linked->l[b->c.level].lock_seq += 2;
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+                               struct btree_path *path, unsigned long trace_ip)
+{
+       if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+               trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+       }
 
-       six_unlock_write(&b->c.lock);
+       return 0;
 }
 
-void bch2_btree_node_unlock_write(struct btree_trans *,
-                       struct btree_path *, struct btree *);
+/* upgrade */
 
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+                              struct btree_path *, unsigned);
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+                              struct btree_path *, unsigned);
 
-static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
-                                             struct btree_path *path,
-                                             struct btree *b)
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+                                         struct btree_path *path,
+                                         unsigned new_locks_want)
 {
-       EBUG_ON(path->l[b->c.level].b != b);
-       EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
-       EBUG_ON(!btree_node_intent_locked(path, b->c.level));
+       unsigned old_locks_want = path->locks_want;
+
+       new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+       if (path->locks_want < new_locks_want
+           ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+           : path->uptodate == BTREE_ITER_UPTODATE)
+               return 0;
 
-       if (unlikely(!six_trylock_write(&b->c.lock)))
-               __bch2_btree_node_lock_write(trans, b);
+       trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+                       old_locks_want, new_locks_want);
+       return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
 }
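Sketch of the new upgrade calling convention (example_upgrade() is a placeholder, not from this patch): on failure the helper has already scheduled a transaction restart, so the caller only needs to propagate the error.

static int example_upgrade(struct btree_trans *trans, struct btree_path *path)
{
	/* Request intent locks on the leaf and its parent (levels 0 and 1): */
	int ret = bch2_btree_path_upgrade(trans, path, 2);

	if (ret)		/* -BCH_ERR_transaction_restart_upgrade */
		return ret;

	/* path->locks_want >= 2 and the path is still BTREE_ITER_UPTODATE */
	return 0;
}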
 
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_path *path)
+{
+       EBUG_ON(!btree_node_locked(path, path->level));
+       EBUG_ON(path->uptodate);
+
+       path->should_be_locked = true;
+}
 
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+                                     struct btree_path *path,
+                                     unsigned l)
+{
+       btree_node_unlock(trans, path, l);
+       path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
 
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+                                   struct btree_path *path)
+{
+       __btree_path_set_level_up(trans, path, path->level++);
+       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/* debug */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+                               struct btree_path *,
+                               struct btree_bkey_cached_common *b,
+                               unsigned);
+
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
index 68272f26f0171f889b3986a981ddb9c771c517f7..892d1231755164dede6deafd9b61ad65148d9919 100644 (file)
@@ -6,8 +6,9 @@
 #include <linux/rhashtable.h>
 #include <linux/six.h>
 
-#include "bkey_methods.h"
+//#include "bkey_methods.h"
 #include "buckets_types.h"
+#include "darray.h"
 #include "journal_types.h"
 
 struct open_bucket;
@@ -62,6 +63,7 @@ struct btree_bkey_cached_common {
        struct six_lock         lock;
        u8                      level;
        u8                      btree_id;
+       bool                    cached;
 };
 
 struct btree {
@@ -152,11 +154,22 @@ struct btree_cache {
        struct mutex            lock;
        struct list_head        live;
        struct list_head        freeable;
-       struct list_head        freed;
+       struct list_head        freed_pcpu;
+       struct list_head        freed_nonpcpu;
 
        /* Number of elements in live + freeable lists */
        unsigned                used;
        unsigned                reserve;
+       unsigned                freed;
+       unsigned                not_freed_lock_intent;
+       unsigned                not_freed_lock_write;
+       unsigned                not_freed_dirty;
+       unsigned                not_freed_read_in_flight;
+       unsigned                not_freed_write_in_flight;
+       unsigned                not_freed_noevict;
+       unsigned                not_freed_write_blocked;
+       unsigned                not_freed_will_make_reachable;
+       unsigned                not_freed_access_bit;
        atomic_t                dirty;
        struct shrinker         shrink;
 
@@ -180,22 +193,16 @@ struct btree_node_iter {
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
 #define BTREE_ITER_SLOTS               (1 << 0)
+#define BTREE_ITER_ALL_LEVELS          (1 << 1)
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-#define BTREE_ITER_INTENT              (1 << 1)
+#define BTREE_ITER_INTENT              (1 << 2)
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-#define BTREE_ITER_PREFETCH            (1 << 2)
-/*
- * Indicates that this iterator should not be reused until transaction commit,
- * either because a pending update references it or because the update depends
- * on that particular key being locked (e.g. by the str_hash code, for hash
- * table consistency)
- */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT   (1 << 3)
+#define BTREE_ITER_PREFETCH            (1 << 3)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
@@ -203,15 +210,13 @@ struct btree_node_iter {
 #define BTREE_ITER_IS_EXTENTS          (1 << 4)
 #define BTREE_ITER_NOT_EXTENTS         (1 << 5)
 #define BTREE_ITER_CACHED              (1 << 6)
-#define BTREE_ITER_CACHED_NOFILL       (1 << 7)
-#define BTREE_ITER_CACHED_NOCREATE     (1 << 8)
-#define BTREE_ITER_WITH_KEY_CACHE      (1 << 9)
-#define BTREE_ITER_WITH_UPDATES                (1 << 10)
-#define BTREE_ITER_WITH_JOURNAL                (1 << 11)
-#define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 12)
-#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 13)
-#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 14)
-#define BTREE_ITER_NOPRESERVE          (1 << 15)
+#define BTREE_ITER_WITH_KEY_CACHE      (1 << 7)
+#define BTREE_ITER_WITH_UPDATES                (1 << 8)
+#define BTREE_ITER_WITH_JOURNAL                (1 << 9)
+#define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 10)
+#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 11)
+#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 12)
+#define BTREE_ITER_NOPRESERVE          (1 << 13)
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -219,15 +224,6 @@ enum btree_path_uptodate {
        BTREE_ITER_NEED_TRAVERSE        = 2,
 };
 
-#define BTREE_ITER_NO_NODE_GET_LOCKS   ((struct btree *) 1)
-#define BTREE_ITER_NO_NODE_DROP                ((struct btree *) 2)
-#define BTREE_ITER_NO_NODE_LOCK_ROOT   ((struct btree *) 3)
-#define BTREE_ITER_NO_NODE_UP          ((struct btree *) 4)
-#define BTREE_ITER_NO_NODE_DOWN                ((struct btree *) 5)
-#define BTREE_ITER_NO_NODE_INIT                ((struct btree *) 6)
-#define BTREE_ITER_NO_NODE_ERROR       ((struct btree *) 7)
-#define BTREE_ITER_NO_NODE_CACHED      ((struct btree *) 8)
-
 struct btree_path {
        u8                      idx;
        u8                      sorted_idx;
@@ -247,14 +243,16 @@ struct btree_path {
         */
        bool                    should_be_locked:1;
        unsigned                level:3,
-                               locks_want:4,
-                               nodes_locked:4,
-                               nodes_intent_locked:4;
+                               locks_want:4;
+       u8                      nodes_locked;
 
        struct btree_path_level {
                struct btree    *b;
                struct btree_node_iter iter;
                u32             lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+               u64             lock_taken_time;
+#endif
        }                       l[BTREE_MAX_DEPTH];
 #ifdef CONFIG_BCACHEFS_DEBUG
        unsigned long           ip_allocated;
@@ -280,7 +278,8 @@ struct btree_iter {
        struct btree_path       *key_cache_path;
 
        enum btree_id           btree_id:4;
-       unsigned                min_depth:4;
+       unsigned                min_depth:3;
+       unsigned                advanced:1;
 
        /* btree_iter_copy starts here: */
        u16                     flags;
@@ -295,20 +294,31 @@ struct btree_iter {
         * bch2_btree_iter_next_slot() can correctly advance pos.
         */
        struct bkey             k;
+
+       /* BTREE_ITER_WITH_JOURNAL: */
+       size_t                  journal_idx;
+       struct bpos             journal_pos;
 #ifdef CONFIG_BCACHEFS_DEBUG
        unsigned long           ip_allocated;
 #endif
 };
 
+struct btree_key_cache_freelist {
+       struct bkey_cached      *objs[16];
+       unsigned                nr;
+};
+
 struct btree_key_cache {
        struct mutex            lock;
        struct rhashtable       table;
        bool                    table_init_done;
-       struct list_head        freed;
+       struct list_head        freed_pcpu;
+       struct list_head        freed_nonpcpu;
        struct shrinker         shrink;
        unsigned                shrink_iter;
+       struct btree_key_cache_freelist __percpu *pcpu_freed;
 
-       size_t                  nr_freed;
+       atomic_long_t           nr_freed;
        atomic_long_t           nr_keys;
        atomic_long_t           nr_dirty;
 };
@@ -325,7 +335,7 @@ struct bkey_cached {
        struct btree_bkey_cached_common c;
 
        unsigned long           flags;
-       u8                      u64s;
+       u16                     u64s;
        bool                    valid;
        u32                     btree_trans_barrier_seq;
        struct bkey_cached_key  key;
@@ -339,16 +349,32 @@ struct bkey_cached {
        struct bkey_i           *k;
 };
 
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+       return !b->cached
+               ? container_of(b, struct btree, c)->key.k.p
+               : container_of(b, struct bkey_cached, c)->key.pos;
+}
+
 struct btree_insert_entry {
        unsigned                flags;
        u8                      bkey_type;
        enum btree_id           btree_id:8;
-       u8                      level;
+       u8                      level:4;
        bool                    cached:1;
        bool                    insert_trigger_run:1;
        bool                    overwrite_trigger_run:1;
+       bool                    key_cache_already_flushed:1;
+       /*
+        * @old_k may be a key from the journal; @old_btree_u64s always refers
+        * to the size of the key being overwritten in the btree:
+        */
+       u8                      old_btree_u64s;
        struct bkey_i           *k;
        struct btree_path       *path;
+       /* key being overwritten: */
+       struct bkey             old_k;
+       const struct bch_val    *old_v;
        unsigned long           ip_allocated;
 };
 
@@ -366,36 +392,48 @@ struct btree_trans_commit_hook {
        struct btree_trans_commit_hook  *next;
 };
 
-#define BTREE_TRANS_MEM_MAX    (1U << 14)
+#define BTREE_TRANS_MEM_MAX    (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS      10000
 
 struct btree_trans {
        struct bch_fs           *c;
        const char              *fn;
+       struct closure          ref;
        struct list_head        list;
-       struct btree            *locking;
-       unsigned                locking_path_idx;
-       struct bpos             locking_pos;
-       u8                      locking_btree_id;
-       u8                      locking_level;
-       u8                      locking_lock_type;
-       pid_t                   pid;
+       u64                     last_begin_time;
+
+       u8                      lock_may_not_fail;
+       u8                      lock_must_abort;
+       struct btree_bkey_cached_common *locking;
+       struct six_lock_waiter  locking_wait;
+
        int                     srcu_idx;
 
+       u8                      fn_idx;
        u8                      nr_sorted;
        u8                      nr_updates;
+       u8                      traverse_all_idx;
        bool                    used_mempool:1;
        bool                    in_traverse_all:1;
-       bool                    restarted:1;
-       bool                    journal_transaction_names:1;
+       bool                    memory_allocation_failure:1;
+       bool                    is_initial_gc:1;
+       bool                    journal_replay_not_finished:1;
+       enum bch_errcode        restarted:16;
+       u32                     restart_count;
+       unsigned long           last_restarted_ip;
+
        /*
         * For when bch2_trans_update notices we'll be splitting a compressed
         * extent:
         */
        unsigned                extra_journal_res;
+       unsigned                nr_max_paths;
 
        u64                     paths_allocated;
 
        unsigned                mem_top;
+       unsigned                mem_max;
        unsigned                mem_bytes;
        void                    *mem;
 
@@ -405,8 +443,7 @@ struct btree_trans {
 
        /* update path: */
        struct btree_trans_commit_hook *hooks;
-       struct jset_entry       *extra_journal_entries;
-       unsigned                extra_journal_entry_u64s;
+       DARRAY(u64)             extra_journal_entries;
        struct journal_entry_pin *journal_pin;
 
        struct journal_res      journal_res;
@@ -419,7 +456,31 @@ struct btree_trans {
        struct replicas_delta_list *fs_usage_deltas;
 };
 
-#define BTREE_FLAG(flag)                                               \
+#define BTREE_FLAGS()                                                  \
+       x(read_in_flight)                                               \
+       x(read_error)                                                   \
+       x(dirty)                                                        \
+       x(need_write)                                                   \
+       x(write_blocked)                                                \
+       x(will_make_reachable)                                          \
+       x(noevict)                                                      \
+       x(write_idx)                                                    \
+       x(accessed)                                                     \
+       x(write_in_flight)                                              \
+       x(write_in_flight_inner)                                        \
+       x(just_written)                                                 \
+       x(dying)                                                        \
+       x(fake)                                                         \
+       x(need_rewrite)                                                 \
+       x(never_write)
+
+enum btree_flags {
+#define x(flag)        BTREE_NODE_##flag,
+       BTREE_FLAGS()
+#undef x
+};
+
+#define x(flag)                                                                \
 static inline bool btree_node_ ## flag(struct btree *b)                        \
 {      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
                                                                        \
@@ -429,36 +490,8 @@ static inline void set_btree_node_ ## flag(struct btree *b)                \
 static inline void clear_btree_node_ ## flag(struct btree *b)          \
 {      clear_bit(BTREE_NODE_ ## flag, &b->flags); }
 
-enum btree_flags {
-       BTREE_NODE_read_in_flight,
-       BTREE_NODE_read_error,
-       BTREE_NODE_dirty,
-       BTREE_NODE_need_write,
-       BTREE_NODE_noevict,
-       BTREE_NODE_write_idx,
-       BTREE_NODE_accessed,
-       BTREE_NODE_write_in_flight,
-       BTREE_NODE_write_in_flight_inner,
-       BTREE_NODE_just_written,
-       BTREE_NODE_dying,
-       BTREE_NODE_fake,
-       BTREE_NODE_need_rewrite,
-       BTREE_NODE_never_write,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(write_in_flight_inner);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
-BTREE_FLAG(need_rewrite);
-BTREE_FLAG(never_write);
+BTREE_FLAGS()
+#undef x
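For reference, this is roughly what the BTREE_FLAGS()/x() pair above expands to for a single flag such as need_write (illustrative expansion only, not part of the patch):

static inline bool btree_node_need_write(struct btree *b)
{	return test_bit(BTREE_NODE_need_write, &b->flags); }

static inline void set_btree_node_need_write(struct btree *b)
{	set_bit(BTREE_NODE_need_write, &b->flags); }

static inline void clear_btree_node_need_write(struct btree *b)
{	clear_bit(BTREE_NODE_need_write, &b->flags); }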
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
@@ -588,24 +621,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
        return __btree_node_type(b->c.level, b->c.btree_id);
 }
 
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
-       switch (type) {
-       case BKEY_TYPE_extents:
-       case BKEY_TYPE_reflink:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool btree_node_is_extents(struct btree *b)
-{
-       return btree_node_type_is_extents(btree_node_type(b));
-}
-
 #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
        ((1U << BKEY_TYPE_extents)|                     \
+        (1U << BKEY_TYPE_alloc)|                       \
         (1U << BKEY_TYPE_inodes)|                      \
         (1U << BKEY_TYPE_stripes)|                     \
         (1U << BKEY_TYPE_reflink)|                     \
@@ -621,6 +639,16 @@ static inline bool btree_node_is_extents(struct btree *b)
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
         BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 
+#define BTREE_ID_IS_EXTENTS                            \
+       ((1U << BTREE_ID_extents)|                      \
+        (1U << BTREE_ID_reflink)|                      \
+        (1U << BTREE_ID_freespace))
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+       return (1U << type) & BTREE_ID_IS_EXTENTS;
+}
+
 #define BTREE_ID_HAS_SNAPSHOTS                         \
        ((1U << BTREE_ID_extents)|                      \
         (1U << BTREE_ID_inodes)|                       \
@@ -636,40 +664,10 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
        return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
 }
 
-enum btree_update_flags {
-       __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
-       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
-
-       __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
-
-       __BTREE_TRIGGER_INSERT,
-       __BTREE_TRIGGER_OVERWRITE,
-
-       __BTREE_TRIGGER_GC,
-       __BTREE_TRIGGER_BUCKET_INVALIDATE,
-       __BTREE_TRIGGER_NOATOMIC,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
-
-#define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
-
-#define BTREE_TRIGGER_INSERT           (1U << __BTREE_TRIGGER_INSERT)
-#define BTREE_TRIGGER_OVERWRITE                (1U << __BTREE_TRIGGER_OVERWRITE)
-
-#define BTREE_TRIGGER_GC               (1U << __BTREE_TRIGGER_GC)
-#define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
-
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
-       ((1U << KEY_TYPE_alloc)|                \
-        (1U << KEY_TYPE_alloc_v2)|             \
-        (1U << KEY_TYPE_alloc_v3)|             \
-        (1U << KEY_TYPE_stripe)|               \
-        (1U << KEY_TYPE_inode)|                \
-        (1U << KEY_TYPE_inode_v2)|             \
-        (1U << KEY_TYPE_snapshot))
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+       return (1 << id) & BTREE_ID_HAS_PTRS;
+}
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
index d9a406a28f4728b920b74a353f606f56c57e0dc7..1c2e7b2b4ed5b01b00b9e61d6bdaf0802bc69c48 100644 (file)
@@ -8,20 +8,20 @@
 struct bch_fs;
 struct btree;
 
-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
-                                    struct btree *);
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+                                   struct btree_path *, struct btree *);
 bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
                                struct btree *, struct btree_node_iter *,
                                struct bkey_i *);
 void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 enum btree_insert_flags {
-       __BTREE_INSERT_NOFAIL,
+       /* First two bits for journal watermark: */
+       __BTREE_INSERT_NOFAIL = 2,
        __BTREE_INSERT_NOCHECK_RW,
        __BTREE_INSERT_LAZY_RW,
        __BTREE_INSERT_USE_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
-       __BTREE_INSERT_JOURNAL_RESERVED,
        __BTREE_INSERT_JOURNAL_RECLAIM,
        __BTREE_INSERT_NOWAIT,
        __BTREE_INSERT_GC_LOCK_HELD,
@@ -41,9 +41,6 @@ enum btree_insert_flags {
 /* Insert is for journal replay - don't get journal reservations: */
 #define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
 
-/* Indicates that we have pre-reserved space in the journal: */
-#define BTREE_INSERT_JOURNAL_RESERVED  (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-
 /* Insert is being called from journal reclaim path: */
 #define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
 
@@ -54,6 +51,8 @@ enum btree_insert_flags {
 #define BCH_HASH_SET_MUST_CREATE       (1 << __BCH_HASH_SET_MUST_CREATE)
 #define BCH_HASH_SET_MUST_REPLACE      (1 << __BCH_HASH_SET_MUST_REPLACE)
 
+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
+                               unsigned, unsigned);
 int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
 
 int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
@@ -83,13 +82,14 @@ void bch2_trans_commit_hook(struct btree_trans *,
                            struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *);
 
+int bch2_trans_log_msg(struct btree_trans *, const char *);
+
 /**
  * bch2_trans_commit - insert keys at given iterator positions
  *
  * This is main entry point for btree updates.
  *
  * Return values:
- * -EINTR: locking changed, this function should be called again.
  * -EROFS: filesystem read only
  * -EIO: journal or btree node IO error
  */
@@ -105,30 +105,33 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
        return __bch2_trans_commit(trans);
 }
 
-#define lockrestart_do(_trans, _do)                                    \
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do)        \
+       lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+                                       (_journal_seq), (_flags)))
+
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+       nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+                                       (_journal_seq), (_flags)))
+
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
 ({                                                                     \
+       struct btree_trans trans;                                       \
        int _ret;                                                       \
                                                                        \
-       do {                                                            \
-               bch2_trans_begin(_trans);                               \
-               _ret = (_do);                                           \
-       } while (_ret == -EINTR);                                       \
+       bch2_trans_init(&trans, (_c), 0, 0);                            \
+       _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \
+       bch2_trans_exit(&trans);                                        \
                                                                        \
        _ret;                                                           \
 })
 
-#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do)  \
-       lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
-                                       (_journal_seq), (_flags)))
-
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
+#define bch2_trans_run(_c, _do)                                                \
 ({                                                                     \
        struct btree_trans trans;                                       \
        int _ret;                                                       \
                                                                        \
        bch2_trans_init(&trans, (_c), 0, 0);                            \
-       _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
-                              _do);                                    \
+       _ret = (_do);                                                   \
        bch2_trans_exit(&trans);                                        \
                                                                        \
        _ret;                                                           \
@@ -139,4 +142,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
             (_i) < (_trans)->updates + (_trans)->nr_updates;           \
             (_i)++)
 
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_update(trans, i)
+               bch2_path_put(trans, i->path, true);
+
+       trans->extra_journal_res        = 0;
+       trans->nr_updates               = 0;
+       trans->hooks                    = NULL;
+       trans->extra_journal_entries.nr = 0;
+}
+
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
index 088c320493d3c133bd1b61a8832ab3a3b3e6c754..40debf7563f8d5fdecd6185c3823d7dfda72b1cd 100644 (file)
 #include <linux/random.h>
 #include <trace/events/bcachefs.h>
 
-static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-                                  struct btree_path *, struct btree *,
-                                  struct keylist *, unsigned);
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+                                 struct btree_path *, struct btree *,
+                                 struct keylist *, unsigned);
 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
 
+static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
+                                               enum btree_id btree_id,
+                                               unsigned level,
+                                               struct bpos pos)
+{
+       struct btree_path *path;
+
+       path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+                            BTREE_ITER_NOPRESERVE|
+                            BTREE_ITER_INTENT, _RET_IP_);
+       path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+       bch2_btree_path_downgrade(trans, path);
+       __bch2_btree_path_unlock(trans, path);
+       return path;
+}
+
 /* Debug code: */
 
 /*
@@ -41,7 +57,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
        struct bkey_s_c k;
        struct bkey_s_c_btree_ptr_v2 bp;
        struct bkey unpacked;
-       char buf1[100], buf2[100];
+       struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
 
        BUG_ON(!b->c.level);
 
@@ -58,9 +74,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
 
                if (bpos_cmp(next_node, bp.v->min_key)) {
                        bch2_dump_btree_node(c, b);
-                       panic("expected next min_key %s got %s\n",
-                             (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
-                             (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
+                       bch2_bpos_to_text(&buf1, next_node);
+                       bch2_bpos_to_text(&buf2, bp.v->min_key);
+                       panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
                }
 
                bch2_btree_node_iter_advance(&iter, b);
@@ -68,9 +84,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                if (bch2_btree_node_iter_end(&iter)) {
                        if (bpos_cmp(k.k->p, b->key.k.p)) {
                                bch2_dump_btree_node(c, b);
-                               panic("expected end %s got %s\n",
-                                     (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
-                                     (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
+                               bch2_bpos_to_text(&buf1, b->key.k.p);
+                               bch2_bpos_to_text(&buf2, k.k->p);
+                               panic("expected end %s got %s\n", buf1.buf, buf2.buf);
                        }
                        break;
                }
@@ -143,7 +159,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 
 static void __btree_node_free(struct bch_fs *c, struct btree *b)
 {
-       trace_btree_node_free(c, b);
+       trace_and_count(c, btree_node_free, c, b);
 
        BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_need_write(b));
@@ -160,29 +176,69 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
 }
 
 static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+                                      struct btree_path *path,
                                       struct btree *b)
 {
        struct bch_fs *c = trans->c;
-       struct btree_path *path;
+       unsigned level = b->c.level;
+
+       bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->c.lock);
+       mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
 
        trans_for_each_path(trans, path)
-               BUG_ON(path->l[b->c.level].b == b &&
-                      path->l[b->c.level].lock_seq == b->c.lock.state.seq);
+               if (path->l[level].b == b) {
+                       btree_node_unlock(trans, path, level);
+                       path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+               }
+}
 
-       six_lock_write(&b->c.lock, NULL, NULL);
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+                                           struct btree_trans *trans,
+                                           struct btree *b)
+{
+       struct bch_fs *c = as->c;
+       struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+       struct btree_path *path;
+       unsigned level = b->c.level;
+
+       BUG_ON(!list_empty(&b->write_blocked));
+       BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
+
+       b->will_make_reachable = 0;
+       closure_put(&as->cl);
 
+       clear_btree_node_will_make_reachable(b);
+       clear_btree_node_accessed(b);
+       clear_btree_node_dirty_acct(c, b);
+       clear_btree_node_need_write(b);
+
+       mutex_lock(&c->btree_cache.lock);
+       list_del_init(&b->list);
        bch2_btree_node_hash_remove(&c->btree_cache, b);
-       __btree_node_free(c, b);
+       mutex_unlock(&c->btree_cache.lock);
+
+       BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+       p->b[p->nr++] = b;
 
-       six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
+
+       trans_for_each_path(trans, path)
+               if (path->l[level].b == b) {
+                       btree_node_unlock(trans, path, level);
+                       path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+               }
 }
 
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
                                             struct disk_reservation *res,
                                             struct closure *cl,
+                                            bool interior_node,
                                             unsigned flags)
 {
+       struct bch_fs *c = trans->c;
        struct write_point *wp;
        struct btree *b;
        __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
@@ -193,10 +249,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 
        if (flags & BTREE_INSERT_USE_RESERVE) {
                nr_reserve      = 0;
-               alloc_reserve   = RESERVE_BTREE_MOVINGGC;
+               alloc_reserve   = RESERVE_btree_movinggc;
        } else {
                nr_reserve      = BTREE_NODE_RESERVE;
-               alloc_reserve   = RESERVE_BTREE;
+               alloc_reserve   = RESERVE_btree;
        }
 
        mutex_lock(&c->btree_reserve_cache_lock);
@@ -212,7 +268,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c,
+       wp = bch2_alloc_sectors_start_trans(trans,
                                      c->opts.metadata_target ?:
                                      c->opts.foreground_target,
                                      0,
@@ -242,7 +298,7 @@ retry:
        bch2_open_bucket_get(c, wp, &ob);
        bch2_alloc_sectors_done(c, wp);
 mem_alloc:
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, interior_node);
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
 
@@ -256,22 +312,25 @@ mem_alloc:
        return b;
 }
 
-static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+                                          struct btree_trans *trans,
+                                          unsigned level)
 {
        struct bch_fs *c = as->c;
        struct btree *b;
+       struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
        int ret;
 
        BUG_ON(level >= BTREE_MAX_DEPTH);
-       BUG_ON(!as->nr_prealloc_nodes);
+       BUG_ON(!p->nr);
 
-       b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+       b = p->b[--p->nr];
 
-       six_lock_intent(&b->c.lock, NULL, NULL);
-       six_lock_write(&b->c.lock, NULL, NULL);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
 
        set_btree_node_accessed(b);
-       set_btree_node_dirty(c, b);
+       set_btree_node_dirty_acct(c, b);
        set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
@@ -301,7 +360,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
        BUG_ON(ret);
 
-       trace_btree_node_alloc(c, b);
+       trace_and_count(c, btree_node_alloc, c, b);
        return b;
 }
 
@@ -319,12 +378,13 @@ static void btree_set_max(struct btree *b, struct bpos pos)
 }
 
 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
+                                                 struct btree_trans *trans,
                                                  struct btree *b,
                                                  struct bkey_format format)
 {
        struct btree *n;
 
-       n = bch2_btree_node_alloc(as, b->c.level);
+       n = bch2_btree_node_alloc(as, trans, b->c.level);
 
        SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
 
@@ -343,6 +403,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
 }
 
 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+                                                      struct btree_trans *trans,
                                                       struct btree *b)
 {
        struct bkey_format new_f = bch2_btree_calc_format(b);
@@ -354,12 +415,13 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
        if (!bch2_btree_node_format_fits(as->c, b, &new_f))
                new_f = b->format;
 
-       return __bch2_btree_node_alloc_replacement(as, b, new_f);
+       return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
 }
 
-static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+static struct btree *__btree_root_alloc(struct btree_update *as,
+                               struct btree_trans *trans, unsigned level)
 {
-       struct btree *b = bch2_btree_node_alloc(as, level);
+       struct btree *b = bch2_btree_node_alloc(as, trans, level);
 
        btree_set_min(b, POS_MIN);
        btree_set_max(b, SPOS_MAX);
@@ -368,56 +430,57 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
        btree_node_set_format(b, b->data->format);
        bch2_btree_build_aux_trees(b);
 
-       bch2_btree_update_add_new_node(as, b);
-       six_unlock_write(&b->c.lock);
-
        return b;
 }
 
-static void bch2_btree_reserve_put(struct btree_update *as)
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
 {
        struct bch_fs *c = as->c;
+       struct prealloc_nodes *p;
 
-       mutex_lock(&c->btree_reserve_cache_lock);
+       for (p = as->prealloc_nodes;
+            p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+            p++) {
+               while (p->nr) {
+                       struct btree *b = p->b[--p->nr];
 
-       while (as->nr_prealloc_nodes) {
-               struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+                       mutex_lock(&c->btree_reserve_cache_lock);
 
-               six_lock_intent(&b->c.lock, NULL, NULL);
-               six_lock_write(&b->c.lock, NULL, NULL);
+                       if (c->btree_reserve_cache_nr <
+                           ARRAY_SIZE(c->btree_reserve_cache)) {
+                               struct btree_alloc *a =
+                                       &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
 
-               if (c->btree_reserve_cache_nr <
-                   ARRAY_SIZE(c->btree_reserve_cache)) {
-                       struct btree_alloc *a =
-                               &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+                               a->ob = b->ob;
+                               b->ob.nr = 0;
+                               bkey_copy(&a->k, &b->key);
+                       } else {
+                               bch2_open_buckets_put(c, &b->ob);
+                       }
 
-                       a->ob = b->ob;
-                       b->ob.nr = 0;
-                       bkey_copy(&a->k, &b->key);
-               } else {
-                       bch2_open_buckets_put(c, &b->ob);
-               }
+                       mutex_unlock(&c->btree_reserve_cache_lock);
 
-               __btree_node_free(c, b);
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
+                       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+                       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+                       __btree_node_free(c, b);
+                       six_unlock_write(&b->c.lock);
+                       six_unlock_intent(&b->c.lock);
+               }
        }
-
-       mutex_unlock(&c->btree_reserve_cache_lock);
 }
 
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
-                                 unsigned flags)
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+                                 struct btree_update *as,
+                                 unsigned nr_nodes[2],
+                                 unsigned flags,
+                                 struct closure *cl)
 {
        struct bch_fs *c = as->c;
-       struct closure cl;
        struct btree *b;
-       int ret;
-
-       closure_init_stack(&cl);
-retry:
+       unsigned interior;
+       int ret = 0;
 
-       BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+       BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
 
        /*
         * Protects reaping from the btree node cache and using the btree node
@@ -426,39 +489,33 @@ retry:
         * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
         * blocking on this lock:
         */
-       ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+       ret = bch2_btree_cache_cannibalize_lock(c, cl);
        if (ret)
-               goto err;
+               return ret;
+
+       for (interior = 0; interior < 2; interior++) {
+               struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+               while (p->nr < nr_nodes[interior]) {
+                       b = __bch2_btree_node_alloc(trans, &as->disk_res,
+                                       flags & BTREE_INSERT_NOWAIT ? NULL : cl,
+                                       interior, flags);
+                       if (IS_ERR(b)) {
+                               ret = PTR_ERR(b);
+                               goto err;
+                       }
 
-       while (as->nr_prealloc_nodes < nr_nodes) {
-               b = __bch2_btree_node_alloc(c, &as->disk_res,
-                                           flags & BTREE_INSERT_NOWAIT
-                                           ? NULL : &cl, flags);
-               if (IS_ERR(b)) {
-                       ret = PTR_ERR(b);
-                       goto err;
+                       p->b[p->nr++] = b;
                }
-
-               as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
        }
-
-       bch2_btree_cache_cannibalize_unlock(c);
-       closure_sync(&cl);
-       return 0;
 err:
        bch2_btree_cache_cannibalize_unlock(c);
-       closure_sync(&cl);
-
-       if (ret == -EAGAIN)
-               goto retry;
-
-       trace_btree_reserve_get_fail(c, nr_nodes, &cl);
        return ret;
 }
 
 /* Asynchronous interior node update machinery */
 
-static void bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
 {
        struct bch_fs *c = as->c;
 
@@ -471,7 +528,7 @@ static void bch2_btree_update_free(struct btree_update *as)
        bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
        bch2_disk_reservation_put(c, &as->disk_res);
-       bch2_btree_reserve_put(as);
+       bch2_btree_reserve_put(as, trans);
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
                               as->start_time);
@@ -492,20 +549,18 @@ static void bch2_btree_update_free(struct btree_update *as)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void btree_update_will_delete_key(struct btree_update *as,
-                                        struct bkey_i *k)
+static void btree_update_add_key(struct btree_update *as,
+                                struct keylist *keys, struct btree *b)
 {
-       BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
+       struct bkey_i *k = &b->key;
+
+       BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
               ARRAY_SIZE(as->_old_keys));
-       bch2_keylist_add(&as->old_keys, k);
-}
 
-static void btree_update_will_add_key(struct btree_update *as,
-                                     struct bkey_i *k)
-{
-       BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
-              ARRAY_SIZE(as->_new_keys));
-       bch2_keylist_add(&as->new_keys, k);
+       bkey_copy(keys->top, k);
+       bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+       bch2_keylist_push(keys);
 }
 
 /*
@@ -518,24 +573,29 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
        struct bkey_i *k;
        int ret;
 
-       trans->extra_journal_entries = (void *) &as->journal_entries[0];
-       trans->extra_journal_entry_u64s = as->journal_u64s;
+       ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+       if (ret)
+               return ret;
+
+       memcpy(&darray_top(trans->extra_journal_entries),
+              as->journal_entries,
+              as->journal_u64s * sizeof(u64));
+       trans->extra_journal_entries.nr += as->journal_u64s;
+
        trans->journal_pin = &as->journal;
 
-       for_each_keylist_key(&as->new_keys, k) {
-               ret = bch2_trans_mark_key(trans,
-                                         bkey_s_c_null,
-                                         bkey_i_to_s_c(k),
-                                         BTREE_TRIGGER_INSERT);
+       for_each_keylist_key(&as->old_keys, k) {
+               unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+               ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
                if (ret)
                        return ret;
        }
 
-       for_each_keylist_key(&as->old_keys, k) {
-               ret = bch2_trans_mark_key(trans,
-                                         bkey_i_to_s_c(k),
-                                         bkey_s_c_null,
-                                         BTREE_TRIGGER_OVERWRITE);
+       for_each_keylist_key(&as->new_keys, k) {
+               unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+               ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
                if (ret)
                        return ret;
        }
@@ -546,12 +606,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
 static void btree_update_nodes_written(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
-       struct btree *b = as->b;
+       struct btree *b;
        struct btree_trans trans;
        u64 journal_seq = 0;
        unsigned i;
        int ret;
 
+       bch2_trans_init(&trans, c, 0, 512);
        /*
         * If we're already in an error state, it might be because a btree node
         * was never written, and we might be trying to free that same btree
@@ -563,22 +624,21 @@ static void btree_update_nodes_written(struct btree_update *as)
        if (ret)
                goto err;
 
-       BUG_ON(!journal_pin_active(&as->journal));
-
        /*
         * Wait for any in flight writes to finish before we free the old nodes
         * on disk:
         */
        for (i = 0; i < as->nr_old_nodes; i++) {
-               struct btree *old = as->old_nodes[i];
                __le64 seq;
 
-               six_lock_read(&old->c.lock, NULL, NULL);
-               seq = old->data ? old->data->keys.seq : 0;
-               six_unlock_read(&old->c.lock);
+               b = as->old_nodes[i];
+
+               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+               seq = b->data ? b->data->keys.seq : 0;
+               six_unlock_read(&b->c.lock);
 
                if (seq == as->old_nodes_seq[i])
-                       wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
+                       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
                                       TASK_UNINTERRUPTIBLE);
        }
 
@@ -595,19 +655,23 @@ static void btree_update_nodes_written(struct btree_update *as)
         * journal reclaim does btree updates when flushing bkey_cached entries,
         * which may require allocations as well.
         */
-       bch2_trans_init(&trans, c, 0, 512);
-       ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
-                             BTREE_INSERT_NOFAIL|
-                             BTREE_INSERT_NOCHECK_RW|
-                             BTREE_INSERT_JOURNAL_RECLAIM|
-                             BTREE_INSERT_JOURNAL_RESERVED,
-                             btree_update_nodes_written_trans(&trans, as));
-       bch2_trans_exit(&trans);
+       ret = commit_do(&trans, &as->disk_res, &journal_seq,
+                       BTREE_INSERT_NOFAIL|
+                       BTREE_INSERT_NOCHECK_RW|
+                       BTREE_INSERT_USE_RESERVE|
+                       BTREE_INSERT_JOURNAL_RECLAIM|
+                       JOURNAL_WATERMARK_reserved,
+                       btree_update_nodes_written_trans(&trans, as));
+       bch2_trans_unlock(&trans);
 
        bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
                             "error %i in btree_update_nodes_written()", ret);
 err:
-       if (b) {
+       if (as->b) {
+               struct btree_path *path;
+
+               b = as->b;
+               path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
                /*
                 * @b is the node we did the final insert into:
                 *
@@ -620,11 +684,28 @@ err:
                 * we're in journal error state:
                 */
 
-               six_lock_intent(&b->c.lock, NULL, NULL);
-               six_lock_write(&b->c.lock, NULL, NULL);
+               /*
+                * Ensure transaction is unlocked before using
+                * btree_node_lock_nopath() (the use of which is always suspect,
+                * we need to work on removing this in the future)
+                *
+                * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+                * calls bch2_path_upgrade(), before we call path_make_mut(), so
+                * we may rarely end up with a locked path besides the one we
+                * have here:
+                */
+               bch2_trans_unlock(&trans);
+               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
+               mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+               bch2_btree_path_level_init(&trans, path, b);
+
+               bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+
                mutex_lock(&c->btree_interior_update_lock);
 
                list_del(&as->write_blocked_list);
+               if (list_empty(&b->write_blocked))
+                       clear_btree_node_write_blocked(b);
 
                /*
                 * Node might have been freed, recheck under
@@ -638,8 +719,8 @@ err:
 
                        if (!ret) {
                                i->journal_seq = cpu_to_le64(
-                                       max(journal_seq,
-                                           le64_to_cpu(i->journal_seq)));
+                                                            max(journal_seq,
+                                                                le64_to_cpu(i->journal_seq)));
 
                                bch2_btree_add_journal_pin(c, b, journal_seq);
                        } else {
@@ -653,10 +734,13 @@ err:
                }
 
                mutex_unlock(&c->btree_interior_update_lock);
+
+               mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
                six_unlock_write(&b->c.lock);
 
                btree_node_write_if_need(c, b, SIX_LOCK_intent);
-               six_unlock_intent(&b->c.lock);
+               btree_node_unlock(&trans, path, b->c.level);
+               bch2_path_put(&trans, path, true);
        }
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -669,13 +753,14 @@ err:
 
                BUG_ON(b->will_make_reachable != (unsigned long) as);
                b->will_make_reachable = 0;
+               clear_btree_node_will_make_reachable(b);
        }
        mutex_unlock(&c->btree_interior_update_lock);
 
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
 
-               six_lock_read(&b->c.lock, NULL, NULL);
+               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
                btree_node_write_if_need(c, b, SIX_LOCK_read);
                six_unlock_read(&b->c.lock);
        }
@@ -683,7 +768,8 @@ err:
        for (i = 0; i < as->nr_open_buckets; i++)
                bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
 
-       bch2_btree_update_free(as);
+       bch2_btree_update_free(as, &trans);
+       bch2_trans_exit(&trans);
 }
 
 static void btree_interior_update_work(struct work_struct *work)
@@ -735,6 +821,8 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
 
        as->mode        = BTREE_INTERIOR_UPDATING_NODE;
        as->b           = b;
+
+       set_btree_node_write_blocked(b);
        list_add(&as->write_blocked_list, &b->write_blocked);
 
        mutex_unlock(&c->btree_interior_update_lock);
@@ -800,10 +888,19 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree
 
        as->new_nodes[as->nr_new_nodes++] = b;
        b->will_make_reachable = 1UL|(unsigned long) as;
+       set_btree_node_will_make_reachable(b);
 
        mutex_unlock(&c->btree_interior_update_lock);
 
-       btree_update_will_add_key(as, &b->key);
+       btree_update_add_key(as, &as->new_keys, b);
+
+       if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+               unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+               unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+               bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+                       cpu_to_le16(sectors);
+       }
 }
 
 /*
@@ -822,6 +919,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
         * xchg() is for synchronization with bch2_btree_complete_write:
         */
        v = xchg(&b->will_make_reachable, 0);
+       clear_btree_node_will_make_reachable(b);
        as = (struct btree_update *) (v & ~1UL);
 
        if (!as) {
@@ -855,7 +953,7 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
  * btree_updates to point to this btree_update:
  */
 static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
-                                              struct btree *b)
+                                                     struct btree *b)
 {
        struct bch_fs *c = as->c;
        struct btree_update *p, *n;
@@ -887,7 +985,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                closure_wake_up(&c->btree_interior_update_wait);
        }
 
-       clear_btree_node_dirty(c, b);
+       clear_btree_node_dirty_acct(c, b);
        clear_btree_node_need_write(b);
 
        /*
@@ -919,14 +1017,14 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         */
        btree_update_drop_new_node(c, b);
 
-       btree_update_will_delete_key(as, &b->key);
+       btree_update_add_key(as, &as->old_keys, b);
 
        as->old_nodes[as->nr_old_nodes] = b;
        as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
        as->nr_old_nodes++;
 }
 
-static void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
 {
        struct bch_fs *c = as->c;
        u64 start_time = as->start_time;
@@ -937,7 +1035,7 @@ static void bch2_btree_update_done(struct btree_update *as)
                up_read(&as->c->gc_lock);
        as->took_gc_lock = false;
 
-       bch2_btree_reserve_put(as);
+       bch2_btree_reserve_put(as, trans);
 
        continue_at(&as->cl, btree_update_set_nodes_written,
                    as->c->btree_interior_update_worker);
@@ -948,32 +1046,44 @@ static void bch2_btree_update_done(struct btree_update *as)
 
 static struct btree_update *
 bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
-                       unsigned level, unsigned nr_nodes, unsigned flags)
+                       unsigned level, bool split, unsigned flags)
 {
        struct bch_fs *c = trans->c;
        struct btree_update *as;
        u64 start_time = local_clock();
        int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
-       int journal_flags = 0;
+       unsigned nr_nodes[2] = { 0, 0 };
+       unsigned update_level = level;
+       int journal_flags = flags & JOURNAL_WATERMARK_MASK;
        int ret = 0;
+       u32 restart_count = trans->restart_count;
 
        BUG_ON(!path->should_be_locked);
 
-       if (flags & BTREE_INSERT_JOURNAL_RESERVED)
-               journal_flags |= JOURNAL_RES_GET_RESERVED;
        if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
                journal_flags |= JOURNAL_RES_GET_NONBLOCK;
 
-       /*
-        * XXX: figure out how far we might need to split,
-        * instead of locking/reserving all the way to the root:
-        */
-       if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
-               trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
-                                                path->btree_id, &path->pos);
-               ret = btree_trans_restart(trans);
-               return ERR_PTR(ret);
+       while (1) {
+               nr_nodes[!!update_level] += 1 + split;
+               update_level++;
+
+               ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+               if (ret)
+                       return ERR_PTR(ret);
+
+               if (!btree_path_node(path, update_level)) {
+                       /* Allocating new root? */
+                       nr_nodes[1] += split;
+                       update_level = BTREE_MAX_DEPTH;
+                       break;
+               }
+
+               if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+                                       BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+                       break;
+
+               split = true;
        }
 
        if (flags & BTREE_INSERT_GC_LOCK_HELD)
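
bch2_btree_update_start() now sizes its reservation by walking up from the update level, reserving 1 + split nodes per level until a parent can absorb the inserted keys or a new root is needed, instead of locking and reserving all the way to the root. A minimal standalone sketch of that counting loop; the tree depth and the "level 2 has room" check are made-up stand-ins for btree_path_node() and bch2_btree_node_insert_fits():

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned nr_nodes[2] = { 0, 0 };        /* [0] = leaf, [1] = interior */
        unsigned level = 0, depth = 4;
        bool split = true;                      /* caller needs a split */

        while (1) {
                nr_nodes[!!level] += 1 + split;
                level++;

                if (level >= depth) {           /* no parent: allocate a new root */
                        nr_nodes[1] += split;
                        break;
                }

                if (level >= 2)                 /* pretend this parent has room */
                        break;

                split = true;                   /* parent will have to split too */
        }

        printf("reserve %u leaf + %u interior nodes\n", nr_nodes[0], nr_nodes[1]);
        return 0;
}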
@@ -981,9 +1091,10 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        else if (!down_read_trylock(&c->gc_lock)) {
                bch2_trans_unlock(trans);
                down_read(&c->gc_lock);
-               if (!bch2_trans_relock(trans)) {
+               ret = bch2_trans_relock(trans);
+               if (ret) {
                        up_read(&c->gc_lock);
-                       return ERR_PTR(-EINTR);
+                       return ERR_PTR(ret);
                }
        }
 
@@ -995,6 +1106,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
        as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
        as->btree_id    = path->btree_id;
+       as->update_level = update_level;
        INIT_LIST_HEAD(&as->list);
        INIT_LIST_HEAD(&as->unwritten_list);
        INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1018,41 +1130,66 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        if (ret)
                goto err;
 
-       bch2_trans_unlock(trans);
-
        ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                      BTREE_UPDATE_JOURNAL_RES,
-                                     journal_flags);
+                                     journal_flags|JOURNAL_RES_GET_NONBLOCK);
        if (ret) {
-               bch2_btree_update_free(as);
-               trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
-               btree_trans_restart(trans);
-               return ERR_PTR(ret);
+               bch2_trans_unlock(trans);
+
+               if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
+                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
+                       goto err;
+               }
+
+               ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+                                             BTREE_UPDATE_JOURNAL_RES,
+                                             journal_flags);
+               if (ret) {
+                       trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
+                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
+                       goto err;
+               }
+
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       goto err;
        }
 
        ret = bch2_disk_reservation_get(c, &as->disk_res,
-                       nr_nodes * btree_sectors(c),
+                       (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
                        c->opts.metadata_replicas,
                        disk_res_flags);
        if (ret)
                goto err;
 
-       ret = bch2_btree_reserve_get(as, nr_nodes, flags);
-       if (ret)
-               goto err;
+       ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+       if (bch2_err_matches(ret, ENOSPC) ||
+           bch2_err_matches(ret, ENOMEM)) {
+               struct closure cl;
+
+               closure_init_stack(&cl);
+
+               do {
+                       ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
 
-       if (!bch2_trans_relock(trans)) {
-               ret = -EINTR;
+                       bch2_trans_unlock(trans);
+                       closure_sync(&cl);
+               } while (ret == -EAGAIN);
+       }
+
+       if (ret) {
+               trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]);
                goto err;
        }
 
-       bch2_journal_pin_add(&c->journal,
-                            atomic64_read(&c->journal.seq),
-                            &as->journal, NULL);
+       ret = bch2_trans_relock(trans);
+       if (ret)
+               goto err;
 
+       bch2_trans_verify_not_restarted(trans, restart_count);
        return as;
 err:
-       bch2_btree_update_free(as);
+       bch2_btree_update_free(as, trans);
        return ERR_PTR(ret);
 }
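
The reserve path above now tries a non-blocking allocation first and only falls back, on ENOSPC/ENOMEM, to a loop that unlocks the transaction and waits on a closure until the allocator stops returning -EAGAIN. A minimal standalone sketch of that retry shape, with try_reserve() standing in for bch2_btree_reserve_get() and the actual closure_sync() wait reduced to a comment:

#include <errno.h>
#include <stdio.h>

static int try_reserve(int attempt)
{
        return attempt < 2 ? -EAGAIN : 0;       /* succeed after two waits */
}

int main(void)
{
        int attempt = 0;
        int ret = try_reserve(attempt);         /* fast, non-blocking first pass */

        if (ret == -EAGAIN) {
                do {
                        /* the real code unlocks the trans here and waits on
                         * the closure for an open bucket to be freed */
                        attempt++;
                        ret = try_reserve(attempt);
                } while (ret == -EAGAIN);
        }

        printf("reserve returned %d after %d waits\n", ret, attempt);
        return 0;
}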
 
@@ -1065,11 +1202,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        list_del_init(&b->list);
        mutex_unlock(&c->btree_cache.lock);
 
-       if (b->c.level)
-               six_lock_pcpu_alloc(&b->c.lock);
-       else
-               six_lock_pcpu_free(&b->c.lock);
-
        mutex_lock(&c->btree_root_lock);
        BUG_ON(btree_node_root(c, b) &&
               (b->c.level < btree_node_root(c, b)->c.level ||
@@ -1101,9 +1233,7 @@ static void bch2_btree_set_root(struct btree_update *as,
        struct bch_fs *c = as->c;
        struct btree *old;
 
-       trace_btree_set_root(c, b);
-       BUG_ON(!b->written &&
-              !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
+       trace_and_count(c, btree_node_set_root, c, b);
 
        old = btree_node_root(c, b);
 
@@ -1111,7 +1241,7 @@ static void bch2_btree_set_root(struct btree_update *as,
         * Ensure no one is using the old root while we switch to the
         * new root:
         */
-       bch2_btree_node_lock_write(trans, path, old);
+       bch2_btree_node_lock_write_nofail(trans, path, &old->c);
 
        bch2_btree_set_root_inmem(c, b);
 
@@ -1138,7 +1268,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
 {
        struct bch_fs *c = as->c;
        struct bkey_packed *k;
-       const char *invalid;
+       struct printbuf buf = PRINTBUF;
 
        BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
               !btree_ptr_sectors_written(insert));
@@ -1146,13 +1276,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
        if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
                bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
 
-       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
-               bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
-       if (invalid) {
-               char buf[160];
-
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
-               bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+       if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+                             btree_node_type(b), WRITE, &buf) ?:
+           bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+               printbuf_reset(&buf);
+               prt_printf(&buf, "inserting invalid bkey\n  ");
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+               prt_printf(&buf, "\n  ");
+               bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+                                 btree_node_type(b), WRITE, &buf);
+               bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+
+               bch2_fs_inconsistent(c, "%s", buf.buf);
                dump_stack();
        }
 
@@ -1170,8 +1305,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
                bch2_btree_node_iter_advance(node_iter, b);
 
        bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
-       set_btree_node_dirty(c, b);
+       set_btree_node_dirty_acct(c, b);
        set_btree_node_need_write(b);
+
+       printbuf_exit(&buf);
 }
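
bch2_insert_fixup_btree_ptr() now builds its error message in a heap-backed printbuf (PRINTBUF / prt_printf() / printbuf_exit()) instead of a fixed char buf[160]. A minimal standalone sketch of that idiom; sketchbuf and sb_printf are stand-ins for the real printbuf API:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct sketchbuf { char *buf; size_t len, cap; };

static void sb_printf(struct sketchbuf *sb, const char *fmt, ...)
{
        va_list ap;
        int n;

        va_start(ap, fmt);
        n = vsnprintf(NULL, 0, fmt, ap);        /* measure the formatted text */
        va_end(ap);

        if (sb->len + n + 1 > sb->cap) {
                sb->cap = (sb->len + n + 1) * 2;
                sb->buf = realloc(sb->buf, sb->cap);
        }

        va_start(ap, fmt);
        vsnprintf(sb->buf + sb->len, n + 1, fmt, ap);
        va_end(ap);
        sb->len += n;
}

int main(void)
{
        struct sketchbuf buf = { 0 };           /* like PRINTBUF */

        sb_printf(&buf, "inserting invalid bkey\n  ");
        sb_printf(&buf, "%s", "u64s 5 type btree_ptr_v2 ...");
        printf("%s\n", buf.buf);

        free(buf.buf);                          /* like printbuf_exit() */
        return 0;
}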
 
 static void
@@ -1203,6 +1340,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
  * node)
  */
 static struct btree *__btree_split_node(struct btree_update *as,
+                                       struct btree_trans *trans,
                                        struct btree *n1)
 {
        struct bkey_format_state s;
@@ -1212,8 +1350,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
        struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
        struct bpos n1_pos;
 
-       n2 = bch2_btree_node_alloc(as, n1->c.level);
-       bch2_btree_update_add_new_node(as, n2);
+       n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
 
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
@@ -1361,38 +1498,49 @@ static void btree_split_insert_keys(struct btree_update *as,
        btree_node_interior_verify(as->c, b);
 }
 
-static void btree_split(struct btree_update *as, struct btree_trans *trans,
-                       struct btree_path *path, struct btree *b,
-                       struct keylist *keys, unsigned flags)
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+                      struct btree_path *path, struct btree *b,
+                      struct keylist *keys, unsigned flags)
 {
        struct bch_fs *c = as->c;
        struct btree *parent = btree_node_parent(path, b);
        struct btree *n1, *n2 = NULL, *n3 = NULL;
+       struct btree_path *path1 = NULL, *path2 = NULL;
        u64 start_time = local_clock();
+       int ret = 0;
 
        BUG_ON(!parent && (b != btree_node_root(c, b)));
-       BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+       BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
 
        bch2_btree_interior_update_will_free_node(as, b);
 
-       n1 = bch2_btree_node_alloc_replacement(as, b);
-       bch2_btree_update_add_new_node(as, n1);
+       n1 = bch2_btree_node_alloc_replacement(as, trans, b);
 
        if (keys)
                btree_split_insert_keys(as, trans, path, n1, keys);
 
        if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
-               trace_btree_split(c, b);
+               trace_and_count(c, btree_node_split, c, b);
 
-               n2 = __btree_split_node(as, n1);
+               n2 = __btree_split_node(as, trans, n1);
 
                bch2_btree_build_aux_trees(n2);
                bch2_btree_build_aux_trees(n1);
+
+               bch2_btree_update_add_new_node(as, n1);
+               bch2_btree_update_add_new_node(as, n2);
                six_unlock_write(&n2->c.lock);
                six_unlock_write(&n1->c.lock);
 
-               bch2_btree_node_write(c, n1, SIX_LOCK_intent);
-               bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+               path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+               six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+               bch2_btree_path_level_init(trans, path1, n1);
+
+               path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+               six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+               bch2_btree_path_level_init(trans, path2, n2);
 
                /*
                 * Note that on recursive parent_keys == keys, so we
@@ -1404,22 +1552,33 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
 
                if (!parent) {
                        /* Depth increases, make a new root */
-                       n3 = __btree_root_alloc(as, b->c.level + 1);
+                       n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+                       bch2_btree_update_add_new_node(as, n3);
+                       six_unlock_write(&n3->c.lock);
+
+                       path2->locks_want++;
+                       BUG_ON(btree_node_locked(path2, n3->c.level));
+                       six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+                       mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+                       bch2_btree_path_level_init(trans, path2, n3);
 
                        n3->sib_u64s[0] = U16_MAX;
                        n3->sib_u64s[1] = U16_MAX;
 
                        btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
-
-                       bch2_btree_node_write(c, n3, SIX_LOCK_intent);
                }
        } else {
-               trace_btree_compact(c, b);
+               trace_and_count(c, btree_node_compact, c, b);
 
                bch2_btree_build_aux_trees(n1);
+               bch2_btree_update_add_new_node(as, n1);
                six_unlock_write(&n1->c.lock);
 
-               bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+               path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+               six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+               bch2_btree_path_level_init(trans, path1, n1);
 
                if (parent)
                        bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1429,7 +1588,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
 
        if (parent) {
                /* Split a non root node */
-               bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+               ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+               if (ret)
+                       goto err;
        } else if (n3) {
                bch2_btree_set_root(as, trans, path, n3);
        } else {
@@ -1437,20 +1598,16 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
                bch2_btree_set_root(as, trans, path, n1);
        }
 
-       bch2_btree_update_get_open_buckets(as, n1);
-       if (n2)
-               bch2_btree_update_get_open_buckets(as, n2);
-       if (n3)
+       if (n3) {
                bch2_btree_update_get_open_buckets(as, n3);
-
-       /* Successful split, update the path to point to the new nodes: */
-
-       six_lock_increment(&b->c.lock, SIX_LOCK_intent);
-       if (n3)
-               bch2_trans_node_add(trans, n3);
-       if (n2)
-               bch2_trans_node_add(trans, n2);
-       bch2_trans_node_add(trans, n1);
+               bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+       }
+       if (n2) {
+               bch2_btree_update_get_open_buckets(as, n2);
+               bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+       }
+       bch2_btree_update_get_open_buckets(as, n1);
+       bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
 
        /*
         * The old node must be freed (in memory) _before_ unlocking the new
@@ -1458,13 +1615,28 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
         * node after another thread has locked and updated the new node, thus
         * seeing stale data:
         */
-       bch2_btree_node_free_inmem(trans, b);
+       bch2_btree_node_free_inmem(trans, path, b);
+
+       if (n3)
+               bch2_trans_node_add(trans, n3);
+       if (n2)
+               bch2_trans_node_add(trans, n2);
+       bch2_trans_node_add(trans, n1);
 
        if (n3)
                six_unlock_intent(&n3->c.lock);
        if (n2)
                six_unlock_intent(&n2->c.lock);
        six_unlock_intent(&n1->c.lock);
+out:
+       if (path2) {
+               __bch2_btree_path_unlock(trans, path2);
+               bch2_path_put(trans, path2, true);
+       }
+       if (path1) {
+               __bch2_btree_path_unlock(trans, path1);
+               bch2_path_put(trans, path1, true);
+       }
 
        bch2_trans_verify_locks(trans);
 
@@ -1472,6 +1644,14 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
                               ? BCH_TIME_btree_node_split
                               : BCH_TIME_btree_node_compact],
                               start_time);
+       return ret;
+err:
+       if (n3)
+               bch2_btree_node_free_never_used(as, trans, n3);
+       if (n2)
+               bch2_btree_node_free_never_used(as, trans, n2);
+       bch2_btree_node_free_never_used(as, trans, n1);
+       goto out;
 }
 
 static void
@@ -1506,22 +1686,30 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
  * If a split occurred, this function will return early. This can only happen
  * for leaf nodes -- inserts into interior nodes have to be atomic.
  */
-static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
-                                  struct btree_path *path, struct btree *b,
-                                  struct keylist *keys, unsigned flags)
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+                                 struct btree_path *path, struct btree *b,
+                                 struct keylist *keys, unsigned flags)
 {
        struct bch_fs *c = as->c;
        int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
+       int ret;
 
        lockdep_assert_held(&c->gc_lock);
-       BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+       BUG_ON(!btree_node_intent_locked(path, b->c.level));
        BUG_ON(!b->c.level);
        BUG_ON(!as || as->b);
        bch2_verify_keylist_sorted(keys);
 
-       bch2_btree_node_lock_for_insert(trans, path, b);
+       if (!(local_clock() & 63))
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+
+       ret = bch2_btree_node_lock_write(trans, path, &b->c);
+       if (ret)
+               return ret;
+
+       bch2_btree_node_prep_for_write(trans, path, b);
 
        if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
                bch2_btree_node_unlock_write(trans, path, b);
@@ -1547,30 +1735,41 @@ static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *
        bch2_btree_node_unlock_write(trans, path, b);
 
        btree_node_interior_verify(c, b);
-       return;
+       return 0;
 split:
-       btree_split(as, trans, path, b, keys, flags);
+       /*
+        * We could attempt to avoid the transaction restart, by calling
+        * bch2_btree_path_upgrade() and allocating more nodes:
+        */
+       if (b->c.level >= as->update_level)
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+
+       return btree_split(as, trans, path, b, keys, flags);
 }
 
 int bch2_btree_split_leaf(struct btree_trans *trans,
                          struct btree_path *path,
                          unsigned flags)
 {
-       struct bch_fs *c = trans->c;
        struct btree *b = path_l(path)->b;
        struct btree_update *as;
        unsigned l;
        int ret = 0;
 
        as = bch2_btree_update_start(trans, path, path->level,
-               btree_update_reserve_required(c, b), flags);
+                                    true, flags);
        if (IS_ERR(as))
                return PTR_ERR(as);
 
-       btree_split(as, trans, path, b, NULL, flags);
-       bch2_btree_update_done(as);
+       ret = btree_split(as, trans, path, b, NULL, flags);
+       if (ret) {
+               bch2_btree_update_free(as, trans);
+               return ret;
+       }
+
+       bch2_btree_update_done(as, trans);
 
-       for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+       for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
                ret = bch2_foreground_maybe_merge(trans, path, l, flags);
 
        return ret;
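
Note the new check near the top of bch2_btree_insert_node(): when the low six bits of local_clock() are zero, a transaction restart is injected, so the split-race restart path gets exercised roughly once every 64 calls. A minimal standalone sketch of that rate, using rand() in place of the kernel's local_clock():

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned hits = 0, i;

        srand(1);
        for (i = 0; i < 64000; i++)
                if (!((unsigned)rand() & 63))   /* the added 1-in-64 check */
                        hits++;

        printf("restart injected %u / 64000 times (~1000 expected)\n", hits);
        return 0;
}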
@@ -1583,7 +1782,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
                                  enum btree_node_sibling sib)
 {
        struct bch_fs *c = trans->c;
-       struct btree_path *sib_path = NULL;
+       struct btree_path *sib_path = NULL, *new_path = NULL;
        struct btree_update *as;
        struct bkey_format_state new_s;
        struct bkey_format new_f;
@@ -1615,7 +1814,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       sib_path->should_be_locked = true;
+       btree_path_set_should_be_locked(sib_path);
 
        m = sib_path->l[level].b;
 
@@ -1634,15 +1833,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
        }
 
        if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
-               char buf1[100], buf2[100];
+               struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
 
-               bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
-               bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
+               bch2_bpos_to_text(&buf1, prev->data->max_key);
+               bch2_bpos_to_text(&buf2, next->data->min_key);
                bch_err(c,
                        "btree topology error in btree merge:\n"
                        "  prev ends at   %s\n"
                        "  next starts at %s",
-                       buf1, buf2);
+                       buf1.buf, buf2.buf);
+               printbuf_exit(&buf1);
+               printbuf_exit(&buf2);
                bch2_topology_error(c);
                ret = -EIO;
                goto err;
@@ -1672,36 +1873,42 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
                goto out;
 
        parent = btree_node_parent(path, b);
-       as = bch2_btree_update_start(trans, path, level,
-                        btree_update_reserve_required(c, parent) + 1,
-                        flags|
+       as = bch2_btree_update_start(trans, path, level, false,
                         BTREE_INSERT_NOFAIL|
-                        BTREE_INSERT_USE_RESERVE);
+                        BTREE_INSERT_USE_RESERVE|
+                        flags);
        ret = PTR_ERR_OR_ZERO(as);
        if (ret)
                goto err;
 
-       trace_btree_merge(c, b);
+       trace_and_count(c, btree_node_merge, c, b);
 
        bch2_btree_interior_update_will_free_node(as, b);
        bch2_btree_interior_update_will_free_node(as, m);
 
-       n = bch2_btree_node_alloc(as, b->c.level);
-       bch2_btree_update_add_new_node(as, n);
+       n = bch2_btree_node_alloc(as, trans, b->c.level);
+
+       SET_BTREE_NODE_SEQ(n->data,
+                          max(BTREE_NODE_SEQ(b->data),
+                              BTREE_NODE_SEQ(m->data)) + 1);
 
        btree_set_min(n, prev->data->min_key);
        btree_set_max(n, next->data->max_key);
-       n->data->format         = new_f;
 
+       n->data->format  = new_f;
        btree_node_set_format(n, new_f);
 
        bch2_btree_sort_into(c, n, prev);
        bch2_btree_sort_into(c, n, next);
 
        bch2_btree_build_aux_trees(n);
+       bch2_btree_update_add_new_node(as, n);
        six_unlock_write(&n->c.lock);
 
-       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+       new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+       six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+       bch2_btree_path_level_init(trans, new_path, n);
 
        bkey_init(&delete.k);
        delete.k.p = prev->key.k.p;
@@ -1710,32 +1917,38 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        bch2_trans_verify_paths(trans);
 
-       bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+       ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+       if (ret)
+               goto err_free_update;
 
        bch2_trans_verify_paths(trans);
 
        bch2_btree_update_get_open_buckets(as, n);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 
-       six_lock_increment(&b->c.lock, SIX_LOCK_intent);
-       six_lock_increment(&m->c.lock, SIX_LOCK_intent);
+       bch2_btree_node_free_inmem(trans, path, b);
+       bch2_btree_node_free_inmem(trans, sib_path, m);
 
        bch2_trans_node_add(trans, n);
 
        bch2_trans_verify_paths(trans);
 
-       bch2_btree_node_free_inmem(trans, b);
-       bch2_btree_node_free_inmem(trans, m);
-
        six_unlock_intent(&n->c.lock);
 
-       bch2_btree_update_done(as);
+       bch2_btree_update_done(as, trans);
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
 out:
 err:
+       if (new_path)
+               bch2_path_put(trans, new_path, true);
        bch2_path_put(trans, sib_path, true);
        bch2_trans_verify_locks(trans);
        return ret;
+err_free_update:
+       bch2_btree_node_free_never_used(as, trans, n);
+       bch2_btree_update_free(as, trans);
+       goto out;
 }
 
 /**
@@ -1747,6 +1960,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
                            unsigned flags)
 {
        struct bch_fs *c = trans->c;
+       struct btree_path *new_path = NULL;
        struct btree *n, *parent;
        struct btree_update *as;
        int ret;
@@ -1755,47 +1969,54 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
        parent = btree_node_parent(iter->path, b);
        as = bch2_btree_update_start(trans, iter->path, b->c.level,
-               (parent
-                ? btree_update_reserve_required(c, parent)
-                : 0) + 1,
-               flags);
+                                    false, flags);
        ret = PTR_ERR_OR_ZERO(as);
-       if (ret) {
-               trace_btree_gc_rewrite_node_fail(c, b);
+       if (ret)
                goto out;
-       }
 
        bch2_btree_interior_update_will_free_node(as, b);
 
-       n = bch2_btree_node_alloc_replacement(as, b);
-       bch2_btree_update_add_new_node(as, n);
+       n = bch2_btree_node_alloc_replacement(as, trans, b);
 
        bch2_btree_build_aux_trees(n);
+       bch2_btree_update_add_new_node(as, n);
        six_unlock_write(&n->c.lock);
 
-       trace_btree_gc_rewrite_node(c, b);
+       new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+       six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+       bch2_btree_path_level_init(trans, new_path, n);
 
-       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+       trace_and_count(c, btree_node_rewrite, c, b);
 
        if (parent) {
                bch2_keylist_add(&as->parent_keys, &n->key);
-               bch2_btree_insert_node(as, trans, iter->path, parent,
-                                      &as->parent_keys, flags);
+               ret = bch2_btree_insert_node(as, trans, iter->path, parent,
+                                            &as->parent_keys, flags);
+               if (ret)
+                       goto err;
        } else {
                bch2_btree_set_root(as, trans, iter->path, n);
        }
 
        bch2_btree_update_get_open_buckets(as, n);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+       bch2_btree_node_free_inmem(trans, iter->path, b);
 
-       six_lock_increment(&b->c.lock, SIX_LOCK_intent);
        bch2_trans_node_add(trans, n);
-       bch2_btree_node_free_inmem(trans, b);
        six_unlock_intent(&n->c.lock);
 
-       bch2_btree_update_done(as);
+       bch2_btree_update_done(as, trans);
 out:
-       bch2_btree_path_downgrade(iter->path);
+       if (new_path)
+               bch2_path_put(trans, new_path, true);
+       bch2_btree_path_downgrade(trans, iter->path);
        return ret;
+err:
+       bch2_btree_node_free_never_used(as, trans, n);
+       bch2_btree_update_free(as, trans);
+       goto out;
 }
 
 struct async_btree_rewrite {
@@ -1825,7 +2046,7 @@ static int async_btree_node_rewrite_trans(struct btree_trans *trans,
                goto out;
 
        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
-out :
+out:
        bch2_trans_iter_exit(trans, &iter);
 
        return ret;
@@ -1847,7 +2068,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
        struct async_btree_rewrite *a;
 
-       if (!percpu_ref_tryget(&c->writes))
+       if (!percpu_ref_tryget_live(&c->writes))
                return;
 
        a = kmalloc(sizeof(*a), GFP_NOFS);
@@ -1875,21 +2096,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter iter2 = { NULL };
        struct btree *parent;
-       u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
        int ret;
 
        if (!skip_triggers) {
-               ret = bch2_trans_mark_key(trans,
-                                         bkey_s_c_null,
-                                         bkey_i_to_s_c(new_key),
-                                         BTREE_TRIGGER_INSERT);
+               ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+                                         bkey_i_to_s_c(&b->key), 0);
                if (ret)
                        return ret;
 
-               ret = bch2_trans_mark_key(trans,
-                                         bkey_i_to_s_c(&b->key),
-                                         bkey_s_c_null,
-                                         BTREE_TRIGGER_OVERWRITE);
+               ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+                                         new_key, 0);
                if (ret)
                        return ret;
        }
@@ -1912,9 +2128,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                BUG_ON(iter2.path->level != b->c.level);
                BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
 
-               btree_node_unlock(iter2.path, iter2.path->level);
-               path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
-               iter2.path->level++;
+               btree_path_set_level_up(trans, iter2.path);
+
+               bch2_btree_path_check_sort(trans, iter2.path, 0);
 
                ret   = bch2_btree_iter_traverse(&iter2) ?:
                        bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
@@ -1923,12 +2139,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        } else {
                BUG_ON(btree_node_root(c, b) != b);
 
-               trans->extra_journal_entries = (void *) &journal_entries[0];
-               trans->extra_journal_entry_u64s =
-                       journal_entry_set((void *) &journal_entries[0],
-                                         BCH_JSET_ENTRY_btree_root,
-                                         b->c.btree_id, b->c.level,
-                                         new_key, new_key->k.u64s);
+               ret = darray_make_room(&trans->extra_journal_entries,
+                                      jset_u64s(new_key->k.u64s));
+               if (ret)
+                       return ret;
+
+               journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+                                 BCH_JSET_ENTRY_btree_root,
+                                 b->c.btree_id, b->c.level,
+                                 new_key, new_key->k.u64s);
+               trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
        }
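
__bch2_btree_node_update_key() now stages the btree-root journal entry in trans->extra_journal_entries, a darray, instead of an on-stack u64 array: reserve space with darray_make_room(), write at darray_top(), then bump nr. A minimal standalone sketch of that grow-then-append idiom with stand-in types (the real darray lives in libbcachefs/darray.h; the memset stands in for journal_entry_set()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct u64_darray { uint64_t *data; size_t nr, size; };

static int make_room(struct u64_darray *d, size_t more)
{
        if (d->nr + more > d->size) {
                size_t new_size = (d->nr + more) * 2;
                uint64_t *p = realloc(d->data, new_size * sizeof(*p));

                if (!p)
                        return -1;
                d->data = p;
                d->size = new_size;
        }
        return 0;
}

int main(void)
{
        struct u64_darray entries = { 0 };      /* trans->extra_journal_entries */
        size_t u64s = 3;                        /* jset_u64s(new_key->k.u64s) stand-in */

        if (make_room(&entries, u64s))          /* darray_make_room() */
                return 1;

        memset(&entries.data[entries.nr], 0, u64s * sizeof(uint64_t));
        entries.nr += u64s;                     /* entry is now staged */

        printf("staged journal entry, nr = %zu\n", entries.nr);
        free(entries.data);
        return 0;
}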
 
        ret = bch2_trans_commit(trans, NULL, NULL,
@@ -1936,11 +2156,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                                BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_USE_RESERVE|
                                BTREE_INSERT_JOURNAL_RECLAIM|
-                               BTREE_INSERT_JOURNAL_RESERVED);
+                               JOURNAL_WATERMARK_reserved);
        if (ret)
                goto err;
 
-       bch2_btree_node_lock_write(trans, iter->path, b);
+       bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
 
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
@@ -1978,11 +2198,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
        struct closure cl;
        int ret = 0;
 
-       if (!btree_node_intent_locked(path, b->c.level) &&
-           !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
-               btree_trans_restart(trans);
-               return -EINTR;
-       }
+       ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+       if (ret)
+               return ret;
 
        closure_init_stack(&cl);
 
@@ -1995,11 +2213,12 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
                if (ret) {
                        bch2_trans_unlock(trans);
                        closure_sync(&cl);
-                       if (!bch2_trans_relock(trans))
-                               return -EINTR;
+                       ret = bch2_trans_relock(trans);
+                       if (ret)
+                               return ret;
                }
 
-               new_hash = bch2_btree_node_mem_alloc(c);
+               new_hash = bch2_btree_node_mem_alloc(c, false);
        }
 
        path->intent_ref++;
@@ -2075,7 +2294,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
                closure_sync(&cl);
        } while (ret);
 
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, false);
        bch2_btree_cache_cannibalize_unlock(c);
 
        set_btree_node_fake(b);
@@ -2112,7 +2331,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->btree_interior_update_lock);
        list_for_each_entry(as, &c->btree_interior_update_list, list)
-               pr_buf(out, "%p m %u w %u r %u j %llu\n",
+               prt_printf(out, "%p m %u w %u r %u j %llu\n",
                       as,
                       as->mode,
                       as->nodes_written,
@@ -2121,19 +2340,27 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
 {
-       size_t ret = 0;
-       struct list_head *i;
+       bool ret;
 
        mutex_lock(&c->btree_interior_update_lock);
-       list_for_each(i, &c->btree_interior_update_list)
-               ret++;
+       ret = !list_empty(&c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
 
        return ret;
 }
 
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+       bool ret = bch2_btree_interior_updates_pending(c);
+
+       if (ret)
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_pending(c));
+       return ret;
+}
+
 void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
 {
        struct btree_root *r;
index 8dc86fa636d680900034d8c1b9efd0c1374d0b15..dabe815965445484d2a24c7ab801d7bf0e19049a 100644 (file)
@@ -52,6 +52,7 @@ struct btree_update {
        unsigned                        took_gc_lock:1;
 
        enum btree_id                   btree_id;
+       unsigned                        update_level;
 
        struct disk_reservation         disk_res;
        struct journal_preres           journal_preres;
@@ -76,8 +77,10 @@ struct btree_update {
        struct journal_entry_pin        journal;
 
        /* Preallocated nodes we reserve when we start the update: */
-       struct btree                    *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
-       unsigned                        nr_prealloc_nodes;
+       struct prealloc_nodes {
+               struct btree            *b[BTREE_UPDATE_NODES_MAX];
+               unsigned                nr;
+       }                               prealloc_nodes[2];
 
        /* Nodes being freed: */
        struct keylist                  old_keys;
@@ -115,6 +118,7 @@ struct btree_update {
 };
 
 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+                                                 struct btree_trans *,
                                                  struct btree *,
                                                  struct bkey_format);
 
@@ -307,7 +311,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
 
 void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
 
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
 
 void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
 struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
index 4b37a4869873998a7301265999480364c2df0429..3a68382013e79620351e0ba17bcd0e45708f4779 100644 (file)
@@ -10,6 +10,7 @@
 #include "btree_locking.h"
 #include "buckets.h"
 #include "debug.h"
+#include "errcode.h"
 #include "error.h"
 #include "extent_update.h"
 #include "journal.h"
@@ -31,6 +32,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                                         const struct btree_insert_entry *r)
 {
        return   cmp_int(l->btree_id,   r->btree_id) ?:
+                cmp_int(l->cached,     r->cached) ?:
                 -cmp_int(l->level,     r->level) ?:
                 bpos_cmp(l->k->k.p,    r->k->k.p);
 }
@@ -54,9 +56,9 @@ static inline bool same_leaf_as_next(struct btree_trans *trans,
                insert_l(&i[0])->b == insert_l(&i[1])->b;
 }
 
-static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
-                                                 struct btree_path *path,
-                                                 struct btree *b)
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+                                          struct btree_path *path,
+                                          struct btree *b)
 {
        struct bch_fs *c = trans->c;
 
@@ -75,14 +77,6 @@ static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
                bch2_btree_init_next(trans, b);
 }
 
-void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
-                                    struct btree_path *path,
-                                    struct btree *b)
-{
-       bch2_btree_node_lock_write(trans, path, b);
-       bch2_btree_node_prep_for_write(trans, path, b);
-}
-
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
@@ -167,11 +161,30 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct btree_write *w = container_of(pin, struct btree_write, journal);
        struct btree *b = container_of(w, struct btree, writes[i]);
+       struct btree_trans trans;
+       unsigned long old, new, v;
+       unsigned idx = w - b->writes;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+       v = READ_ONCE(b->flags);
+
+       do {
+               old = new = v;
 
-       six_lock_read(&b->c.lock, NULL, NULL);
-       bch2_btree_node_write_cond(c, b,
-               (btree_current_write(b) == w && w->journal.seq == seq));
+               if (!(old & (1 << BTREE_NODE_dirty)) ||
+                   !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+                   w->journal.seq != seq)
+                       break;
+
+               new |= 1 << BTREE_NODE_need_write;
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       btree_node_write_if_need(c, b, SIX_LOCK_read);
        six_unlock_read(&b->c.lock);
+
+       bch2_trans_exit(&trans);
        return 0;
 }
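
__btree_node_flush() now sets the need_write bit with a lock-free cmpxchg loop (under a read lock) instead of calling bch2_btree_node_write_cond(): load the flags word, bail if the pinned write is stale, otherwise OR in the bit and retry on races. A minimal standalone sketch of that read-modify-write pattern using C11 atomics; the flag names and the single-flag check are stand-ins for the real b->flags logic:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NODE_DIRTY      (1u << 0)
#define NODE_NEED_WRITE (1u << 1)

static _Atomic unsigned node_flags = NODE_DIRTY;

static bool mark_need_write(void)
{
        unsigned old = atomic_load(&node_flags), new;

        do {
                if (!(old & NODE_DIRTY))
                        return false;           /* nothing left to write out */
                new = old | NODE_NEED_WRITE;
        } while (!atomic_compare_exchange_weak(&node_flags, &old, new));

        return true;
}

int main(void)
{
        printf("need_write set: %d, flags now %#x\n",
               mark_need_write(), (unsigned)atomic_load(&node_flags));
        return 0;
}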
 
@@ -199,7 +212,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
 /**
  * btree_insert_key - insert one key into a leaf node
  */
-static bool btree_insert_key_leaf(struct btree_trans *trans,
+static void btree_insert_key_leaf(struct btree_trans *trans,
                                  struct btree_insert_entry *insert)
 {
        struct bch_fs *c = trans->c;
@@ -212,7 +225,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 
        if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
                                        &insert_l(insert)->iter, insert->k)))
-               return false;
+               return;
 
        i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
                                         le64_to_cpu(i->journal_seq)));
@@ -220,7 +233,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
        bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
 
        if (unlikely(!btree_node_dirty(b)))
-               set_btree_node_dirty(c, b);
+               set_btree_node_dirty_acct(c, b);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -233,8 +246,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
        if (u64s_added > live_u64s_added &&
            bch2_maybe_compact_whiteouts(c, b))
                bch2_trans_node_reinit_iter(trans, b);
-
-       return true;
 }
 
 /* Cached btree updates: */
@@ -269,9 +280,10 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
        if (ret)
                return ret;
 
-       if (!bch2_trans_relock(trans)) {
-               trace_trans_restart_journal_preres_get(trans->fn, trace_ip);
-               return -EINTR;
+       ret = bch2_trans_relock(trans);
+       if (ret) {
+               trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
+               return ret;
        }
 
        return 0;
@@ -283,39 +295,28 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        int ret;
 
-       if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
-               flags |= JOURNAL_RES_GET_RESERVED;
-
        ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
-                                  trans->journal_u64s, flags);
+                                  trans->journal_u64s,
+                                  flags|
+                                  (trans->flags & JOURNAL_WATERMARK_MASK));
 
        return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
 }
 
 #define JSET_ENTRY_LOG_U64s            4
 
-static noinline void journal_transaction_name(struct btree_trans *trans)
+static void journal_transaction_name(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
-       struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
-       struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
-       unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
-       unsigned b, buflen = u64s * sizeof(u64);
-
-       l->entry.u64s           = cpu_to_le16(u64s);
-       l->entry.btree_id       = 0;
-       l->entry.level          = 0;
-       l->entry.type           = BCH_JSET_ENTRY_log;
-       l->entry.pad[0]         = 0;
-       l->entry.pad[1]         = 0;
-       l->entry.pad[2]         = 0;
-       b = min_t(unsigned, strlen(trans->fn), buflen);
-       memcpy(l->d, trans->fn, b);
-       while (b < buflen)
-               l->d[b++] = '\0';
-
-       trans->journal_res.offset       += JSET_ENTRY_LOG_U64s;
-       trans->journal_res.u64s         -= JSET_ENTRY_LOG_U64s;
+       struct journal *j = &c->journal;
+       struct jset_entry *entry =
+               bch2_journal_add_entry(j, &trans->journal_res,
+                                      BCH_JSET_ENTRY_log, 0, 0,
+                                      JSET_ENTRY_LOG_U64s);
+       struct jset_entry_log *l =
+               container_of(entry, struct jset_entry_log, entry);
+
+       strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
 }
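
journal_transaction_name() is now unconditional and much simpler: it reserves a BCH_JSET_ENTRY_log entry and strncpy()s the transaction's function name into its fixed payload. A minimal standalone sketch of that copy, assuming the 32-byte payload implied by four u64s; strncpy() NUL-pads short names and silently truncates long ones:

#include <stdio.h>
#include <string.h>

#define JSET_ENTRY_LOG_U64S 4

int main(void)
{
        char payload[JSET_ENTRY_LOG_U64S * sizeof(unsigned long long)];
        const char *fn = "bch2_btree_split_leaf";       /* trans->fn stand-in */

        strncpy(payload, fn, sizeof(payload));
        printf("log entry payload: %.32s\n", payload);
        return 0;
}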
 
 static inline enum btree_insert_ret
@@ -367,39 +368,162 @@ btree_key_can_insert_cached(struct btree_trans *trans,
 
        ck->u64s        = new_u64s;
        ck->k           = new_k;
-       return BTREE_INSERT_OK;
+       return 0;
 }
 
-static inline void do_btree_insert_one(struct btree_trans *trans,
-                                      struct btree_insert_entry *i)
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+                              struct btree_insert_entry *i,
+                              unsigned flags)
 {
-       struct bch_fs *c = trans->c;
-       struct journal *j = &c->journal;
-       bool did_work;
+       struct bkey_s_c old = { &i->old_k, i->old_v };
+       struct bkey_i *new = i->k;
+       int ret;
 
-       EBUG_ON(trans->journal_res.ref !=
-               !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
+       if (unlikely(flags & BTREE_TRIGGER_NORUN))
+               return 0;
 
-       i->k->k.needs_whiteout = false;
+       if (!btree_node_type_needs_gc(i->btree_id))
+               return 0;
 
-       did_work = !i->cached
-               ? btree_insert_key_leaf(trans, i)
-               : bch2_btree_insert_key_cached(trans, i->path, i->k);
-       if (!did_work)
-               return;
+       if (bch2_bkey_ops[old.k->type].atomic_trigger ==
+           bch2_bkey_ops[i->k->k.type].atomic_trigger &&
+           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+               ret   = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
+                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+       } else {
+               struct bkey             _deleted = KEY(0, 0, 0);
+               struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
 
-       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               bch2_journal_add_keys(j, &trans->journal_res,
-                                     i->btree_id,
-                                     i->level,
-                                     i->k);
+               _deleted.p = i->path->pos;
 
-               if (trans->journal_seq)
-                       *trans->journal_seq = trans->journal_res.seq;
+               ret   = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
+                               BTREE_TRIGGER_INSERT|flags) ?:
+                       bch2_mark_key(trans, old, deleted,
+                               BTREE_TRIGGER_OVERWRITE|flags);
+       }
+
+       return ret;
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+                                bool overwrite)
+{
+       /*
+        * Transactional triggers create new btree_insert_entries, so we can't
+        * pass them a pointer to a btree_insert_entry, that memory is going to
+        * move:
+        */
+       struct bkey old_k = i->old_k;
+       struct bkey_s_c old = { &old_k, i->old_v };
+
+       if ((i->flags & BTREE_TRIGGER_NORUN) ||
+           !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+               return 0;
+
+       if (!i->insert_trigger_run &&
+           !i->overwrite_trigger_run &&
+           bch2_bkey_ops[old.k->type].trans_trigger ==
+           bch2_bkey_ops[i->k->k.type].trans_trigger &&
+           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+               i->overwrite_trigger_run = true;
+               i->insert_trigger_run = true;
+               return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+                                          BTREE_TRIGGER_INSERT|
+                                          BTREE_TRIGGER_OVERWRITE|
+                                          i->flags) ?: 1;
+       } else if (overwrite && !i->overwrite_trigger_run) {
+               i->overwrite_trigger_run = true;
+               return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+       } else if (!overwrite && !i->insert_trigger_run) {
+               i->insert_trigger_run = true;
+               return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+       } else {
+               return 0;
+       }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+                             struct btree_insert_entry *btree_id_start)
+{
+       struct btree_insert_entry *i;
+       bool trans_trigger_run;
+       int ret, overwrite;
+
+       for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+               /*
+                * Running triggers will append more updates to the list of updates as
+                * we're walking it:
+                */
+               do {
+                       trans_trigger_run = false;
+
+                       for (i = btree_id_start;
+                            i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+                            i++) {
+                               if (i->btree_id != btree_id)
+                                       continue;
+
+                               ret = run_one_trans_trigger(trans, i, overwrite);
+                               if (ret < 0)
+                                       return ret;
+                               if (ret)
+                                       trans_trigger_run = true;
+                       }
+               } while (trans_trigger_run);
+       }
+
+       return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+       unsigned btree_id = 0;
+       int ret = 0;
+
+       /*
+        *
+        * For a given btree, this algorithm runs insert triggers before
+        * overwrite triggers: this is so that when extents are being moved
+        * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+        * they are re-added.
+        */
+       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+               if (btree_id == BTREE_ID_alloc)
+                       continue;
+
+               while (btree_id_start < trans->updates + trans->nr_updates &&
+                      btree_id_start->btree_id < btree_id)
+                       btree_id_start++;
+
+               ret = run_btree_triggers(trans, btree_id, btree_id_start);
+               if (ret)
+                       return ret;
        }
+
+       trans_for_each_update(trans, i) {
+               if (i->btree_id > BTREE_ID_alloc)
+                       break;
+               if (i->btree_id == BTREE_ID_alloc) {
+                       ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+                       if (ret)
+                               return ret;
+                       break;
+               }
+       }
+
+       trans_for_each_update(trans, i)
+               BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+                      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+                      (!i->insert_trigger_run || !i->overwrite_trigger_run));
+
+       return 0;
 }
 
-static noinline int bch2_trans_mark_gc(struct btree_trans *trans)
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
@@ -413,8 +537,7 @@ static noinline int bch2_trans_mark_gc(struct btree_trans *trans)
                BUG_ON(i->cached || i->level);
 
                if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
-                       ret = bch2_mark_update(trans, i->path, i->k,
-                                              i->flags|BTREE_TRIGGER_GC);
+                       ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
                        if (ret)
                                break;
                }
@@ -436,9 +559,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
        int ret;
 
        if (race_fault()) {
-               trace_trans_restart_fault_inject(trans->fn, trace_ip);
-               trans->restarted = true;
-               return -EINTR;
+               trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+               return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
        }
 
        /*
@@ -473,6 +595,33 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 
                if (btree_node_type_needs_gc(i->bkey_type))
                        marking = true;
+
+               /*
+                * Revalidate before calling mem triggers - XXX, ugly:
+                *
+                * - successful btree node splits don't cause transaction
+                *   restarts and will have invalidated the pointer to the bkey
+                *   value
+                * - btree_node_lock_for_insert() -> btree_node_prep_for_write()
+                *   when it has to resort
+                * - btree_key_can_insert_cached() when it has to reallocate
+                *
+                *   Ugly because we currently have no way to tell if the
+                *   pointer's been invalidated, which means it's debatable
+                *   whether we should be stashing the old key at all.
+                */
+               i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+               if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+                       struct bkey_i *j_k =
+                               bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+                                                           i->k->k.p);
+
+                       if (j_k) {
+                               i->old_k = j_k->k;
+                               i->old_v = &j_k->v;
+                       }
+               }
        }
 
        /*
@@ -485,19 +634,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                if (ret)
                        return ret;
 
-               if (unlikely(trans->journal_transaction_names))
-                       journal_transaction_name(trans);
+               journal_transaction_name(trans);
        } else {
                trans->journal_res.seq = c->journal.replay_journal_seq;
        }
 
-       if (unlikely(trans->extra_journal_entry_u64s)) {
+       if (unlikely(trans->extra_journal_entries.nr)) {
                memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
-                                 trans->extra_journal_entries,
-                                 trans->extra_journal_entry_u64s);
+                                 trans->extra_journal_entries.data,
+                                 trans->extra_journal_entries.nr);
 
-               trans->journal_res.offset       += trans->extra_journal_entry_u64s;
-               trans->journal_res.u64s         -= trans->extra_journal_entry_u64s;
+               trans->journal_res.offset       += trans->extra_journal_entries.nr;
+               trans->journal_res.u64s         -= trans->extra_journal_entries.nr;
        }
 
        /*
@@ -520,110 +668,71 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 
        trans_for_each_update(trans, i)
                if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
-                       ret = bch2_mark_update(trans, i->path, i->k, i->flags);
+                       ret = run_one_mem_trigger(trans, i, i->flags);
                        if (ret)
                                return ret;
                }
 
        if (unlikely(c->gc_pos.phase)) {
-               ret = bch2_trans_mark_gc(trans);
+               ret = bch2_trans_commit_run_gc_triggers(trans);
                if  (ret)
                        return ret;
        }
 
-       trans_for_each_update(trans, i)
-               do_btree_insert_one(trans, i);
-
-       return ret;
-}
-
-static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path)
-{
-       unsigned l;
-
-       for (l = 0; l < BTREE_MAX_DEPTH; l++)
-               if (btree_node_read_locked(path, l))
-                       BUG_ON(!bch2_btree_node_upgrade(trans, path, l));
-}
-
-static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
-{
-       struct btree *b = path_l(path)->b;
-
-       do {
-               if (path->nodes_locked &&
-                   path->nodes_locked != path->nodes_intent_locked)
-                       path_upgrade_readers(trans, path);
-       } while ((path = prev_btree_path(trans, path)) &&
-                path_l(path)->b == b);
-}
-
-/*
- * Check for nodes that we have both read and intent locks on, and upgrade the
- * readers to intent:
- */
-static inline void normalize_read_intent_locks(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i, nr_read = 0, nr_intent = 0;
-
-       trans_for_each_path_inorder(trans, path, i) {
-               struct btree_path *next = i + 1 < trans->nr_sorted
-                       ? trans->paths + trans->sorted[i + 1]
-                       : NULL;
-
-               if (path->nodes_locked) {
-                       if (path->nodes_intent_locked)
-                               nr_intent++;
-                       else
-                               nr_read++;
+       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+               trans_for_each_update(trans, i) {
+                       struct journal *j = &c->journal;
+                       struct jset_entry *entry;
+
+                       if (i->key_cache_already_flushed)
+                               continue;
+
+                       entry = bch2_journal_add_entry(j, &trans->journal_res,
+                                              BCH_JSET_ENTRY_overwrite,
+                                              i->btree_id, i->level,
+                                              i->old_k.u64s);
+                       bkey_reassemble(&entry->start[0],
+                                       (struct bkey_s_c) { &i->old_k, i->old_v });
+
+                       entry = bch2_journal_add_entry(j, &trans->journal_res,
+                                              BCH_JSET_ENTRY_btree_keys,
+                                              i->btree_id, i->level,
+                                              i->k->k.u64s);
+                       bkey_copy(&entry->start[0], i->k);
                }
 
-               if (!next || path_l(path)->b != path_l(next)->b) {
-                       if (nr_read && nr_intent)
-                               upgrade_readers(trans, path);
-
-                       nr_read = nr_intent = 0;
-               }
+               if (trans->journal_seq)
+                       *trans->journal_seq = trans->journal_res.seq;
        }
 
-       bch2_trans_verify_locks(trans);
-}
-
-static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path_inorder(trans, path, i) {
-               //if (path == pos)
-               //      break;
-
-               if (path->nodes_locked != path->nodes_intent_locked &&
-                   !bch2_btree_path_upgrade(trans, path, path->level + 1))
-                       return true;
+       trans_for_each_update(trans, i) {
+               i->k->k.needs_whiteout = false;
+
+               if (!i->cached)
+                       btree_insert_key_leaf(trans, i);
+               else if (!i->key_cache_already_flushed)
+                       bch2_btree_insert_key_cached(trans, i->path, i->k);
+               else {
+                       bch2_btree_key_cache_drop(trans, i->path);
+                       btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+               }
        }
 
-       return false;
+       return ret;
 }
 
 static inline int trans_lock_write(struct btree_trans *trans)
 {
        struct btree_insert_entry *i;
+       int ret;
 
        trans_for_each_update(trans, i) {
                if (same_leaf_as_prev(trans, i))
                        continue;
 
-               if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
-                       if (have_conflicting_read_lock(trans, i->path))
-                               goto fail;
-
-                       btree_node_lock_type(trans, i->path,
-                                            insert_l(i)->b,
-                                            i->path->pos, i->level,
-                                            SIX_LOCK_write, NULL, NULL);
-               }
+               ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c);
+               if (ret)
+                       goto fail;
 
                bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
        }
@@ -637,8 +746,8 @@ fail:
                bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
        }
 
-       trace_trans_restart_would_deadlock_write(trans->fn);
-       return btree_trans_restart(trans);
+       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
 }
 
 static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
@@ -658,40 +767,40 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       struct bkey_s_c old;
+       struct printbuf buf = PRINTBUF;
        int ret, u64s_delta = 0;
+       int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
 
        trans_for_each_update(trans, i) {
-               const char *invalid = bch2_bkey_invalid(c,
-                               bkey_i_to_s_c(i->k), i->bkey_type);
-               if (invalid) {
-                       char buf[200];
-
-                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
-                       bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
-                                           buf, trans->fn, (void *) i->ip_allocated, invalid);
+               if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+                                     i->bkey_type, rw, &buf)) {
+                       printbuf_reset(&buf);
+                       prt_printf(&buf, "invalid bkey on insert from %s -> %ps",
+                              trans->fn, (void *) i->ip_allocated);
+                       prt_newline(&buf);
+                       printbuf_indent_add(&buf, 2);
+
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+                       prt_newline(&buf);
+
+                       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+                                         i->bkey_type, rw, &buf);
+
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       printbuf_exit(&buf);
                        return -EINVAL;
                }
                btree_insert_entry_checks(trans, i);
        }
 
-       trans_for_each_update(trans, i) {
-               struct bkey u;
+       printbuf_exit(&buf);
 
-               /*
-                * peek_slot() doesn't yet work on iterators that point to
-                * interior nodes:
-                */
-               if (i->cached || i->level)
+       trans_for_each_update(trans, i) {
+               if (i->cached)
                        continue;
 
-               old = bch2_btree_path_peek_slot(i->path, &u);
-               ret = bkey_err(old);
-               if (unlikely(ret))
-                       return ret;
-
                u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
-               u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+               u64s_delta -= i->old_btree_u64s;
 
                if (!same_leaf_as_next(trans, i)) {
                        if (u64s_delta <= 0) {
@@ -708,16 +817,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
                        JOURNAL_RES_GET_NONBLOCK|
-                       ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
-                        ? JOURNAL_RES_GET_RESERVED : 0));
+                       (trans->flags & JOURNAL_WATERMARK_MASK));
        if (unlikely(ret == -EAGAIN))
                ret = bch2_trans_journal_preres_get_cold(trans,
                                                trans->journal_preres_u64s, trace_ip);
        if (unlikely(ret))
                return ret;
 
-       normalize_read_intent_locks(trans);
-
        ret = trans_lock_write(trans);
        if (unlikely(ret))
                return ret;
@@ -770,12 +876,8 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        switch (ret) {
        case BTREE_INSERT_BTREE_NODE_FULL:
                ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
-               if (!ret)
-                       return 0;
-
-               if (ret == -EINTR)
-                       trace_trans_restart_btree_node_split(trans->fn, trace_ip,
-                                               i->btree_id, &i->path->pos);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
                break;
        case BTREE_INSERT_NEED_MARK_REPLICAS:
                bch2_trans_unlock(trans);
@@ -784,19 +886,16 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (ret)
                        break;
 
-               if (bch2_trans_relock(trans))
-                       return 0;
-
-               trace_trans_restart_mark_replicas(trans->fn, trace_ip);
-               ret = -EINTR;
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
                break;
        case BTREE_INSERT_NEED_JOURNAL_RES:
                bch2_trans_unlock(trans);
 
                if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
-                   !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) {
-                       trans->restarted = true;
-                       ret = -EAGAIN;
+                   !(trans->flags & JOURNAL_WATERMARK_reserved)) {
+                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        break;
                }
 
@@ -804,37 +903,35 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (ret)
                        break;
 
-               if (bch2_trans_relock(trans))
-                       return 0;
-
-               trace_trans_restart_journal_res_get(trans->fn, trace_ip);
-               ret = -EINTR;
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
                break;
        case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
                bch2_trans_unlock(trans);
 
-               trace_trans_blocked_journal_reclaim(trans->fn, trace_ip);
+               trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
 
                wait_event_freezable(c->journal.reclaim_wait,
                                     (ret = journal_reclaim_wait_done(c)));
                if (ret < 0)
                        break;
 
-               if (bch2_trans_relock(trans))
-                       return 0;
-
-               trace_trans_restart_journal_reclaim(trans->fn, trace_ip);
-               ret = -EINTR;
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
                break;
        default:
                BUG_ON(ret >= 0);
                break;
        }
 
-       BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted);
-       BUG_ON(ret == -ENOSPC &&
-              !(trans->flags & BTREE_INSERT_NOWAIT) &&
-              (trans->flags & BTREE_INSERT_NOFAIL));
+       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+                               !(trans->flags & BTREE_INSERT_NOWAIT) &&
+                               (trans->flags & BTREE_INSERT_NOFAIL), c,
+               "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
 
        return ret;
 }
@@ -851,126 +948,34 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
 
        bch2_trans_unlock(trans);
 
-       ret = bch2_fs_read_write_early(c);
+       ret =   bch2_fs_read_write_early(c) ?:
+               bch2_trans_relock(trans);
        if (ret)
                return ret;
 
-       if (!bch2_trans_relock(trans))
-               return -EINTR;
-
        percpu_ref_get(&c->writes);
        return 0;
 }
 
-static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
-                          bool overwrite)
-{
-       struct bkey             _deleted = KEY(0, 0, 0);
-       struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
-       struct bkey_s_c         old;
-       struct bkey             unpacked;
-       int ret = 0;
-
-       if ((i->flags & BTREE_TRIGGER_NORUN) ||
-           !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-               return 0;
-
-       if (!overwrite) {
-               if (i->insert_trigger_run)
-                       return 0;
-
-               BUG_ON(i->overwrite_trigger_run);
-               i->insert_trigger_run = true;
-       } else {
-               if (i->overwrite_trigger_run)
-                       return 0;
-
-               BUG_ON(!i->insert_trigger_run);
-               i->overwrite_trigger_run = true;
-       }
-
-       old = bch2_btree_path_peek_slot(i->path, &unpacked);
-       _deleted.p = i->path->pos;
-
-       if (overwrite) {
-               ret = bch2_trans_mark_key(trans, old, deleted,
-                               BTREE_TRIGGER_OVERWRITE|i->flags);
-       } else if (old.k->type == i->k->k.type &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-               i->overwrite_trigger_run = true;
-               ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
-                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
-       } else {
-               ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
-                               BTREE_TRIGGER_INSERT|i->flags);
-       }
-
-       if (ret == -EINTR)
-               trace_trans_restart_mark(trans->fn, _RET_IP_,
-                                        i->btree_id, &i->path->pos);
-       return ret ?: 1;
-}
-
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
-                             struct btree_insert_entry *btree_id_start)
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
 {
+       struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       bool trans_trigger_run;
-       int ret, overwrite;
-
-       for (overwrite = 0; overwrite < 2; overwrite++) {
-
-               /*
-                * Running triggers will append more updates to the list of updates as
-                * we're walking it:
-                */
-               do {
-                       trans_trigger_run = false;
-
-                       for (i = btree_id_start;
-                            i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
-                            i++) {
-                               ret = run_one_trigger(trans, i, overwrite);
-                               if (ret < 0)
-                                       return ret;
-                               if (ret)
-                                       trans_trigger_run = true;
-                       }
-               } while (trans_trigger_run);
-       }
-
-       return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
-       unsigned btree_id = 0;
        int ret = 0;
 
-       /*
-        *
-        * For a given btree, this algorithm runs insert triggers before
-        * overwrite triggers: this is so that when extents are being moved
-        * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
-        * they are re-added.
-        */
-       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
-               while (btree_id_start < trans->updates + trans->nr_updates &&
-                      btree_id_start->btree_id < btree_id)
-                       btree_id_start++;
-
-               ret = run_btree_triggers(trans, btree_id, btree_id_start);
+       trans_for_each_update(trans, i) {
+               ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
                if (ret)
-                       return ret;
+                       break;
        }
 
-       trans_for_each_update(trans, i)
-               BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
-                      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
-                      (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
-       return 0;
+       return ret;
 }
 
 int __bch2_trans_commit(struct btree_trans *trans)
@@ -981,62 +986,59 @@ int __bch2_trans_commit(struct btree_trans *trans)
        int ret = 0;
 
        if (!trans->nr_updates &&
-           !trans->extra_journal_entry_u64s)
+           !trans->extra_journal_entries.nr)
                goto out_reset;
 
        if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
                lockdep_assert_held(&c->gc_lock);
 
-       memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
-       trans->journal_u64s             = trans->extra_journal_entry_u64s;
-       trans->journal_preres_u64s      = 0;
-
-       trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+       ret = bch2_trans_commit_run_triggers(trans);
+       if (ret)
+               goto out_reset;
 
-       if (trans->journal_transaction_names)
-               trans->journal_u64s += JSET_ENTRY_LOG_U64s;
+       if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+               ret = do_bch2_trans_commit_to_journal_replay(trans);
+               goto out_reset;
+       }
 
        if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
-           unlikely(!percpu_ref_tryget(&c->writes))) {
+           unlikely(!percpu_ref_tryget_live(&c->writes))) {
                ret = bch2_trans_commit_get_rw_cold(trans);
                if (ret)
                        goto out_reset;
        }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       /*
-        * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
-        * from the key cache flush code:
-        */
-       trans_for_each_update(trans, i)
-               if (!i->cached &&
-                   !(i->flags & BTREE_TRIGGER_NORUN))
-                       bch2_btree_key_cache_verify_clean(trans,
-                                       i->btree_id, i->k->k.p);
-#endif
+       EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 
-       ret = bch2_trans_commit_run_triggers(trans);
-       if (ret)
-               goto out;
+       memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+
+       trans->journal_u64s             = trans->extra_journal_entries.nr;
+       trans->journal_preres_u64s      = 0;
+
+       /* For journalling transaction name: */
+       trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
 
        trans_for_each_update(trans, i) {
                BUG_ON(!i->path->should_be_locked);
 
-               if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
-                       trace_trans_restart_upgrade(trans->fn, _RET_IP_,
-                                                   i->btree_id, &i->path->pos);
-                       ret = btree_trans_restart(trans);
+               ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+               if (unlikely(ret))
                        goto out;
-               }
 
                BUG_ON(!btree_node_intent_locked(i->path, i->level));
 
+               if (i->key_cache_already_flushed)
+                       continue;
+
+               /* we're going to journal the key being updated: */
                u64s = jset_u64s(i->k->k.u64s);
                if (i->cached &&
                    likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
                        trans->journal_preres_u64s += u64s;
                trans->journal_u64s += u64s;
+
+               /* and we're also going to log the overwrite: */
+               trans->journal_u64s += jset_u64s(i->old_k.u64s);
        }
 
        if (trans->extra_journal_res) {
@@ -1058,24 +1060,20 @@ retry:
 
        if (ret)
                goto err;
+
+       trace_and_count(c, transaction_commit, trans, _RET_IP_);
 out:
        bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
        if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
                percpu_ref_put(&c->writes);
 out_reset:
-       trans_for_each_update(trans, i)
-               bch2_path_put(trans, i->path, true);
-
-       trans->extra_journal_res        = 0;
-       trans->nr_updates               = 0;
-       trans->hooks                    = NULL;
-       trans->extra_journal_entries    = NULL;
-       trans->extra_journal_entry_u64s = 0;
+       bch2_trans_reset_updates(trans);
 
        if (trans->fs_usage_deltas) {
                trans->fs_usage_deltas->used = 0;
-               memset(&trans->fs_usage_deltas->memset_start, 0,
+               memset((void *) trans->fs_usage_deltas +
+                      offsetof(struct replicas_delta_list, memset_start), 0,
                       (void *) &trans->fs_usage_deltas->memset_end -
                       (void *) &trans->fs_usage_deltas->memset_start);
        }
@@ -1089,7 +1087,7 @@ err:
        goto retry;
 }
 
-static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
                                          enum btree_id id,
                                          struct bpos pos)
 {
@@ -1098,12 +1096,6 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret;
 
-       if (!btree_type_has_snapshots(id))
-               return 0;
-
-       if (!snapshot_t(c, pos.snapshot)->children[0])
-               return 0;
-
        bch2_trans_iter_init(trans, &iter, id, pos,
                             BTREE_ITER_NOT_EXTENTS|
                             BTREE_ITER_ALL_SNAPSHOTS);
@@ -1129,6 +1121,18 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
        return ret;
 }
 
+static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
+                                         enum btree_id id,
+                                         struct bpos pos)
+{
+       if (!btree_type_has_snapshots(id) ||
+           pos.snapshot == U32_MAX ||
+           !snapshot_t(trans->c, pos.snapshot)->children[0])
+               return 0;
+
+       return __check_pos_snapshot_overwritten(trans, id, pos);
+}
+
 int bch2_trans_update_extent(struct btree_trans *trans,
                             struct btree_iter *orig_iter,
                             struct bkey_i *insert,
@@ -1146,7 +1150,7 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                             BTREE_ITER_INTENT|
                             BTREE_ITER_WITH_UPDATES|
                             BTREE_ITER_NOT_EXTENTS);
-       k = bch2_btree_iter_peek(&iter);
+       k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
        if ((ret = bkey_err(k)))
                goto err;
        if (!k.k)
@@ -1298,7 +1302,8 @@ nomerge1:
                        goto out;
                }
 next:
-               k = bch2_btree_iter_next(&iter);
+               bch2_btree_iter_advance(&iter);
+               k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
                if ((ret = bkey_err(k)))
                        goto err;
                if (!k.k)
@@ -1376,9 +1381,42 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
 }
 
 static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
-                         struct bkey_i *k, enum btree_update_flags flags)
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+                               struct bkey_i *k, enum btree_update_flags flags,
+                               unsigned long ip);
+
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+                                           struct btree_path *path,
+                                           struct btree_insert_entry *i,
+                                           enum btree_update_flags flags,
+                                           unsigned long ip)
 {
+       struct btree_path *btree_path;
+       int ret;
+
+       i->key_cache_already_flushed = true;
+       i->flags |= BTREE_TRIGGER_NORUN;
+
+       btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                  BTREE_ITER_INTENT, _THIS_IP_);
+
+       ret = bch2_btree_path_traverse(trans, btree_path, 0);
+       if (ret)
+               goto err;
+
+       btree_path_set_should_be_locked(btree_path);
+       ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip);
+err:
+       bch2_path_put(trans, btree_path, true);
+       return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+                               struct bkey_i *k, enum btree_update_flags flags,
+                               unsigned long ip)
+{
+       struct bch_fs *c = trans->c;
        struct btree_insert_entry *i, n;
 
        BUG_ON(!path->should_be_locked);
@@ -1394,7 +1432,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
                .cached         = path->cached,
                .path           = path,
                .k              = k,
-               .ip_allocated   = _RET_IP_,
+               .ip_allocated   = ip,
        };
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -1416,15 +1454,51 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
                BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
 
                bch2_path_put(trans, i->path, true);
-               *i = n;
-       } else
+               i->flags        = n.flags;
+               i->cached       = n.cached;
+               i->k            = n.k;
+               i->path         = n.path;
+               i->ip_allocated = n.ip_allocated;
+       } else {
                array_insert_item(trans->updates, trans->nr_updates,
                                  i - trans->updates, n);
 
-       __btree_path_get(n.path, true);
+               i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v;
+               i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+               if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+                       struct bkey_i *j_k =
+                               bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+                       if (j_k) {
+                               i->old_k = j_k->k;
+                               i->old_v = &j_k->v;
+                       }
+               }
+       }
+
+       __btree_path_get(i->path, true);
+
+       /*
+        * If a key is present in the key cache, it must also exist in the
+        * btree - this is necessary for cache coherency. When iterating over
+        * a btree that's cached in the key cache, the btree iter code checks
+        * the key cache - but the key has to exist in the btree for that to
+        * work:
+        */
+       if (unlikely(path->cached && bkey_deleted(&i->old_k)))
+               return flush_new_cached_update(trans, path, i, flags, ip);
+
        return 0;
 }
 
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+                         struct bkey_i *k, enum btree_update_flags flags)
+{
+       return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_);
+}
+
 int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                                   struct bkey_i *k, enum btree_update_flags flags)
 {
@@ -1446,6 +1520,9 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
                        k->k.type = KEY_TYPE_whiteout;
        }
 
+       /*
+        * Ensure that updates to cached btrees go to the key cache:
+        */
        if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
            !path->cached &&
            !path->level &&
@@ -1465,20 +1542,18 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
                                                        _THIS_IP_);
 
                        ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
-                                                      BTREE_ITER_CACHED|
-                                                      BTREE_ITER_CACHED_NOFILL);
+                                                      BTREE_ITER_CACHED);
                        if (unlikely(ret))
                                return ret;
 
                        ck = (void *) iter->key_cache_path->l[0].b;
 
                        if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                               trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
-                               btree_trans_restart(trans);
-                               return -EINTR;
+                               trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+                               return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
                        }
 
-                       iter->key_cache_path->should_be_locked = true;
+                       btree_path_set_should_be_locked(iter->key_cache_path);
                }
 
                path = iter->key_cache_path;
@@ -1524,8 +1599,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
                             __bch2_btree_insert(&trans, id, k));
 }
 
-int bch2_btree_delete_at(struct btree_trans *trans,
-                        struct btree_iter *iter, unsigned update_flags)
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+                               unsigned len, unsigned update_flags)
 {
        struct bkey_i *k;
 
@@ -1535,28 +1610,39 @@ int bch2_btree_delete_at(struct btree_trans *trans,
 
        bkey_init(&k->k);
        k->k.p = iter->pos;
+       bch2_key_resize(&k->k, len);
        return bch2_trans_update(trans, iter, k, update_flags);
 }
 
+int bch2_btree_delete_at(struct btree_trans *trans,
+                        struct btree_iter *iter, unsigned update_flags)
+{
+       return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
-                                 unsigned iter_flags,
+                                 unsigned update_flags,
                                  u64 *journal_seq)
 {
+       u32 restart_count = trans->restart_count;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
-retry:
-       while ((bch2_trans_begin(trans),
-              (k = bch2_btree_iter_peek(&iter)).k) &&
-              !(ret = bkey_err(k)) &&
-              bkey_cmp(iter.pos, end) < 0) {
+       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+       while ((k = bch2_btree_iter_peek(&iter)).k) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(trans->c, 0);
                struct bkey_i delete;
 
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (bkey_cmp(iter.pos, end) >= 0)
+                       break;
+
                bkey_init(&delete.k);
 
                /*
@@ -1585,23 +1671,31 @@ retry:
 
                        ret = bch2_extent_trim_atomic(trans, &iter, &delete);
                        if (ret)
-                               break;
+                               goto err;
                }
 
-               ret   = bch2_trans_update(trans, &iter, &delete, 0) ?:
+               ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
                        bch2_trans_commit(trans, &disk_res, journal_seq,
-                                       BTREE_INSERT_NOFAIL);
+                                         BTREE_INSERT_NOFAIL);
                bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+               /*
+                * the bch2_trans_begin() call is in a weird place because we
+                * need to call it after every transaction commit, to avoid path
+                * overflow, but don't want to call it if the delete operation
+                * is a no-op and we have no work to do:
+                */
+               bch2_trans_begin(trans);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       ret = 0;
                if (ret)
                        break;
        }
-
-       if (ret == -EINTR) {
-               ret = 0;
-               goto retry;
-       }
-
        bch2_trans_iter_exit(trans, &iter);
+
+       if (!ret && trans_was_restarted(trans, restart_count))
+               ret = -BCH_ERR_transaction_restart_nested;
        return ret;
 }
 
@@ -1612,10 +1706,40 @@ retry:
  */
 int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                            struct bpos start, struct bpos end,
-                           unsigned iter_flags,
+                           unsigned update_flags,
                            u64 *journal_seq)
 {
-       return bch2_trans_do(c, NULL, journal_seq, 0,
-                            bch2_btree_delete_range_trans(&trans, id, start, end,
-                                                          iter_flags, journal_seq));
+       int ret = bch2_trans_run(c,
+                       bch2_btree_delete_range_trans(&trans, id, start, end,
+                                                     update_flags, journal_seq));
+       if (ret == -BCH_ERR_transaction_restart_nested)
+               ret = 0;
+       return ret;
+}
+
+int bch2_trans_log_msg(struct btree_trans *trans, const char *msg)
+{
+       unsigned len = strlen(msg);
+       unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
+       struct jset_entry_log *l;
+       int ret;
+
+       ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s));
+       if (ret)
+               return ret;
+
+       l = (void *) &darray_top(trans->extra_journal_entries);
+       l->entry.u64s           = cpu_to_le16(u64s);
+       l->entry.btree_id       = 0;
+       l->entry.level          = 1;
+       l->entry.type           = BCH_JSET_ENTRY_log;
+       l->entry.pad[0]         = 0;
+       l->entry.pad[1]         = 0;
+       l->entry.pad[2]         = 0;
+       memcpy(l->d, msg, len);
+       while (len & 7)
+               l->d[len++] = '\0';
+
+       trans->extra_journal_entries.nr += jset_u64s(u64s);
+       return 0;
 }
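
The bch2_trans_log_msg() helper added above packs a log message into a BCH_JSET_ENTRY_log journal entry: the payload size is DIV_ROUND_UP(len, sizeof(u64)) words, the tail of the last word is zero-padded, and both the darray reservation and the final nr bump use jset_u64s(u64s). Below is a minimal standalone sketch of just that padding arithmetic, assuming nothing beyond standard C; log_msg_u64s() and the local DIV_ROUND_UP macro are illustrative only, not bcachefs APIs, and the sketch is not part of this patch.

/* Standalone illustration of rounding a message up to whole u64 words. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned log_msg_u64s(const char *msg, uint64_t *buf, unsigned buf_u64s)
{
	unsigned len  = strlen(msg);
	unsigned u64s = DIV_ROUND_UP(len, sizeof(uint64_t));
	char *d = (char *) buf;

	if (u64s > buf_u64s)
		return 0;		/* caller's buffer too small */

	memcpy(d, msg, len);
	while (len & 7)			/* zero-pad up to the next u64 boundary */
		d[len++] = '\0';

	return u64s;			/* u64 words occupied by the padded message */
}

int main(void)
{
	uint64_t buf[8];

	/* "hello bcachefs" is 14 bytes -> 2 u64 words, last 2 bytes zeroed */
	printf("%u\n", log_msg_u64s("hello bcachefs", buf, 8));
	return 0;
}
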
index eb0eaa983dc9f665c3a0c384f49c3a1b1f605160..116711fc01fb30f501ad206fb9c6a7f70362ca3f 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "bcachefs.h"
 #include "alloc_background.h"
+#include "backpointers.h"
 #include "bset.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -88,20 +89,17 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
                            : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
 {
        struct bch_fs *c = ca->fs;
-       struct bch_dev_usage ret;
        unsigned seq, i, u64s = dev_usage_u64s();
 
        do {
                seq = read_seqcount_begin(&c->usage_lock);
-               memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+               memcpy(usage, ca->usage_base, u64s * sizeof(u64));
                for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
-                       acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+                       acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
        } while (read_seqcount_retry(&c->usage_lock, seq));
-
-       return ret;
 }
 
 static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
@@ -197,26 +195,26 @@ void bch2_fs_usage_to_text(struct printbuf *out,
 {
        unsigned i;
 
-       pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
+       prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
 
-       pr_buf(out, "hidden:\t\t\t\t%llu\n",
+       prt_printf(out, "hidden:\t\t\t\t%llu\n",
               fs_usage->u.hidden);
-       pr_buf(out, "data:\t\t\t\t%llu\n",
+       prt_printf(out, "data:\t\t\t\t%llu\n",
               fs_usage->u.data);
-       pr_buf(out, "cached:\t\t\t\t%llu\n",
+       prt_printf(out, "cached:\t\t\t\t%llu\n",
               fs_usage->u.cached);
-       pr_buf(out, "reserved:\t\t\t%llu\n",
+       prt_printf(out, "reserved:\t\t\t%llu\n",
               fs_usage->u.reserved);
-       pr_buf(out, "nr_inodes:\t\t\t%llu\n",
+       prt_printf(out, "nr_inodes:\t\t\t%llu\n",
               fs_usage->u.nr_inodes);
-       pr_buf(out, "online reserved:\t\t%llu\n",
+       prt_printf(out, "online reserved:\t\t%llu\n",
               fs_usage->online_reserved);
 
        for (i = 0;
             i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
             i++) {
-               pr_buf(out, "%u replicas:\n", i + 1);
-               pr_buf(out, "\treserved:\t\t%llu\n",
+               prt_printf(out, "%u replicas:\n", i + 1);
+               prt_printf(out, "\treserved:\t\t%llu\n",
                       fs_usage->u.persistent_reserved[i]);
        }
 
@@ -224,9 +222,9 @@ void bch2_fs_usage_to_text(struct printbuf *out,
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
 
-               pr_buf(out, "\t");
+               prt_printf(out, "\t");
                bch2_replicas_entry_to_text(out, e);
-               pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
+               prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
        }
 }
 
@@ -279,44 +277,22 @@ bch2_fs_usage_read_short(struct bch_fs *c)
        return ret;
 }
 
-static inline int is_unavailable_bucket(struct bucket_mark m)
+void bch2_dev_usage_init(struct bch_dev *ca)
 {
-       return !is_available_bucket(m);
+       ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
 }
 
 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
-                                           struct bucket_mark m)
+                                           struct bch_alloc_v4 a)
 {
-       return m.dirty_sectors
-               ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
+       return a.dirty_sectors
+               ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
                : 0;
 }
 
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
-       return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
-static inline enum bch_data_type bucket_type(struct bucket_mark m)
-{
-       return m.cached_sectors && !m.dirty_sectors
-               ? BCH_DATA_cached
-               : m.data_type;
-}
-
-static inline void account_bucket(struct bch_fs_usage *fs_usage,
-                                 struct bch_dev_usage *dev_usage,
-                                 enum bch_data_type type,
-                                 int nr, s64 size)
-{
-       if (type == BCH_DATA_sb || type == BCH_DATA_journal)
-               fs_usage->hidden        += size;
-
-       dev_usage->d[type].buckets      += nr;
-}
-
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-                                 struct bucket_mark old, struct bucket_mark new,
+                                 struct bch_alloc_v4 old,
+                                 struct bch_alloc_v4 new,
                                  u64 journal_seq, bool gc)
 {
        struct bch_fs_usage *fs_usage;
@@ -324,32 +300,52 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 
        preempt_disable();
        fs_usage = fs_usage_ptr(c, journal_seq, gc);
-       u = dev_usage_ptr(ca, journal_seq, gc);
 
-       if (bucket_type(old))
-               account_bucket(fs_usage, u, bucket_type(old),
-                              -1, -ca->mi.bucket_size);
+       if (data_type_is_hidden(old.data_type))
+               fs_usage->hidden -= ca->mi.bucket_size;
+       if (data_type_is_hidden(new.data_type))
+               fs_usage->hidden += ca->mi.bucket_size;
 
-       if (bucket_type(new))
-               account_bucket(fs_usage, u, bucket_type(new),
-                              1, ca->mi.bucket_size);
+       u = dev_usage_ptr(ca, journal_seq, gc);
 
-       u->buckets_ec += (int) new.stripe - (int) old.stripe;
-       u->buckets_unavailable +=
-               is_unavailable_bucket(new) - is_unavailable_bucket(old);
+       u->d[old.data_type].buckets--;
+       u->d[new.data_type].buckets++;
+
+       u->buckets_ec -= (int) !!old.stripe;
+       u->buckets_ec += (int) !!new.stripe;
 
        u->d[old.data_type].sectors -= old.dirty_sectors;
        u->d[new.data_type].sectors += new.dirty_sectors;
-       u->d[BCH_DATA_cached].sectors +=
-               (int) new.cached_sectors - (int) old.cached_sectors;
+
+       u->d[BCH_DATA_cached].sectors += new.cached_sectors;
+       u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
 
        u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
        u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
 
        preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+                                   struct bucket old, struct bucket new,
+                                   u64 journal_seq, bool gc)
+{
+       struct bch_alloc_v4 old_a = {
+               .gen            = old.gen,
+               .data_type      = old.data_type,
+               .dirty_sectors  = old.dirty_sectors,
+               .cached_sectors = old.cached_sectors,
+               .stripe         = old.stripe,
+       };
+       struct bch_alloc_v4 new_a = {
+               .gen            = new.gen,
+               .data_type      = new.data_type,
+               .dirty_sectors  = new.dirty_sectors,
+               .cached_sectors = new.cached_sectors,
+               .stripe         = new.stripe,
+       };
 
-       if (!is_available_bucket(old) && is_available_bucket(new))
-               bch2_wake_allocator(ca);
+       bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
 }
 
 static inline int __update_replicas(struct bch_fs *c,
@@ -373,22 +369,22 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
 {
        struct bch_fs_usage __percpu *fs_usage;
        int idx, ret = 0;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
 
        percpu_down_read(&c->mark_lock);
+       buf.atomic++;
 
        idx = bch2_replicas_entry_idx(c, r);
        if (idx < 0 &&
-           (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-            fsck_err(c, "no replicas entry\n"
-                     "  while marking %s",
-                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+           fsck_err(c, "no replicas entry\n"
+                    "  while marking %s",
+                    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                percpu_up_read(&c->mark_lock);
                ret = bch2_mark_replicas(c, r);
-               if (ret)
-                       return ret;
-
                percpu_down_read(&c->mark_lock);
+
+               if (ret)
+                       goto err;
                idx = bch2_replicas_entry_idx(c, r);
        }
        if (idx < 0) {
@@ -404,6 +400,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
 err:
 fsck_err:
        percpu_up_read(&c->mark_lock);
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -467,7 +464,8 @@ static inline void update_replicas_list(struct btree_trans *trans,
 
        n = (void *) d->d + d->used;
        n->delta = sectors;
-       memcpy(&n->r, r, replicas_entry_bytes(r));
+       memcpy((void *) n + offsetof(struct replicas_delta, r),
+              r, replicas_entry_bytes(r));
        bch2_replicas_entry_sort(&n->r);
        d->used += b;
 }
@@ -482,31 +480,15 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
        update_replicas_list(trans, &r.e, sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, bool owned_by_allocator)
-{
-       struct bucket *g = bucket(ca, b);
-       struct bucket_mark old, new;
-
-       old = bucket_cmpxchg(g, new, ({
-               new.owned_by_allocator  = owned_by_allocator;
-       }));
-
-       BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
-static int bch2_mark_alloc(struct btree_trans *trans,
-                          struct bkey_s_c old, struct bkey_s_c new,
-                          unsigned flags)
+int bch2_mark_alloc(struct btree_trans *trans,
+                   struct bkey_s_c old, struct bkey_s_c new,
+                   unsigned flags)
 {
        bool gc = flags & BTREE_TRIGGER_GC;
        u64 journal_seq = trans->journal_res.seq;
        struct bch_fs *c = trans->c;
-       struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
-       struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
+       struct bch_alloc_v4 old_a, new_a;
        struct bch_dev *ca;
-       struct bucket *g;
-       struct bucket_mark old_m, m;
        int ret = 0;
 
        /*
@@ -516,11 +498,20 @@ static int bch2_mark_alloc(struct btree_trans *trans,
            !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
                return 0;
 
+       if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+                                      "alloc key for invalid device or bucket"))
+               return -EIO;
+
+       ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+       bch2_alloc_to_v4(old, &old_a);
+       bch2_alloc_to_v4(new, &new_a);
+
        if ((flags & BTREE_TRIGGER_INSERT) &&
-           !old_u.data_type != !new_u.data_type &&
-           new.k->type == KEY_TYPE_alloc_v3) {
-               struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
-               u64 old_journal_seq = le64_to_cpu(v->journal_seq);
+           data_type_is_empty(old_a.data_type) !=
+           data_type_is_empty(new_a.data_type) &&
+           new.k->type == KEY_TYPE_alloc_v4) {
+               struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
 
                BUG_ON(!journal_seq);
 
@@ -529,18 +520,20 @@ static int bch2_mark_alloc(struct btree_trans *trans,
                 * before the bucket became empty again, then we don't have
                 * to wait on a journal flush before we can reuse the bucket:
                 */
-               new_u.journal_seq = !new_u.data_type &&
-                       (journal_seq == old_journal_seq ||
-                        bch2_journal_noflush_seq(&c->journal, old_journal_seq))
+               new_a.journal_seq = data_type_is_empty(new_a.data_type) &&
+                       (journal_seq == v->journal_seq ||
+                        bch2_journal_noflush_seq(&c->journal, v->journal_seq))
                        ? 0 : journal_seq;
-               v->journal_seq = cpu_to_le64(new_u.journal_seq);
+               v->journal_seq = new_a.journal_seq;
        }
 
-       if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+       if (!data_type_is_empty(old_a.data_type) &&
+           data_type_is_empty(new_a.data_type) &&
+           new_a.journal_seq) {
                ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
                                c->journal.flushed_seq_ondisk,
-                               new_u.dev, new_u.bucket,
-                               new_u.journal_seq);
+                               new.k->p.inode, new.k->p.offset,
+                               new_a.journal_seq);
                if (ret) {
                        bch2_fs_fatal_error(c,
                                "error setting bucket_needs_journal_commit: %i", ret);
@@ -548,33 +541,27 @@ static int bch2_mark_alloc(struct btree_trans *trans,
                }
        }
 
-       ca = bch_dev_bkey_exists(c, new_u.dev);
+       percpu_down_read(&c->mark_lock);
+       if (!gc && new_a.gen != old_a.gen)
+               *bucket_gen(ca, new.k->p.offset) = new_a.gen;
 
-       if (new_u.bucket >= ca->mi.nbuckets)
-               return 0;
+       bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
 
-       percpu_down_read(&c->mark_lock);
-       if (!gc && new_u.gen != old_u.gen)
-               *bucket_gen(ca, new_u.bucket) = new_u.gen;
-
-       g = __bucket(ca, new_u.bucket, gc);
-
-       old_m = bucket_cmpxchg(g, m, ({
-               m.gen                   = new_u.gen;
-               m.data_type             = new_u.data_type;
-               m.dirty_sectors         = new_u.dirty_sectors;
-               m.cached_sectors        = new_u.cached_sectors;
-               m.stripe                = new_u.stripe != 0;
-       }));
-
-       bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
-
-       g->io_time[READ]        = new_u.read_time;
-       g->io_time[WRITE]       = new_u.write_time;
-       g->oldest_gen           = new_u.oldest_gen;
-       g->gen_valid            = 1;
-       g->stripe               = new_u.stripe;
-       g->stripe_redundancy    = new_u.stripe_redundancy;
+       if (gc) {
+               struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+               bucket_lock(g);
+
+               g->gen_valid            = 1;
+               g->gen                  = new_a.gen;
+               g->data_type            = new_a.data_type;
+               g->stripe               = new_a.stripe;
+               g->stripe_redundancy    = new_a.stripe_redundancy;
+               g->dirty_sectors        = new_a.dirty_sectors;
+               g->cached_sectors       = new_a.cached_sectors;
+
+               bucket_unlock(g);
+       }
        percpu_up_read(&c->mark_lock);
 
        /*
@@ -583,40 +570,42 @@ static int bch2_mark_alloc(struct btree_trans *trans,
         */
 
        if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
-           old_m.cached_sectors) {
+           old_a.cached_sectors) {
                ret = update_cached_sectors(c, new, ca->dev_idx,
-                                           -old_m.cached_sectors,
+                                           -((s64) old_a.cached_sectors),
                                            journal_seq, gc);
                if (ret) {
                        bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
                        return ret;
                }
-
-               trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
-                                old_m.cached_sectors);
        }
 
+       if (new_a.data_type == BCH_DATA_free &&
+           (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+               closure_wake_up(&c->freelist_wait);
+
+       if (new_a.data_type == BCH_DATA_need_discard &&
+           (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+               bch2_do_discards(c);
+
+       if (old_a.data_type != BCH_DATA_cached &&
+           new_a.data_type == BCH_DATA_cached &&
+           should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+               bch2_do_invalidates(c);
+
+       if (new_a.data_type == BCH_DATA_need_gc_gens)
+               bch2_do_gc_gens(c);
+
        return 0;
 }
 
-#define checked_add(a, b)                                      \
-({                                                             \
-       unsigned _res = (unsigned) (a) + (b);                   \
-       bool overflow = _res > U16_MAX;                         \
-       if (overflow)                                           \
-               _res = U16_MAX;                                 \
-       (a) = _res;                                             \
-       overflow;                                               \
-})
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
-                              size_t b, enum bch_data_type data_type,
-                              unsigned sectors, struct gc_pos pos,
-                              unsigned flags)
+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+                             size_t b, enum bch_data_type data_type,
+                             unsigned sectors, struct gc_pos pos,
+                             unsigned flags)
 {
-       struct bucket *g;
-       struct bucket_mark old, new;
-       bool overflow;
+       struct bucket old, new, *g;
+       int ret = 0;
 
        BUG_ON(!(flags & BTREE_TRIGGER_GC));
        BUG_ON(data_type != BCH_DATA_sb &&
@@ -626,40 +615,42 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
         * Backup superblock might be past the end of our normal usable space:
         */
        if (b >= ca->mi.nbuckets)
-               return;
+               return 0;
 
        percpu_down_read(&c->mark_lock);
        g = gc_bucket(ca, b);
-       old = bucket_cmpxchg(g, new, ({
-               new.data_type   = data_type;
-               overflow = checked_add(new.dirty_sectors, sectors);
-       }));
-
-       bch2_fs_inconsistent_on(old.data_type &&
-                               old.data_type != data_type, c,
-               "different types of data in same bucket: %s, %s",
-               bch2_data_types[old.data_type],
-               bch2_data_types[data_type]);
-
-       bch2_fs_inconsistent_on(overflow, c,
-               "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
-               ca->dev_idx, b, new.gen,
-               bch2_data_types[old.data_type ?: data_type],
-               old.dirty_sectors, sectors);
-
-       bch2_dev_usage_update(c, ca, old, new, 0, true);
-       percpu_up_read(&c->mark_lock);
-}
 
-static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
-       EBUG_ON(sectors < 0);
+       bucket_lock(g);
+       old = *g;
+
+       if (bch2_fs_inconsistent_on(g->data_type &&
+                       g->data_type != data_type, c,
+                       "different types of data in same bucket: %s, %s",
+                       bch2_data_types[g->data_type],
+                       bch2_data_types[data_type])) {
+               ret = -EIO;
+               goto err;
+       }
+
+       if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+                       "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
+                       ca->dev_idx, b, g->gen,
+                       bch2_data_types[g->data_type ?: data_type],
+                       g->dirty_sectors, sectors)) {
+               ret = -EIO;
+               goto err;
+       }
+
 
-       return p.crc.compression_type &&
-               p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
-               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
-                              p.crc.uncompressed_size)
-               : sectors;
+       g->data_type = data_type;
+       g->dirty_sectors += sectors;
+       new = *g;
+err:
+       bucket_unlock(g);
+       if (!ret)
+               bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+       percpu_up_read(&c->mark_lock);
+       return ret;
 }
 
 static int check_bucket_ref(struct bch_fs *c,
@@ -667,14 +658,22 @@ static int check_bucket_ref(struct bch_fs *c,
                            const struct bch_extent_ptr *ptr,
                            s64 sectors, enum bch_data_type ptr_data_type,
                            u8 b_gen, u8 bucket_data_type,
-                           u16 dirty_sectors, u16 cached_sectors)
+                           u32 dirty_sectors, u32 cached_sectors)
 {
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
        u16 bucket_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       if (bucket_data_type == BCH_DATA_cached)
+               bucket_data_type = BCH_DATA_user;
+
+       if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
+           (bucket_data_type == BCH_DATA_user   && ptr_data_type == BCH_DATA_stripe))
+               bucket_data_type = ptr_data_type = BCH_DATA_stripe;
 
        if (gen_after(ptr->gen, b_gen)) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@@ -683,8 +682,9 @@ static int check_bucket_ref(struct bch_fs *c,
                        ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        ptr->gen,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-               return -EIO;
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               ret = -EIO;
+               goto err;
        }
 
        if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
@@ -694,8 +694,10 @@ static int check_bucket_ref(struct bch_fs *c,
                        ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        ptr->gen,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-               return -EIO;
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               ret = -EIO;
+               goto err;
        }
 
        if (b_gen != ptr->gen && !ptr->cached) {
@@ -706,14 +708,19 @@ static int check_bucket_ref(struct bch_fs *c,
                        *bucket_gen(ca, bucket_nr),
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        ptr->gen,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-               return -EIO;
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               ret = -EIO;
+               goto err;
        }
 
-       if (b_gen != ptr->gen)
-               return 1;
+       if (b_gen != ptr->gen) {
+               ret = 1;
+               goto err;
+       }
 
-       if (bucket_data_type && ptr_data_type &&
+       if (!data_type_is_empty(bucket_data_type) &&
+           ptr_data_type &&
            bucket_data_type != ptr_data_type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
@@ -721,22 +728,27 @@ static int check_bucket_ref(struct bch_fs *c,
                        ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type],
                        bch2_data_types[ptr_data_type],
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-               return -EIO;
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               ret = -EIO;
+               goto err;
        }
 
-       if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
+       if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        bucket_sectors, sectors,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-               return -EIO;
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               ret = -EIO;
+               goto err;
        }
-
-       return 0;
+err:
+       printbuf_exit(&buf);
+       return ret;
 }
 
 static int mark_stripe_bucket(struct btree_trans *trans,
@@ -753,9 +765,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
        s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
        const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bucket *g;
-       struct bucket_mark new, old;
-       char buf[200];
+       struct bucket old, new, *g;
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -763,40 +774,42 @@ static int mark_stripe_bucket(struct btree_trans *trans,
        /* * XXX doesn't handle deletion */
 
        percpu_down_read(&c->mark_lock);
+       buf.atomic++;
        g = PTR_GC_BUCKET(ca, ptr);
 
-       if (g->mark.dirty_sectors ||
+       if (g->dirty_sectors ||
            (g->stripe && g->stripe != k.k->p.offset)) {
                bch2_fs_inconsistent(c,
                              "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
-                             ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
-                             (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+                             ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+                             (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
                ret = -EINVAL;
                goto err;
        }
 
-       old = bucket_cmpxchg(g, new, ({
-               ret = check_bucket_ref(c, k, ptr, sectors, data_type,
-                                      new.gen, new.data_type,
-                                      new.dirty_sectors, new.cached_sectors);
-               if (ret)
-                       goto err;
+       bucket_lock(g);
+       old = *g;
 
-               new.dirty_sectors += sectors;
-               if (data_type)
-                       new.data_type           = data_type;
+       ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+                              g->gen, g->data_type,
+                              g->dirty_sectors, g->cached_sectors);
+       if (ret)
+               goto err;
 
-               new.stripe = true;
-       }));
+       if (data_type)
+               g->data_type = data_type;
+       g->dirty_sectors += sectors;
 
        g->stripe               = k.k->p.offset;
        g->stripe_redundancy    = s->nr_redundant;
-
-       bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+       new = *g;
 err:
+       bucket_unlock(g);
+       if (!ret)
+               bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
        percpu_up_read(&c->mark_lock);
-
-       return 0;
+       printbuf_exit(&buf);
+       return ret;
 }
 
 static int __mark_pointer(struct btree_trans *trans,
@@ -804,9 +817,9 @@ static int __mark_pointer(struct btree_trans *trans,
                          const struct bch_extent_ptr *ptr,
                          s64 sectors, enum bch_data_type ptr_data_type,
                          u8 bucket_gen, u8 *bucket_data_type,
-                         u16 *dirty_sectors, u16 *cached_sectors)
+                         u32 *dirty_sectors, u32 *cached_sectors)
 {
-       u16 *dst_sectors = !ptr->cached
+       u32 *dst_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
        int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
@@ -830,43 +843,31 @@ static int bch2_mark_pointer(struct btree_trans *trans,
 {
        u64 journal_seq = trans->journal_res.seq;
        struct bch_fs *c = trans->c;
-       struct bucket_mark old, new;
        struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-       struct bucket *g;
+       struct bucket old, new, *g;
        u8 bucket_data_type;
-       u64 v;
        int ret = 0;
 
        BUG_ON(!(flags & BTREE_TRIGGER_GC));
 
        percpu_down_read(&c->mark_lock);
        g = PTR_GC_BUCKET(ca, &p.ptr);
-
-       v = atomic64_read(&g->_mark.v);
-       do {
-               new.v.counter = old.v.counter = v;
-               bucket_data_type = new.data_type;
-
-               ret = __mark_pointer(trans, k, &p.ptr, sectors,
-                                    data_type, new.gen,
-                                    &bucket_data_type,
-                                    &new.dirty_sectors,
-                                    &new.cached_sectors);
-               if (ret)
-                       goto err;
-
-               new.data_type = bucket_data_type;
-
-               if (flags & BTREE_TRIGGER_NOATOMIC) {
-                       g->_mark = new;
-                       break;
-               }
-       } while ((v = atomic64_cmpxchg(&g->_mark.v,
-                             old.v.counter,
-                             new.v.counter)) != old.v.counter);
-
-       bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
-err:
+       bucket_lock(g);
+       old = *g;
+
+       bucket_data_type = g->data_type;
+       ret = __mark_pointer(trans, k, &p.ptr, sectors,
+                            data_type, g->gen,
+                            &bucket_data_type,
+                            &g->dirty_sectors,
+                            &g->cached_sectors);
+       if (!ret)
+               g->data_type = bucket_data_type;
+
+       new = *g;
+       bucket_unlock(g);
+       if (!ret)
+               bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
        percpu_up_read(&c->mark_lock);
 
        return ret;
@@ -913,13 +914,13 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_mark_extent(struct btree_trans *trans,
-                           struct bkey_s_c old, struct bkey_s_c new,
-                           unsigned flags)
+int bch2_mark_extent(struct btree_trans *trans,
+                    struct bkey_s_c old, struct bkey_s_c new,
+                    unsigned flags)
 {
        u64 journal_seq = trans->journal_res.seq;
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -983,10 +984,11 @@ static int bch2_mark_extent(struct btree_trans *trans,
        if (r.e.nr_devs) {
                ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
                if (ret) {
-                       char buf[200];
+                       struct printbuf buf = PRINTBUF;
 
-                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
-                       bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+                       bch2_bkey_val_to_text(&buf, c, k);
+                       bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+                       printbuf_exit(&buf);
                        return ret;
                }
        }
@@ -994,9 +996,9 @@ static int bch2_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_mark_stripe(struct btree_trans *trans,
-                           struct bkey_s_c old, struct bkey_s_c new,
-                           unsigned flags)
+int bch2_mark_stripe(struct btree_trans *trans,
+                    struct bkey_s_c old, struct bkey_s_c new,
+                    unsigned flags)
 {
        bool gc = flags & BTREE_TRIGGER_GC;
        u64 journal_seq = trans->journal_res.seq;
@@ -1015,13 +1017,16 @@ static int bch2_mark_stripe(struct btree_trans *trans,
                struct stripe *m = genradix_ptr(&c->stripes, idx);
 
                if (!m || (old_s && !m->alive)) {
-                       char buf1[200], buf2[200];
+                       struct printbuf buf1 = PRINTBUF;
+                       struct printbuf buf2 = PRINTBUF;
 
-                       bch2_bkey_val_to_text(&PBUF(buf1), c, old);
-                       bch2_bkey_val_to_text(&PBUF(buf2), c, new);
+                       bch2_bkey_val_to_text(&buf1, c, old);
+                       bch2_bkey_val_to_text(&buf2, c, new);
                        bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
                                            "old %s\n"
-                                           "new %s", idx, buf1, buf2);
+                                           "new %s", idx, buf1.buf, buf2.buf);
+                       printbuf_exit(&buf2);
+                       printbuf_exit(&buf1);
                        bch2_inconsistent_error(c);
                        return -1;
                }
@@ -1086,10 +1091,11 @@ static int bch2_mark_stripe(struct btree_trans *trans,
                                      ((s64) m->sectors * m->nr_redundant),
                                      journal_seq, gc);
                if (ret) {
-                       char buf[200];
+                       struct printbuf buf = PRINTBUF;
 
-                       bch2_bkey_val_to_text(&PBUF(buf), c, new);
-                       bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+                       bch2_bkey_val_to_text(&buf, c, new);
+                       bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+                       printbuf_exit(&buf);
                        return ret;
                }
        }
@@ -1097,19 +1103,19 @@ static int bch2_mark_stripe(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_mark_inode(struct btree_trans *trans,
-                          struct bkey_s_c old, struct bkey_s_c new,
-                          unsigned flags)
+int bch2_mark_inode(struct btree_trans *trans,
+                   struct bkey_s_c old, struct bkey_s_c new,
+                   unsigned flags)
 {
        struct bch_fs *c = trans->c;
        struct bch_fs_usage __percpu *fs_usage;
        u64 journal_seq = trans->journal_res.seq;
 
        if (flags & BTREE_TRIGGER_INSERT) {
-               struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
+               struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
 
                BUG_ON(!journal_seq);
-               BUG_ON(new.k->type != KEY_TYPE_inode_v2);
+               BUG_ON(new.k->type != KEY_TYPE_inode_v3);
 
                v->bi_journal_seq = cpu_to_le64(journal_seq);
        }
@@ -1128,12 +1134,12 @@ static int bch2_mark_inode(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_mark_reservation(struct btree_trans *trans,
-                                struct bkey_s_c old, struct bkey_s_c new,
-                                unsigned flags)
+int bch2_mark_reservation(struct btree_trans *trans,
+                         struct bkey_s_c old, struct bkey_s_c new,
+                         unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bch_fs_usage __percpu *fs_usage;
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
@@ -1160,18 +1166,24 @@ static int bch2_mark_reservation(struct btree_trans *trans,
        return 0;
 }
 
-static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+                                struct bkey_s_c_reflink_p p,
+                                u64 start, u64 end,
                                 u64 *idx, unsigned flags, size_t r_idx)
 {
+       struct bch_fs *c = trans->c;
        struct reflink_gc *r;
        int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+       u64 next_idx = end;
        s64 ret = 0;
+       struct printbuf buf = PRINTBUF;
 
        if (r_idx >= c->reflink_gc_nr)
                goto not_found;
 
        r = genradix_ptr(&c->reflink_gc_table, r_idx);
-       if (*idx < r->offset - r->size)
+       next_idx = min(next_idx, r->offset - r->size);
+       if (*idx < next_idx)
                goto not_found;
 
        BUG_ON((s64) r->refcount + add < 0);
@@ -1180,37 +1192,37 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
        *idx = r->offset;
        return 0;
 not_found:
-       *idx = U64_MAX;
-       ret = -EIO;
-
-       /*
-        * XXX: we're replacing the entire reflink pointer with an error
-        * key, we should just be replacing the part that was missing:
-        */
-       if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
-                    p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
+       if (fsck_err(c, "pointer to missing indirect extent\n"
+                    "  %s\n"
+                    "  missing range %llu-%llu",
+                    (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+                    *idx, next_idx)) {
                struct bkey_i_error new;
 
                bkey_init(&new.k);
                new.k.type      = KEY_TYPE_error;
-               new.k.p         = p.k->p;
-               new.k.size      = p.k->size;
-               ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
+               new.k.p         = bkey_start_pos(p.k);
+               new.k.p.offset += *idx - start;
+               bch2_key_resize(&new.k, next_idx - *idx);
+               ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i);
        }
+
+       *idx = next_idx;
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
 
-static int bch2_mark_reflink_p(struct btree_trans *trans,
-                              struct bkey_s_c old, struct bkey_s_c new,
-                              unsigned flags)
+int bch2_mark_reflink_p(struct btree_trans *trans,
+                       struct bkey_s_c old, struct bkey_s_c new,
+                       unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        struct reflink_gc *ref;
        size_t l, r, m;
-       u64 idx = le64_to_cpu(p.v->idx);
+       u64 idx = le64_to_cpu(p.v->idx), start = idx;
        u64 end = le64_to_cpu(p.v->idx) + p.k->size;
        int ret = 0;
 
@@ -1234,73 +1246,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans,
        }
 
        while (idx < end && !ret)
-               ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
-
-       return ret;
-}
-
-int bch2_mark_key(struct btree_trans *trans,
-                 struct bkey_s_c old,
-                 struct bkey_s_c new,
-                 unsigned flags)
-{
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
-       switch (k.k->type) {
-       case KEY_TYPE_alloc:
-       case KEY_TYPE_alloc_v2:
-       case KEY_TYPE_alloc_v3:
-               return bch2_mark_alloc(trans, old, new, flags);
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               return bch2_mark_extent(trans, old, new, flags);
-       case KEY_TYPE_stripe:
-               return bch2_mark_stripe(trans, old, new, flags);
-       case KEY_TYPE_inode:
-       case KEY_TYPE_inode_v2:
-               return bch2_mark_inode(trans, old, new, flags);
-       case KEY_TYPE_reservation:
-               return bch2_mark_reservation(trans, old, new, flags);
-       case KEY_TYPE_reflink_p:
-               return bch2_mark_reflink_p(trans, old, new, flags);
-       case KEY_TYPE_snapshot:
-               return bch2_mark_snapshot(trans, old, new, flags);
-       default:
-               return 0;
-       }
-}
-
-int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
-                    struct bkey_i *new, unsigned flags)
-{
-       struct bkey             _deleted = KEY(0, 0, 0);
-       struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
-       struct bkey_s_c         old;
-       struct bkey             unpacked;
-       int ret;
-
-       _deleted.p = path->pos;
-
-       if (unlikely(flags & BTREE_TRIGGER_NORUN))
-               return 0;
-
-       if (!btree_node_type_needs_gc(path->btree_id))
-               return 0;
-
-       old = bch2_btree_path_peek_slot(path, &unpacked);
-
-       if (old.k->type == new->k.type &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-               ret   = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
-                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
-       } else {
-               ret   = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
-                               BTREE_TRIGGER_INSERT|flags) ?:
-                       bch2_mark_key(trans, old, deleted,
-                               BTREE_TRIGGER_OVERWRITE|flags);
-       }
+               ret = __bch2_mark_reflink_p(trans, p, start, end,
+                                           &idx, flags, l++);
 
        return ret;
 }
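
Note on the printbuf conversions in the hunks above: the fixed char buf[200] stack buffers and PBUF() are replaced by heap-backed printbufs that are reset between messages and released with printbuf_exit() on every return path. Below is a rough, self-contained model of that lifecycle; struct printbuf_model and its helpers are illustrative stand-ins invented here, not the bcachefs API, and only mirror the calls visible in this diff (PRINTBUF, printbuf_reset(), printbuf_exit(), buf.buf).

	/*
	 * Minimal model of the printbuf lifecycle used above (not the real
	 * bcachefs implementation): a heap-backed, growable string that the
	 * caller resets between uses and frees exactly once on exit.
	 */
	#include <stdarg.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct printbuf_model {
		char	*buf;	/* NUL-terminated output, like buf.buf in the diff */
		size_t	 len;
	};

	#define PRINTBUF_MODEL ((struct printbuf_model) { NULL, 0 })

	static void printbuf_model_reset(struct printbuf_model *p)
	{
		p->len = 0;
		if (p->buf)
			p->buf[0] = '\0';
	}

	static void printbuf_model_exit(struct printbuf_model *p)
	{
		free(p->buf);
		p->buf = NULL;
	}

	static void prt_model_printf(struct printbuf_model *p, const char *fmt, ...)
	{
		va_list args;
		char *tmp;
		int n;

		/* size the formatted text, then grow the buffer to fit it */
		va_start(args, fmt);
		n = vsnprintf(NULL, 0, fmt, args);
		va_end(args);
		if (n < 0)
			return;

		tmp = realloc(p->buf, p->len + n + 1);
		if (!tmp)
			return;
		p->buf = tmp;

		va_start(args, fmt);
		vsnprintf(p->buf + p->len, n + 1, fmt, args);
		va_end(args);
		p->len += n;
	}

	int main(void)
	{
		struct printbuf_model buf = PRINTBUF_MODEL;

		prt_model_printf(&buf, "bucket %u:%zu gen %u", 0, (size_t) 42, 3);
		printf("error: %s\n", buf.buf);	/* use the formatted text */

		printbuf_model_reset(&buf);	/* reuse for the next message */
		prt_model_printf(&buf, "second message");
		printf("error: %s\n", buf.buf);

		printbuf_model_exit(&buf);	/* single free on every exit path */
		return 0;
	}

The property the converted call sites rely on is that printbuf_exit() is reached on every path (hence the goto err pattern replacing early returns), since the output is no longer a stack array that vanishes on return.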
@@ -1312,33 +1259,26 @@ void fs_usage_apply_warn(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
 
        bch_err(c, "disk usage increased %lli more than %u sectors reserved",
                should_not_have_added, disk_res_sectors);
 
        trans_for_each_update(trans, i) {
+               struct bkey_s_c old = { &i->old_k, i->old_v };
+
                pr_err("while inserting");
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
-               pr_err("%s", buf);
+               printbuf_reset(&buf);
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+               pr_err("  %s", buf.buf);
                pr_err("overlapping with");
-
-               if (!i->cached) {
-                       struct bkey u;
-                       struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
-
-                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
-                       pr_err("%s", buf);
-               } else {
-                       struct bkey_cached *ck = (void *) i->path->l[0].b;
-
-                       if (ck->valid) {
-                               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
-                               pr_err("%s", buf);
-                       }
-               }
+               printbuf_reset(&buf);
+               bch2_bkey_val_to_text(&buf, c, old);
+               pr_err("  %s", buf.buf);
        }
+
        __WARN();
+       printbuf_exit(&buf);
 }
 
 int bch2_trans_fs_usage_apply(struct btree_trans *trans,
@@ -1419,53 +1359,44 @@ need_mark:
 
 /* trans_mark: */
 
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
-                             const struct bch_extent_ptr *ptr,
-                             struct bkey_alloc_unpacked *u)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
-                            POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
-                            BTREE_ITER_WITH_UPDATES|
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret) {
-               bch2_trans_iter_exit(trans, iter);
-               return ret;
-       }
-
-       *u = bch2_alloc_unpack(k);
-       return 0;
-}
-
 static int bch2_trans_mark_pointer(struct btree_trans *trans,
-                       struct bkey_s_c k, struct extent_ptr_decoded p,
-                       s64 sectors, enum bch_data_type data_type)
+                                  enum btree_id btree_id, unsigned level,
+                                  struct bkey_s_c k, struct extent_ptr_decoded p,
+                                  unsigned flags)
 {
+       bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
        struct btree_iter iter;
-       struct bkey_alloc_unpacked u;
+       struct bkey_i_alloc_v4 *a;
+       struct bpos bucket_pos;
+       struct bch_backpointer bp;
+       s64 sectors;
        int ret;
 
-       ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
-       if (ret)
-               return ret;
+       bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp);
+       sectors = bp.bucket_len;
+       if (!insert)
+               sectors = -sectors;
 
-       ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
-                            u.gen, &u.data_type,
-                            &u.dirty_sectors, &u.cached_sectors);
-       if (ret)
-               goto out;
+       a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
-       ret = bch2_alloc_write(trans, &iter, &u, 0);
+       ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
+                            a->v.gen, &a->v.data_type,
+                            &a->v.dirty_sectors, &a->v.cached_sectors);
        if (ret)
-               goto out;
-out:
+               goto err;
+
+       if (!p.ptr.cached) {
+               ret = insert
+                       ? bch2_bucket_backpointer_add(trans, a, bp, k)
+                       : bch2_bucket_backpointer_del(trans, a, bp, k);
+               if (ret)
+                       goto err;
+       }
+
+       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -1474,7 +1405,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                        struct extent_ptr_decoded p,
                        s64 sectors, enum bch_data_type data_type)
 {
-       struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_stripe *s;
@@ -1490,16 +1420,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                goto err;
 
        if (k.k->type != KEY_TYPE_stripe) {
-               bch2_fs_inconsistent(c,
+               bch2_trans_inconsistent(trans,
                        "pointer to nonexistent stripe %llu",
                        (u64) p.ec.idx);
-               bch2_inconsistent_error(c);
                ret = -EIO;
                goto err;
        }
 
        if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
-               bch2_fs_inconsistent(c,
+               bch2_trans_inconsistent(trans,
                        "stripe pointer doesn't match stripe %llu",
                        (u64) p.ec.idx);
                ret = -EIO;
@@ -1528,10 +1457,15 @@ err:
        return ret;
 }
 
-static int bch2_trans_mark_extent(struct btree_trans *trans,
-                       struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_extent(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c old, struct bkey_i *new,
+                          unsigned flags)
 {
        struct bch_fs *c = trans->c;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+               ? old
+               : bkey_i_to_s_c(new);
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -1556,8 +1490,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
                if (flags & BTREE_TRIGGER_OVERWRITE)
                        disk_sectors = -disk_sectors;
 
-               ret = bch2_trans_mark_pointer(trans, k, p,
-                                       disk_sectors, data_type);
+               ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
                if (ret < 0)
                        return ret;
 
@@ -1593,7 +1526,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
        struct btree_iter iter;
-       struct bkey_alloc_unpacked u;
+       struct bkey_i_alloc_v4 *a;
        enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
                ? BCH_DATA_parity : 0;
        s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
@@ -1602,59 +1535,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
        if (deleting)
                sectors = -sectors;
 
-       ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
-       if (ret)
-               return ret;
+       a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
        ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
-                              u.gen, u.data_type,
-                              u.dirty_sectors, u.cached_sectors);
+                              a->v.gen, a->v.data_type,
+                              a->v.dirty_sectors, a->v.cached_sectors);
        if (ret)
                goto err;
 
        if (!deleting) {
-               if (bch2_fs_inconsistent_on(u.stripe ||
-                                           u.stripe_redundancy, c,
+               if (bch2_trans_inconsistent_on(a->v.stripe ||
+                                              a->v.stripe_redundancy, trans,
                                "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
-                               iter.pos.inode, iter.pos.offset, u.gen,
-                               bch2_data_types[u.data_type],
-                               u.dirty_sectors,
-                               u.stripe, s.k->p.offset)) {
+                               iter.pos.inode, iter.pos.offset, a->v.gen,
+                               bch2_data_types[a->v.data_type],
+                               a->v.dirty_sectors,
+                               a->v.stripe, s.k->p.offset)) {
                        ret = -EIO;
                        goto err;
                }
 
-               if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+               if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
                                "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
-                               iter.pos.inode, iter.pos.offset, u.gen,
-                               bch2_data_types[u.data_type],
-                               u.dirty_sectors,
+                               iter.pos.inode, iter.pos.offset, a->v.gen,
+                               bch2_data_types[a->v.data_type],
+                               a->v.dirty_sectors,
                                s.k->p.offset)) {
                        ret = -EIO;
                        goto err;
                }
 
-               u.stripe                = s.k->p.offset;
-               u.stripe_redundancy     = s.v->nr_redundant;
+               a->v.stripe             = s.k->p.offset;
+               a->v.stripe_redundancy  = s.v->nr_redundant;
        } else {
-               if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
-                                           u.stripe_redundancy != s.v->nr_redundant, c,
+               if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+                                              a->v.stripe_redundancy != s.v->nr_redundant, trans,
                                "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
-                               iter.pos.inode, iter.pos.offset, u.gen,
-                               s.k->p.offset, u.stripe)) {
+                               iter.pos.inode, iter.pos.offset, a->v.gen,
+                               s.k->p.offset, a->v.stripe)) {
                        ret = -EIO;
                        goto err;
                }
 
-               u.stripe                = 0;
-               u.stripe_redundancy     = 0;
+               a->v.stripe             = 0;
+               a->v.stripe_redundancy  = 0;
        }
 
-       u.dirty_sectors += sectors;
+       a->v.dirty_sectors += sectors;
        if (data_type)
-               u.data_type = !deleting ? data_type : 0;
+               a->v.data_type = !deleting ? data_type : 0;
 
-       ret = bch2_alloc_write(trans, &iter, &u, 0);
+       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
        if (ret)
                goto err;
 err:
@@ -1662,66 +1595,69 @@ err:
        return ret;
 }
 
-static int bch2_trans_mark_stripe(struct btree_trans *trans,
-                                 struct bkey_s_c old, struct bkey_s_c new,
-                                 unsigned flags)
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c old, struct bkey_i *new,
+                          unsigned flags)
 {
-       struct bkey_s_c_stripe old_s = { .k = NULL };
-       struct bkey_s_c_stripe new_s = { .k = NULL };
+       const struct bch_stripe *old_s = NULL;
+       struct bch_stripe *new_s = NULL;
        struct bch_replicas_padded r;
        unsigned i, nr_blocks;
        int ret = 0;
 
        if (old.k->type == KEY_TYPE_stripe)
-               old_s = bkey_s_c_to_stripe(old);
-       if (new.k->type == KEY_TYPE_stripe)
-               new_s = bkey_s_c_to_stripe(new);
+               old_s = bkey_s_c_to_stripe(old).v;
+       if (new->k.type == KEY_TYPE_stripe)
+               new_s = &bkey_i_to_stripe(new)->v;
 
        /*
         * If the pointers aren't changing, we don't need to do anything:
         */
-       if (new_s.k && old_s.k &&
-           new_s.v->nr_blocks          == old_s.v->nr_blocks &&
-           new_s.v->nr_redundant       == old_s.v->nr_redundant &&
-           !memcmp(old_s.v->ptrs, new_s.v->ptrs,
-                   new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+       if (new_s && old_s &&
+           new_s->nr_blocks    == old_s->nr_blocks &&
+           new_s->nr_redundant == old_s->nr_redundant &&
+           !memcmp(old_s->ptrs, new_s->ptrs,
+                   new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
                return 0;
 
-       BUG_ON(new_s.k && old_s.k &&
-              (new_s.v->nr_blocks      != old_s.v->nr_blocks ||
-               new_s.v->nr_redundant   != old_s.v->nr_redundant));
+       BUG_ON(new_s && old_s &&
+              (new_s->nr_blocks        != old_s->nr_blocks ||
+               new_s->nr_redundant     != old_s->nr_redundant));
 
-       nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+       nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
 
-       if (new_s.k) {
-               s64 sectors = le16_to_cpu(new_s.v->sectors);
+       if (new_s) {
+               s64 sectors = le16_to_cpu(new_s->sectors);
 
-               bch2_bkey_to_replicas(&r.e, new);
-               update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
+               bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+               update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
        }
 
-       if (old_s.k) {
-               s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+       if (old_s) {
+               s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
 
                bch2_bkey_to_replicas(&r.e, old);
-               update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+               update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
        }
 
        for (i = 0; i < nr_blocks; i++) {
-               if (new_s.k && old_s.k &&
-                   !memcmp(&new_s.v->ptrs[i],
-                           &old_s.v->ptrs[i],
-                           sizeof(new_s.v->ptrs[i])))
+               if (new_s && old_s &&
+                   !memcmp(&new_s->ptrs[i],
+                           &old_s->ptrs[i],
+                           sizeof(new_s->ptrs[i])))
                        continue;
 
-               if (new_s.k) {
-                       ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
+               if (new_s) {
+                       ret = bch2_trans_mark_stripe_bucket(trans,
+                                       bkey_i_to_s_c_stripe(new), i, false);
                        if (ret)
                                break;
                }
 
-               if (old_s.k) {
-                       ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+               if (old_s) {
+                       ret = bch2_trans_mark_stripe_bucket(trans,
+                                       bkey_s_c_to_stripe(old), i, true);
                        if (ret)
                                break;
                }
@@ -1730,12 +1666,13 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_trans_mark_inode(struct btree_trans *trans,
-                                struct bkey_s_c old,
-                                struct bkey_s_c new,
-                                unsigned flags)
+int bch2_trans_mark_inode(struct btree_trans *trans,
+                         enum btree_id btree_id, unsigned level,
+                         struct bkey_s_c old,
+                         struct bkey_i *new,
+                         unsigned flags)
 {
-       int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+       int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
 
        if (nr) {
                struct replicas_delta_list *d =
@@ -1746,9 +1683,15 @@ static int bch2_trans_mark_inode(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_trans_mark_reservation(struct btree_trans *trans,
-                                      struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+                               enum btree_id btree_id, unsigned level,
+                               struct bkey_s_c old,
+                               struct bkey_i *new,
+                               unsigned flags)
 {
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+               ? old
+               : bkey_i_to_s_c(new);
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
        struct replicas_delta_list *d;
@@ -1776,7 +1719,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
        struct bkey_i *n;
        __le64 *refcount;
        int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
@@ -1796,19 +1739,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
 
        refcount = bkey_refcount(n);
        if (!refcount) {
-               bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
-               bch2_fs_inconsistent(c,
+               bch2_bkey_val_to_text(&buf, c, p.s_c);
+               bch2_trans_inconsistent(trans,
                        "nonexistent indirect extent at %llu while marking\n  %s",
-                       *idx, buf);
+                       *idx, buf.buf);
                ret = -EIO;
                goto err;
        }
 
        if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
-               bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
-               bch2_fs_inconsistent(c,
+               bch2_bkey_val_to_text(&buf, c, p.s_c);
+               bch2_trans_inconsistent(trans,
                        "indirect extent refcount underflow at %llu while marking\n  %s",
-                       *idx, buf);
+                       *idx, buf.buf);
                ret = -EIO;
                goto err;
        }
@@ -1830,11 +1773,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
 
        le64_add_cpu(refcount, add);
 
-       if (!*refcount) {
-               n->k.type = KEY_TYPE_deleted;
-               set_bkey_val_u64s(&n->k, 0);
-       }
-
        bch2_btree_iter_set_pos_to_extent_start(&iter);
        ret = bch2_trans_update(trans, &iter, n, 0);
        if (ret)
@@ -1843,12 +1781,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
        *idx = k.k->p.offset;
 err:
        bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
        return ret;
 }
 
-static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
-                                    struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c old,
+                             struct bkey_i *new,
+                             unsigned flags)
 {
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+               ? old
+               : bkey_i_to_s_c(new);
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        u64 idx, end_idx;
        int ret = 0;
@@ -1869,31 +1814,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
        return ret;
 }
 
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
-                       struct bkey_s_c new, unsigned flags)
-{
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               return bch2_trans_mark_extent(trans, k, flags);
-       case KEY_TYPE_stripe:
-               return bch2_trans_mark_stripe(trans, old, new, flags);
-       case KEY_TYPE_inode:
-       case KEY_TYPE_inode_v2:
-               return bch2_trans_mark_inode(trans, old, new, flags);
-       case KEY_TYPE_reservation:
-               return bch2_trans_mark_reservation(trans, k, flags);
-       case KEY_TYPE_reflink_p:
-               return bch2_trans_mark_reflink_p(trans, k, flags);
-       default:
-               return 0;
-       }
-}
-
 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                                    struct bch_dev *ca, size_t b,
                                    enum bch_data_type type,
@@ -1901,11 +1821,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
-       struct bkey_alloc_unpacked u;
-       struct bch_extent_ptr ptr = {
-               .dev = ca->dev_idx,
-               .offset = bucket_to_sector(ca, b),
-       };
+       struct bkey_i_alloc_v4 *a;
        int ret = 0;
 
        /*
@@ -1914,26 +1830,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        if (b >= ca->mi.nbuckets)
                return 0;
 
-       ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
-       if (ret)
-               return ret;
+       a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
-       if (u.data_type && u.data_type != type) {
+       if (a->v.data_type && a->v.data_type != type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
-                       iter.pos.inode, iter.pos.offset, u.gen,
-                       bch2_data_types[u.data_type],
+                       iter.pos.inode, iter.pos.offset, a->v.gen,
+                       bch2_data_types[a->v.data_type],
                        bch2_data_types[type],
                        bch2_data_types[type]);
                ret = -EIO;
                goto out;
        }
 
-       u.data_type     = type;
-       u.dirty_sectors = sectors;
+       a->v.data_type          = type;
+       a->v.dirty_sectors      = sectors;
 
-       ret = bch2_alloc_write(trans, &iter, &u, 0);
+       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
        if (ret)
                goto out;
 out:
@@ -1946,7 +1862,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                                    enum bch_data_type type,
                                    unsigned sectors)
 {
-       return __bch2_trans_do(trans, NULL, NULL, 0,
+       return commit_do(trans, NULL, NULL, 0,
                        __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
 }
 
@@ -2024,8 +1940,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
 
 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
 {
-       return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-                       __bch2_trans_mark_dev_sb(&trans, ca));
+       return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
 }
 
 /* Disk reservations: */
@@ -2085,7 +2000,7 @@ recalculate:
                ret = 0;
        } else {
                atomic64_set(&c->sectors_available, sectors_available);
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_disk_reservation;
        }
 
        mutex_unlock(&c->sectors_available_lock);
@@ -2096,16 +2011,6 @@ recalculate:
 
 /* Startup/shutdown: */
 
-static void buckets_free_rcu(struct rcu_head *rcu)
-{
-       struct bucket_array *buckets =
-               container_of(rcu, struct bucket_array, rcu);
-
-       kvpfree(buckets,
-               sizeof(*buckets) +
-               buckets->nbuckets * sizeof(struct bucket));
-}
-
 static void bucket_gens_free_rcu(struct rcu_head *rcu)
 {
        struct bucket_gens *buckets =
@@ -2116,46 +2021,19 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
 
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
-       struct bucket_array *buckets = NULL, *old_buckets = NULL;
        struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
        unsigned long *buckets_nouse = NULL;
-       alloc_fifo      free[RESERVE_NR];
-       alloc_fifo      free_inc;
-       alloc_heap      alloc_heap;
-
-       size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
-                            ca->mi.bucket_size / btree_sectors(c));
-       /* XXX: these should be tunable */
-       size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
-       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 6);
-       size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
-                                     btree_reserve * 2);
-       bool resize = ca->buckets[0] != NULL;
+       bool resize = ca->bucket_gens != NULL;
        int ret = -ENOMEM;
-       unsigned i;
-
-       memset(&free,           0, sizeof(free));
-       memset(&free_inc,       0, sizeof(free_inc));
-       memset(&alloc_heap,     0, sizeof(alloc_heap));
 
-       if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
-                                           nbuckets * sizeof(struct bucket),
-                                           GFP_KERNEL|__GFP_ZERO)) ||
-           !(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+       if (!(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
                                            GFP_KERNEL|__GFP_ZERO)) ||
            (c->opts.buckets_nouse &&
             !(buckets_nouse    = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
-                                           GFP_KERNEL|__GFP_ZERO))) ||
-           !init_fifo(&free[RESERVE_MOVINGGC],
-                      copygc_reserve, GFP_KERNEL) ||
-           !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
-           !init_fifo(&free_inc,       free_inc_nr, GFP_KERNEL) ||
-           !init_heap(&alloc_heap,     ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+                                           GFP_KERNEL|__GFP_ZERO))))
                goto err;
 
-       buckets->first_bucket   = ca->mi.first_bucket;
-       buckets->nbuckets       = nbuckets;
        bucket_gens->first_bucket = ca->mi.first_bucket;
        bucket_gens->nbuckets   = nbuckets;
 
@@ -2167,15 +2045,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                percpu_down_write(&c->mark_lock);
        }
 
-       old_buckets = bucket_array(ca);
        old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
 
        if (resize) {
-               size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+               size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
 
-               memcpy(buckets->b,
-                      old_buckets->b,
-                      n * sizeof(struct bucket));
                memcpy(bucket_gens->b,
                       old_bucket_gens->b,
                       n);
@@ -2185,47 +2059,25 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                               BITS_TO_LONGS(n) * sizeof(unsigned long));
        }
 
-       rcu_assign_pointer(ca->buckets[0], buckets);
        rcu_assign_pointer(ca->bucket_gens, bucket_gens);
-       buckets         = old_buckets;
        bucket_gens     = old_bucket_gens;
 
        swap(ca->buckets_nouse, buckets_nouse);
 
+       nbuckets = ca->mi.nbuckets;
+
        if (resize) {
                percpu_up_write(&c->mark_lock);
+               up_write(&ca->bucket_lock);
                up_write(&c->gc_lock);
        }
 
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++) {
-               fifo_move(&free[i], &ca->free[i]);
-               swap(ca->free[i], free[i]);
-       }
-       fifo_move(&free_inc, &ca->free_inc);
-       swap(ca->free_inc, free_inc);
-       spin_unlock(&c->freelist_lock);
-
-       /* with gc lock held, alloc_heap can't be in use: */
-       swap(ca->alloc_heap, alloc_heap);
-
-       nbuckets = ca->mi.nbuckets;
-
-       if (resize)
-               up_write(&ca->bucket_lock);
-
        ret = 0;
 err:
-       free_heap(&alloc_heap);
-       free_fifo(&free_inc);
-       for (i = 0; i < RESERVE_NR; i++)
-               free_fifo(&free[i]);
        kvpfree(buckets_nouse,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
        if (bucket_gens)
                call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-       if (buckets)
-               call_rcu(&buckets->rcu, buckets_free_rcu);
 
        return ret;
 }
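
The resize path above now only carries the bucket_gens array (one generation byte per bucket) plus the optional buckets_nouse bitmap; the freelist FIFOs and the full in-memory bucket_array are gone. The pointer swap itself keeps the copy-then-publish pattern: allocate the new array, copy min(old, new) buckets while holding the write-side locks, rcu_assign_pointer() the new array in, and free the old one from an RCU callback (bucket_gens_free_rcu). Below is a simplified userspace sketch of that pattern; it uses C11 atomics in place of the RCU primitives, stubs out the grace-period wait, and omits the gc_lock/mark_lock/bucket_lock write-side locking the real function takes, so it is a model of the idea rather than the implementation.

	/*
	 * Simplified model of the resize pattern in bch2_dev_buckets_resize():
	 * build a new array, copy min(old, new) entries, publish the pointer,
	 * then reclaim the old array once readers are done.  C11 atomics stand
	 * in for rcu_assign_pointer()/rcu_dereference(); wait_for_readers() is
	 * a stub where the kernel would wait for an RCU grace period.
	 */
	#include <stdatomic.h>
	#include <stdlib.h>
	#include <string.h>

	struct gens {
		size_t		nbuckets;
		unsigned char	b[];	/* one generation byte per bucket */
	};

	static _Atomic(struct gens *) live_gens;

	static void wait_for_readers(void)
	{
		/* stand-in for call_rcu()/synchronize_rcu() in the real code */
	}

	static int resize_gens(size_t nbuckets)
	{
		struct gens *new = calloc(1, sizeof(*new) + nbuckets);
		struct gens *old;

		if (!new)
			return -1;
		new->nbuckets = nbuckets;

		old = atomic_load_explicit(&live_gens, memory_order_acquire);
		if (old)
			memcpy(new->b, old->b,
			       old->nbuckets < nbuckets ? old->nbuckets : nbuckets);

		/* publish: readers loading live_gens now see the new array */
		atomic_store_explicit(&live_gens, new, memory_order_release);

		wait_for_readers();
		free(old);
		return 0;
	}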
@@ -2234,17 +2086,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 {
        unsigned i;
 
-       free_heap(&ca->alloc_heap);
-       free_fifo(&ca->free_inc);
-       for (i = 0; i < RESERVE_NR; i++)
-               free_fifo(&ca->free[i]);
        kvpfree(ca->buckets_nouse,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
                sizeof(struct bucket_gens) + ca->mi.nbuckets);
-       kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
-               sizeof(struct bucket_array) +
-               ca->mi.nbuckets * sizeof(struct bucket));
 
        for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
                free_percpu(ca->usage[i]);
@@ -2265,5 +2110,5 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
                        return -ENOMEM;
        }
 
-       return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
+       return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
 }
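
The buckets.h hunks that follow drop bucket_cmpxchg() and the packed struct bucket_mark in favour of a per-bucket lock: bucket_lock() spins on xchg(&b->lock, 1) and bucket_unlock() releases with smp_store_release(&b->lock, 0), which is why the sector counts above could become plain u32 fields updated between lock and unlock. A self-contained C11 sketch of the same test-and-set idea follows; the *_model names and the field layout are illustrative only, assuming just the lock byte the helpers below touch.

	/*
	 * Model of the new per-bucket lock: spin on an atomic exchange until
	 * the previous value was 0, release by storing 0 with release
	 * semantics so field updates made under the lock are visible to the
	 * next acquirer (mirrors xchg()/cpu_relax()/smp_store_release()).
	 */
	#include <stdatomic.h>
	#include <stdint.h>

	struct bucket_model {
		atomic_uchar	lock;
		uint8_t		gen;
		uint32_t	dirty_sectors;
		uint32_t	cached_sectors;
	};

	static inline void bucket_model_lock(struct bucket_model *b)
	{
		while (atomic_exchange_explicit(&b->lock, 1, memory_order_acquire))
			;	/* spin; the kernel version calls cpu_relax() here */
	}

	static inline void bucket_model_unlock(struct bucket_model *b)
	{
		atomic_store_explicit(&b->lock, 0, memory_order_release);
	}

	/* typical use, as in bch2_mark_metadata_bucket() above */
	static void mark_dirty(struct bucket_model *b, uint32_t sectors)
	{
		bucket_model_lock(b);
		b->dirty_sectors += sectors;
		bucket_model_unlock(b);
	}

Acquire ordering on the exchange and release ordering on the store are what make the plain field updates between lock and unlock visible to the next holder, matching the full barrier implied by xchg() and the semantics of smp_store_release() in the kernel code.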
index 7c6c59c7762c55cb4626af78aff84d55a9c5ded1..56c06ccde14f8a39a19ded9475d6861624f35d63 100644 (file)
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -9,58 +9,39 @@
 #define _BUCKETS_H
 
 #include "buckets_types.h"
+#include "extents.h"
 #include "super.h"
 
 #define for_each_bucket(_b, _buckets)                          \
        for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
             _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
 
-#define bucket_cmpxchg(g, new, expr)                           \
-({                                                             \
-       struct bucket *_g = g;                                  \
-       u64 _v = atomic64_read(&(g)->_mark.v);                  \
-       struct bucket_mark _old;                                \
-                                                               \
-       do {                                                    \
-               (new).v.counter = _old.v.counter = _v;          \
-               expr;                                           \
-       } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v,         \
-                              _old.v.counter,                  \
-                              (new).v.counter)) != _old.v.counter);\
-       _old;                                                   \
-})
-
-static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
-                                                 bool gc)
+static inline void bucket_unlock(struct bucket *b)
 {
-       return rcu_dereference_check(ca->buckets[gc],
-                                    !ca->fs ||
-                                    percpu_rwsem_is_held(&ca->fs->mark_lock) ||
-                                    lockdep_is_held(&ca->fs->gc_lock) ||
-                                    lockdep_is_held(&ca->bucket_lock));
+       smp_store_release(&b->lock, 0);
 }
 
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline void bucket_lock(struct bucket *b)
 {
-       return __bucket_array(ca, false);
+       while (xchg(&b->lock, 1))
+               cpu_relax();
 }
 
-static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
 {
-       struct bucket_array *buckets = __bucket_array(ca, gc);
-
-       BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
-       return buckets->b + b;
+       return rcu_dereference_check(ca->buckets_gc,
+                                    !ca->fs ||
+                                    percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+                                    lockdep_is_held(&ca->fs->gc_lock) ||
+                                    lockdep_is_held(&ca->bucket_lock));
 }
 
 static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
 {
-       return __bucket(ca, b, true);
-}
+       struct bucket_array *buckets = gc_bucket_array(ca);
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
-       return __bucket(ca, b, false);
+       BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+       return buckets->b + b;
 }
 
 static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
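
The bucket_cmpxchg() loop removed above is replaced by a one-byte spin lock: bucket_lock() spins on xchg() until it observes 0, and bucket_unlock() publishes the protected updates with smp_store_release(). A minimal user-space sketch of the same acquire/release pattern, using compiler builtins in place of the kernel helpers (all names below are illustrative, not from the tree):

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel's xchg() and smp_store_release(). */
static inline uint8_t lock_xchg(uint8_t *p, uint8_t v)
{
        return __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE);
}

static inline void store_release(uint8_t *p, uint8_t v)
{
        __atomic_store_n(p, v, __ATOMIC_RELEASE);
}

struct toy_bucket {
        uint8_t  lock;
        uint32_t dirty_sectors;
};

static void toy_bucket_lock(struct toy_bucket *b)
{
        while (lock_xchg(&b->lock, 1))
                ;       /* cpu_relax() in the kernel */
}

static void toy_bucket_unlock(struct toy_bucket *b)
{
        store_release(&b->lock, 0);     /* publish updates made under the lock */
}

int main(void)
{
        struct toy_bucket b = { 0 };

        toy_bucket_lock(&b);
        b.dirty_sectors += 8;           /* fields protected by the lock */
        toy_bucket_unlock(&b);

        printf("dirty_sectors = %u\n", b.dirty_sectors);
        return 0;
}
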
@@ -70,7 +51,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
                                     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
                                     lockdep_is_held(&ca->fs->gc_lock) ||
                                     lockdep_is_held(&ca->bucket_lock));
-
 }
 
 static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
@@ -81,20 +61,27 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
        return gens->b + b;
 }
 
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+                                  const struct bch_extent_ptr *ptr)
 {
-       return g->mark.gen - g->oldest_gen;
+       return sector_to_bucket(ca, ptr->offset);
 }
 
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
                                   const struct bch_extent_ptr *ptr)
 {
-       return sector_to_bucket(ca, ptr->offset);
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+                                               const struct bch_extent_ptr *ptr,
+                                               u32 *bucket_offset)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
 }
 
 static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
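
PTR_BUCKET_NR() and the new PTR_BUCKET_POS_OFFSET() translate a pointer's sector offset into a bucket number, and optionally an offset within that bucket, by dividing by the device's bucket size. A toy calculation with an invented bucket size (the real value comes from ca->mi.bucket_size):

#include <stdint.h>
#include <stdio.h>

/* Toy version of sector_to_bucket()/sector_to_bucket_and_offset():
 * the device is an array of fixed-size buckets, so a sector offset
 * maps to a bucket index plus an offset inside the bucket. */
int main(void)
{
        uint64_t bucket_size = 128;     /* sectors per bucket (made up) */
        uint64_t offset = 1000;         /* ptr->offset, in sectors */

        uint64_t bucket        = offset / bucket_size;
        uint32_t bucket_offset = (uint32_t)(offset % bucket_size);

        /* 1000 / 128 = bucket 7, 1000 - 7*128 = offset 104 */
        printf("bucket %llu, offset %u\n",
               (unsigned long long) bucket, bucket_offset);
        return 0;
}
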
@@ -106,13 +93,22 @@ static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
 static inline enum bch_data_type ptr_data_type(const struct bkey *k,
                                               const struct bch_extent_ptr *ptr)
 {
-       if (k->type == KEY_TYPE_btree_ptr ||
-           k->type == KEY_TYPE_btree_ptr_v2)
+       if (bkey_is_btree_ptr(k))
                return BCH_DATA_btree;
 
        return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
 }
 
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+       EBUG_ON(sectors < 0);
+
+       return crc_is_compressed(p.crc)
+               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+                                  p.crc.uncompressed_size)
+               : sectors;
+}
+
 static inline int gen_cmp(u8 a, u8 b)
 {
        return (s8) (a - b);
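
ptr_disk_sectors() above accounts compressed extents by their on-disk footprint: sectors * compressed_size / uncompressed_size, rounded up; uncompressed extents pass through unchanged. A worked example with made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* Same rounding as the kernel's DIV_ROUND_UP_ULL(). */
static uint64_t div_round_up(uint64_t n, uint64_t d)
{
        return (n + d - 1) / d;
}

int main(void)
{
        uint64_t sectors = 120;             /* logical sectors referenced */
        uint64_t compressed_size = 33;      /* crc.compressed_size (made up) */
        uint64_t uncompressed_size = 128;   /* crc.uncompressed_size (made up) */

        uint64_t disk = div_round_up(sectors * compressed_size, uncompressed_size);

        /* 120 * 33 / 128 = 30.9..., rounded up to 31 disk sectors */
        printf("disk sectors = %llu\n", (unsigned long long) disk);
        return 0;
}
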
@@ -141,62 +137,73 @@ static inline u8 ptr_stale(struct bch_dev *ca,
        return ret;
 }
 
-/* bucket gc marks */
+/* Device usage: */
 
-static inline bool is_available_bucket(struct bucket_mark mark)
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 {
-       return !mark.dirty_sectors && !mark.stripe;
-}
+       struct bch_dev_usage ret;
 
-/* Device usage: */
+       bch2_dev_usage_read_fast(ca, &ret);
+       return ret;
+}
 
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
+void bch2_dev_usage_init(struct bch_dev *);
 
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
-                                         struct bch_dev_usage stats)
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve)
 {
-       u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
-
-       if (WARN_ONCE(stats.buckets_unavailable > total,
-                     "buckets_unavailable overflow (%llu > %llu)\n",
-                     stats.buckets_unavailable, total))
-               return 0;
-
-       return total - stats.buckets_unavailable;
+       s64 reserved = 0;
+
+       switch (reserve) {
+       case RESERVE_none:
+               reserved += ca->mi.nbuckets >> 6;
+               fallthrough;
+       case RESERVE_movinggc:
+               reserved += ca->nr_btree_reserve;
+               fallthrough;
+       case RESERVE_btree:
+               reserved += ca->nr_btree_reserve;
+               fallthrough;
+       case RESERVE_btree_movinggc:
+               break;
+       }
+
+       return reserved;
 }
 
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+                                  struct bch_dev_usage usage,
+                                  enum alloc_reserve reserve)
 {
-       return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
+       return max_t(s64, 0,
+                    usage.d[BCH_DATA_free].buckets -
+                    ca->nr_open_buckets -
+                    bch2_dev_buckets_reserved(ca, reserve));
 }
 
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
-                                           struct bch_dev_usage stats)
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+                                         struct bch_dev_usage usage,
+                                         enum alloc_reserve reserve)
 {
-       struct bch_fs *c = ca->fs;
-       s64 available = __dev_buckets_available(ca, stats);
-       unsigned i;
-
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++)
-               available -= fifo_used(&ca->free[i]);
-       available -= fifo_used(&ca->free_inc);
-       available -= ca->nr_open_buckets;
-       spin_unlock(&c->freelist_lock);
-
-       return max(available, 0LL);
+       return max_t(s64, 0,
+                      usage.d[BCH_DATA_free].buckets
+                    + usage.d[BCH_DATA_cached].buckets
+                    + usage.d[BCH_DATA_need_gc_gens].buckets
+                    + usage.d[BCH_DATA_need_discard].buckets
+                    - ca->nr_open_buckets
+                    - bch2_dev_buckets_reserved(ca, reserve));
 }
 
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+                                       enum alloc_reserve reserve)
 {
-       return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+       return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
 }
 
 /* Filesystem usage: */
 
 static inline unsigned fs_usage_u64s(struct bch_fs *c)
 {
-
        return sizeof(struct bch_fs_usage) / sizeof(u64) +
                READ_ONCE(c->replicas.nr);
 }
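
With the freelist fifos removed, available-bucket accounting now starts from the per-data-type bucket counts and subtracts open buckets plus a reserve that depends on the allocation's reserve level; the fall-through switch in bch2_dev_buckets_reserved() means each less strict level also keeps back everything the stricter levels below it do. A toy calculation under assumed device numbers:

#include <stdint.h>
#include <stdio.h>

enum toy_reserve { R_none, R_movinggc, R_btree, R_btree_movinggc };

/* Mirrors the fall-through cascade in bch2_dev_buckets_reserved();
 * the device numbers in main() are invented. */
static int64_t toy_reserved(uint64_t nbuckets, uint64_t nr_btree_reserve,
                            enum toy_reserve r)
{
        int64_t reserved = 0;

        switch (r) {
        case R_none:
                reserved += nbuckets >> 6;      /* fallthrough */
        case R_movinggc:
                reserved += nr_btree_reserve;   /* fallthrough */
        case R_btree:
                reserved += nr_btree_reserve;   /* fallthrough */
        case R_btree_movinggc:
                break;
        }
        return reserved;
}

int main(void)
{
        uint64_t nbuckets = 1 << 20, nr_btree_reserve = 512;
        int64_t free_buckets = 40000, nr_open = 100;

        for (int r = R_none; r <= R_btree_movinggc; r++) {
                int64_t res   = toy_reserved(nbuckets, nr_btree_reserve, r);
                int64_t avail = free_buckets - nr_open - res;

                printf("reserve %d: reserved %lld, available %lld\n",
                       r, (long long) res,
                       (long long) (avail > 0 ? avail : 0));
        }
        return 0;
}
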
@@ -224,18 +231,23 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 void bch2_fs_usage_initialize(struct bch_fs *);
 
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
-void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
-                              size_t, enum bch_data_type, unsigned,
-                              struct gc_pos, unsigned);
+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+                             size_t, enum bch_data_type, unsigned,
+                             struct gc_pos, unsigned);
 
-int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
 
-int bch2_mark_update(struct btree_trans *, struct btree_path *,
-                    struct bkey_i *, unsigned);
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
 
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
-                       struct bkey_s_c, unsigned);
 int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
index 2c73dc60b838f08f42da26ded8c6b8a0358a4b20..1dbba7d906dd883a12db432766a02fb527e56207 100644 (file)
@@ -7,32 +7,15 @@
 
 #define BUCKET_JOURNAL_SEQ_BITS                16
 
-struct bucket_mark {
-       union {
-       atomic64_t      v;
-
-       struct {
-       u8              gen;
-       u8              data_type:3,
-                       owned_by_allocator:1,
-                       stripe:1;
-       u16             dirty_sectors;
-       u16             cached_sectors;
-       };
-       };
-};
-
 struct bucket {
-       union {
-               struct bucket_mark      _mark;
-               const struct bucket_mark mark;
-       };
-
-       u64                             io_time[2];
-       u8                              oldest_gen;
-       unsigned                        gen_valid:1;
-       u8                              stripe_redundancy;
-       u32                             stripe;
+       u8                      lock;
+       u8                      gen_valid:1;
+       u8                      data_type:7;
+       u8                      gen;
+       u8                      stripe_redundancy;
+       u32                     stripe;
+       u32                     dirty_sectors;
+       u32                     cached_sectors;
 };
 
 struct bucket_array {
@@ -51,7 +34,6 @@ struct bucket_gens {
 
 struct bch_dev_usage {
        u64                     buckets_ec;
-       u64                     buckets_unavailable;
 
        struct {
                u64             buckets;
@@ -111,9 +93,9 @@ struct copygc_heap_entry {
        u8                      dev;
        u8                      gen;
        u8                      replicas;
-       u16                     fragmentation;
+       u32                     fragmentation;
        u32                     sectors;
-       u64                     offset;
+       u64                     bucket;
 };
 
 typedef HEAP(struct copygc_heap_entry) copygc_heap;
index aa26588ed5edf0e193c10cc7e3d6a0d25f49a8e9..dbb7e5e0b35b8147cd18529827b8fdb3489e6b73 100644 (file)
@@ -501,13 +501,12 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
        arg.state               = ca->mi.state;
        arg.bucket_size         = ca->mi.bucket_size;
        arg.nr_buckets          = ca->mi.nbuckets - ca->mi.first_bucket;
-       arg.available_buckets   = arg.nr_buckets - src.buckets_unavailable;
-       arg.ec_buckets          = src.buckets_ec;
-       arg.ec_sectors          = 0;
+       arg.buckets_ec          = src.buckets_ec;
 
        for (i = 0; i < BCH_DATA_NR; i++) {
-               arg.buckets[i] = src.d[i].buckets;
-               arg.sectors[i] = src.d[i].sectors;
+               arg.d[i].buckets        = src.d[i].buckets;
+               arg.d[i].sectors        = src.d[i].sectors;
+               arg.d[i].fragmented     = src.d[i].fragmented;
        }
 
        percpu_ref_put(&ca->ref);
index a1d89923d361a976aeed0f7de3b7ebb0658df495..3268e8d48603372f6f6178e346393edb068ede7c 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "checksum.h"
+#include "errcode.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -93,9 +94,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
        }
 }
 
-static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
-                                struct nonce nonce,
-                                struct scatterlist *sg, size_t len)
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+                               struct nonce nonce,
+                               struct scatterlist *sg, size_t len)
 {
        SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
        int ret;
@@ -104,17 +105,51 @@ static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
        skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
 
        ret = crypto_skcipher_encrypt(req);
-       BUG_ON(ret);
+       if (ret)
+               pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+       return ret;
 }
 
-static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
                              struct nonce nonce,
                              void *buf, size_t len)
 {
-       struct scatterlist sg;
+       if (!is_vmalloc_addr(buf)) {
+               struct scatterlist sg;
+
+               sg_init_table(&sg, 1);
+               sg_set_page(&sg,
+                           is_vmalloc_addr(buf)
+                           ? vmalloc_to_page(buf)
+                           : virt_to_page(buf),
+                           len, offset_in_page(buf));
+               return do_encrypt_sg(tfm, nonce, &sg, len);
+       } else {
+               unsigned pages = buf_pages(buf, len);
+               struct scatterlist *sg;
+               size_t orig_len = len;
+               int ret, i;
+
+               sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
+               if (!sg)
+                       return -ENOMEM;
+
+               sg_init_table(sg, pages);
+
+               for (i = 0; i < pages; i++) {
+                       unsigned offset = offset_in_page(buf);
+                       unsigned pg_len = min(len, PAGE_SIZE - offset);
+
+                       sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
+                       buf += pg_len;
+                       len -= pg_len;
+               }
 
-       sg_init_one(&sg, buf, len);
-       do_encrypt_sg(tfm, nonce, &sg, len);
+               ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
+               kfree(sg);
+               return ret;
+       }
 }
 
 int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
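
do_encrypt() now distinguishes physically contiguous buffers from vmalloc ranges: the latter get one scatterlist entry per page, each covering at most the remainder of its page. A small sketch of the chunking arithmetic, with an invented buffer address and length:

#include <stdio.h>
#include <stddef.h>

#define TOY_PAGE_SIZE 4096u

int main(void)
{
        size_t addr = 0x10000800;       /* buffer starts 0x800 into a page (made up) */
        size_t len  = 10000;            /* bytes to map (made up) */
        unsigned nr = 0;

        while (len) {
                size_t offset = addr & (TOY_PAGE_SIZE - 1);
                size_t pg_len = len < TOY_PAGE_SIZE - offset
                        ? len : TOY_PAGE_SIZE - offset;

                printf("sg[%u]: page offset 0x%zx, len %zu\n", nr++, offset, pg_len);
                addr += pg_len;
                len  -= pg_len;
        }
        /* 10000 bytes starting at page offset 0x800 -> chunks of 2048 + 4096 + 3856 */
        return 0;
}
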
@@ -136,25 +171,29 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
                goto err;
        }
 
-       do_encrypt(chacha20, nonce, buf, len);
+       ret = do_encrypt(chacha20, nonce, buf, len);
 err:
        crypto_free_sync_skcipher(chacha20);
        return ret;
 }
 
-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
-                        struct nonce nonce)
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+                       struct nonce nonce)
 {
        u8 key[POLY1305_KEY_SIZE];
+       int ret;
 
        nonce.d[3] ^= BCH_NONCE_POLY;
 
        memset(key, 0, sizeof(key));
-       do_encrypt(c->chacha20, nonce, key, sizeof(key));
+       ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+       if (ret)
+               return ret;
 
        desc->tfm = c->poly1305;
        crypto_shash_init(desc);
        crypto_shash_update(desc, key, sizeof(key));
+       return 0;
 }
 
 struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
@@ -196,13 +235,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
        }
 }
 
-void bch2_encrypt(struct bch_fs *c, unsigned type,
+int bch2_encrypt(struct bch_fs *c, unsigned type,
                  struct nonce nonce, void *data, size_t len)
 {
        if (!bch2_csum_type_is_encryption(type))
-               return;
+               return 0;
 
-       do_encrypt(c->chacha20, nonce, data, len);
+       return do_encrypt(c->chacha20, nonce, data, len);
 }
 
 static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -277,23 +316,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
        return __bch2_checksum_bio(c, type, nonce, bio, &iter);
 }
 
-void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
-                     struct nonce nonce, struct bio *bio)
+int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+                    struct nonce nonce, struct bio *bio)
 {
        struct bio_vec bv;
        struct bvec_iter iter;
        struct scatterlist sgl[16], *sg = sgl;
        size_t bytes = 0;
+       int ret = 0;
 
        if (!bch2_csum_type_is_encryption(type))
-               return;
+               return 0;
 
        sg_init_table(sgl, ARRAY_SIZE(sgl));
 
        bio_for_each_segment(bv, bio, iter) {
                if (sg == sgl + ARRAY_SIZE(sgl)) {
                        sg_mark_end(sg - 1);
-                       do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+                       ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+                       if (ret)
+                               return ret;
 
                        nonce = nonce_add(nonce, bytes);
                        bytes = 0;
@@ -307,7 +350,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
        }
 
        sg_mark_end(sg - 1);
-       do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+       return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
 }
 
 struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
@@ -383,8 +426,17 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
                merged = bch2_checksum_bio(c, crc_old.csum_type,
                                extent_nonce(version, crc_old), bio);
 
-       if (bch2_crc_cmp(merged, crc_old.csum))
+       if (bch2_crc_cmp(merged, crc_old.csum)) {
+               bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
+                       "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+                       crc_old.csum.hi,
+                       crc_old.csum.lo,
+                       merged.hi,
+                       merged.lo,
+                       bch2_csum_types[crc_old.csum_type],
+                       bch2_csum_types[new_csum_type]);
                return -EIO;
+       }
 
        for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
                if (i->crc)
@@ -413,7 +465,7 @@ static int __bch2_request_key(char *key_description, struct bch_key *key)
        const struct user_key_payload *ukp;
        int ret;
 
-       keyring_key = request_key(&key_type_logon, key_description, NULL);
+       keyring_key = request_key(&key_type_user, key_description, NULL);
        if (IS_ERR(keyring_key))
                return PTR_ERR(keyring_key);
 
@@ -451,13 +503,15 @@ static int __bch2_request_key(char *key_description, struct bch_key *key)
 
 int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
 {
-       char key_description[60];
-       char uuid[40];
+       struct printbuf key_description = PRINTBUF;
+       int ret;
 
-       uuid_unparse_lower(sb->user_uuid.b, uuid);
-       sprintf(key_description, "bcachefs:%s", uuid);
+       prt_printf(&key_description, "bcachefs:");
+       pr_uuid(&key_description, sb->user_uuid.b);
 
-       return __bch2_request_key(key_description, key);
+       ret = __bch2_request_key(key_description.buf, key);
+       printbuf_exit(&key_description);
+       return ret;
 }
 
 int bch2_decrypt_sb_key(struct bch_fs *c,
@@ -474,7 +528,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
 
        ret = bch2_request_key(c->disk_sb.sb, &user_key);
        if (ret) {
-               bch_err(c, "error requesting encryption key: %i", ret);
+               bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
                goto err;
        }
 
@@ -499,20 +553,24 @@ err:
 
 static int bch2_alloc_ciphers(struct bch_fs *c)
 {
+       int ret;
+
        if (!c->chacha20)
                c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
-       if (IS_ERR(c->chacha20)) {
-               bch_err(c, "error requesting chacha20 module: %li",
-                       PTR_ERR(c->chacha20));
-               return PTR_ERR(c->chacha20);
+       ret = PTR_ERR_OR_ZERO(c->chacha20);
+
+       if (ret) {
+               bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+               return ret;
        }
 
        if (!c->poly1305)
                c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
-       if (IS_ERR(c->poly1305)) {
-               bch_err(c, "error requesting poly1305 module: %li",
-                       PTR_ERR(c->poly1305));
-               return PTR_ERR(c->poly1305);
+       ret = PTR_ERR_OR_ZERO(c->poly1305);
+
+       if (ret) {
+               bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+               return ret;
        }
 
        return 0;
@@ -573,7 +631,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        if (keyed) {
                ret = bch2_request_key(c->disk_sb.sb, &user_key);
                if (ret) {
-                       bch_err(c, "error requesting encryption key: %i", ret);
+                       bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
                        goto err;
                }
 
@@ -625,9 +683,9 @@ int bch2_fs_encryption_init(struct bch_fs *c)
        pr_verbose_init(c->opts, "");
 
        c->sha256 = crypto_alloc_shash("sha256", 0, 0);
-       if (IS_ERR(c->sha256)) {
-               bch_err(c, "error requesting sha256 module");
-               ret = PTR_ERR(c->sha256);
+       ret = PTR_ERR_OR_ZERO(c->sha256);
+       if (ret) {
+               bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
                goto out;
        }
 
index f5c1a609c5c42fe408a5896498c1aeed748679ff..c86c3c05d62054a66faffa4c5c2cde81ab5de1e8 100644 (file)
@@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
 int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
 int bch2_request_key(struct bch_sb *, struct bch_key *);
 
-void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
                 void *data, size_t);
 
 struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
@@ -61,8 +61,8 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
                        struct bch_extent_crc_unpacked *,
                        unsigned, unsigned, unsigned);
 
-void bch2_encrypt_bio(struct bch_fs *, unsigned,
-                   struct nonce, struct bio *);
+int bch2_encrypt_bio(struct bch_fs *, unsigned,
+                    struct nonce, struct bio *);
 
 int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
                        struct bch_key *);
index 4324cfe7eed0de48ef2a26d97b024d3bb5d712b5..f3ffdbc38485baacb66cdd39097e4018440b0fd0 100644 (file)
@@ -161,7 +161,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
        now = atomic64_read(&clock->now);
 
        for (i = 0; i < clock->timers.used; i++)
-               pr_buf(out, "%ps:\t%li\n",
+               prt_printf(out, "%ps:\t%li\n",
                       clock->timers.data[i]->fn,
                       clock->timers.data[i]->expire - now);
        spin_unlock(&clock->timer_lock);
index 8e4179d8dc2764d883916cbca64d4a11deb98d4a..2b7080b67ecac518d297b77c953c2c6374f9bd23 100644 (file)
@@ -197,9 +197,9 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
                        goto err;
 
                workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
-               ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+               ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
 
-               ret = ZSTD_decompressDCtx(ctx,
+               ret = zstd_decompress_dctx(ctx,
                                dst_data,       dst_len,
                                src_data.b + 4, real_src_len);
 
@@ -333,8 +333,8 @@ static int attempt_compress(struct bch_fs *c,
                return strm.total_out;
        }
        case BCH_COMPRESSION_TYPE_zstd: {
-               ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
-                       ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+               ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
+                       zstd_cctx_workspace_bound(&c->zstd_params.cParams));
 
                /*
                 * ZSTD requires that when we decompress we pass in the exact
@@ -347,11 +347,11 @@ static int attempt_compress(struct bch_fs *c,
                 * factor (7 bytes) from the dst buffer size to account for
                 * that.
                 */
-               size_t len = ZSTD_compressCCtx(ctx,
+               size_t len = zstd_compress_cctx(ctx,
                                dst + 4,        dst_len - 4 - 7,
                                src,            src_len,
-                               c->zstd_params);
-               if (ZSTD_isError(len))
+                               &c->zstd_params);
+               if (zstd_is_error(len))
                        return 0;
 
                *((__le32 *) dst) = cpu_to_le32(len);
@@ -377,7 +377,7 @@ static unsigned __bio_compress(struct bch_fs *c,
 
        /* If it's only one block, don't bother trying to compress: */
        if (src->bi_iter.bi_size <= c->opts.block_size)
-               return 0;
+               return BCH_COMPRESSION_TYPE_incompressible;
 
        dst_data = bio_map_or_bounce(c, dst, WRITE);
        src_data = bio_map_or_bounce(c, src, READ);
@@ -546,7 +546,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
        size_t decompress_workspace_size = 0;
        bool decompress_workspace_needed;
-       ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
+       ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max);
        struct {
                unsigned        feature;
                unsigned        type;
@@ -558,8 +558,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                        zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
                        zlib_inflate_workspacesize(), },
                { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
-                       ZSTD_CCtxWorkspaceBound(params.cParams),
-                       ZSTD_DCtxWorkspaceBound() },
+                       zstd_cctx_workspace_bound(&params.cParams),
+                       zstd_dctx_workspace_bound() },
        }, *i;
        int ret = 0;
 
diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c
new file mode 100644 (file)
index 0000000..edd1b25
--- /dev/null
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       NULL
+};
+
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+       if (!ctrs)
+               return 0;
+
+       return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+};
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+                                    struct bch_sb_field *f,
+                                    struct printbuf *err)
+{
+       return 0;
+};
+
+void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+                             struct bch_sb_field *f)
+{
+       struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+       unsigned int i;
+       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+       for (i = 0; i < nr; i++) {
+               if (i < BCH_COUNTER_NR)
+                       prt_printf(out, "%s ", bch2_counter_names[i]);
+               else
+                       prt_printf(out, "(unknown)");
+
+               prt_tab(out);
+               prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+               prt_newline(out);
+       };
+};
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+       unsigned int i;
+       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+       u64 val = 0;
+
+       for (i = 0; i < BCH_COUNTER_NR; i++)
+               c->counters_on_mount[i] = 0;
+
+       for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+               val = le64_to_cpu(ctrs->d[i]);
+               percpu_u64_set(&c->counters[i], val);
+               c->counters_on_mount[i] = val;
+       }
+       return 0;
+};
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+       struct bch_sb_field_counters *ret;
+       unsigned int i;
+       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+       if (nr < BCH_COUNTER_NR) {
+               ret = bch2_sb_resize_counters(&c->disk_sb,
+                                              sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+               if (ret) {
+                       ctrs = ret;
+                       nr = bch2_sb_counter_nr_entries(ctrs);
+               }
+       }
+
+
+       for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+               ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+       return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+       free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+       c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+       if (!c->counters)
+               return -ENOMEM;
+
+       return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+       .validate       = bch2_sb_counters_validate,
+       .to_text        = bch2_sb_counters_to_text,
+};
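
bch2_counter_names[] is generated with the usual x-macro trick: BCH_PERSISTENT_COUNTERS() expands x(name, id, ...) once per counter, so a single list can produce both the counter enum and the string table. A self-contained sketch of the pattern; the counter list below is invented, the real one lives in the superblock format headers:

#include <stdio.h>

/* Invented stand-in for BCH_PERSISTENT_COUNTERS(). */
#define TOY_COUNTERS()                  \
        x(io_read,              0)      \
        x(io_write,             1)      \
        x(move_extent_finish,   2)

enum toy_counter {
#define x(t, n) TOY_COUNTER_##t = n,
        TOY_COUNTERS()
#undef x
        TOY_COUNTER_NR
};

static const char * const toy_counter_names[] = {
#define x(t, n) #t,
        TOY_COUNTERS()
#undef x
        NULL
};

int main(void)
{
        for (int i = 0; i < TOY_COUNTER_NR; i++)
                printf("%d: %s\n", i, toy_counter_names[i]);
        return 0;
}
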
diff --git a/libbcachefs/counters.h b/libbcachefs/counters.h
new file mode 100644 (file)
index 0000000..4778aa1
--- /dev/null
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif // _BCACHEFS_COUNTERS_H
diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h
new file mode 100644 (file)
index 0000000..519ab9b
--- /dev/null
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include "util.h"
+#include <linux/slab.h>
+
+#define DARRAY(type)                                                   \
+struct {                                                               \
+       size_t nr, size;                                                \
+       type *data;                                                     \
+}
+
+typedef DARRAY(void) darray_void;
+
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
+{
+       if (d->nr + more > d->size) {
+               size_t new_size = roundup_pow_of_two(d->nr + more);
+               void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL);
+
+               if (!data)
+                       return -ENOMEM;
+
+               d->data = data;
+               d->size = new_size;
+       }
+
+       return 0;
+}
+
+#define darray_make_room(_d, _more)                                    \
+       __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more))
+
+#define darray_top(_d)         ((_d).data[(_d).nr])
+
+#define darray_push(_d, _item)                                         \
+({                                                                     \
+       int _ret = darray_make_room((_d), 1);                           \
+                                                                       \
+       if (!_ret)                                                      \
+               (_d)->data[(_d)->nr++] = (_item);                       \
+       _ret;                                                           \
+})
+
+#define darray_insert_item(_d, _pos, _item)                            \
+({                                                                     \
+       size_t pos = (_pos);                                            \
+       int _ret = darray_make_room((_d), 1);                           \
+                                                                       \
+       if (!_ret)                                                      \
+               array_insert_item((_d)->data, (_d)->nr, pos, (_item));  \
+       _ret;                                                           \
+})
+
+#define darray_for_each(_d, _i)                                                \
+       for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_init(_d)                                                        \
+do {                                                                   \
+       (_d)->data = NULL;                                              \
+       (_d)->nr = (_d)->size = 0;                                      \
+} while (0)
+
+#define darray_exit(_d)                                                        \
+do {                                                                   \
+       kfree((_d)->data);                                              \
+       darray_init(_d);                                                \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
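
The darray macros give a typed, growable array: declare one with DARRAY(type), append with darray_push() (which returns -ENOMEM on allocation failure), iterate with darray_for_each(), and free with darray_exit(). A user-space mirror of the same idea, with realloc() standing in for krealloc_array(), just to show the intended usage pattern:

#include <stdio.h>
#include <stdlib.h>

#define DARRAY(type)                                                    \
struct {                                                                \
        size_t nr, size;                                                \
        type *data;                                                     \
}

#define darray_make_room(d, more) ({                                    \
        int _ret = 0;                                                   \
        if ((d)->nr + (more) > (d)->size) {                             \
                size_t _new = (d)->size ? (d)->size * 2 : 8;            \
                while (_new < (d)->nr + (more))                         \
                        _new *= 2;                                      \
                void *_p = realloc((d)->data, _new * sizeof((d)->data[0])); \
                if (_p) {                                               \
                        (d)->data = _p;                                 \
                        (d)->size = _new;                               \
                } else {                                                \
                        _ret = -1;                                      \
                }                                                       \
        }                                                               \
        _ret;                                                           \
})

#define darray_push(d, item) ({                                         \
        int _r = darray_make_room((d), 1);                              \
        if (!_r)                                                        \
                (d)->data[(d)->nr++] = (item);                          \
        _r;                                                             \
})

#define darray_for_each(d, i)                                           \
        for ((i) = (d).data; (i) < (d).data + (d).nr; (i)++)

typedef DARRAY(int) darray_int;

int main(void)
{
        darray_int d = { 0 };
        int *i;

        for (int v = 1; v <= 5; v++)
                if (darray_push(&d, v))
                        return 1;       /* allocation failure */

        darray_for_each(d, i)
                printf("%d ", *i);
        printf("\n");

        free(d.data);                   /* darray_exit() in the kernel version */
        return 0;
}
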
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
new file mode 100644 (file)
index 0000000..b75ff07
--- /dev/null
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "subvolume.h"
+
+#include <trace/events/bcachefs.h>
+
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+                                    enum btree_id id,
+                                    struct bpos old_pos,
+                                    struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter, update_iter;
+       struct bkey_s_c k;
+       snapshot_id_list s;
+       int ret;
+
+       if (!btree_type_has_snapshots(id))
+               return 0;
+
+       darray_init(&s);
+
+       if (!bkey_cmp(old_pos, new_pos))
+               return 0;
+
+       if (!snapshot_t(c, old_pos.snapshot)->children[0])
+               return 0;
+
+       bch2_trans_iter_init(trans, &iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               if (bkey_cmp(old_pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+                       struct bkey_i *update;
+
+                       if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot))
+                               continue;
+
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (ret)
+                               break;
+
+                       bkey_init(&update->k);
+                       update->k.p = new_pos;
+                       update->k.p.snapshot = k.k->p.snapshot;
+
+                       bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       bch2_trans_iter_exit(trans, &update_iter);
+                       if (ret)
+                               break;
+
+                       ret = snapshot_list_add(c, &s, k.k->p.snapshot);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+       darray_exit(&s);
+
+       return ret;
+}
+
+static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
+{
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+       struct bch_extent_ptr *ptr;
+
+       bkey_for_each_ptr(ptrs, ptr)
+               if (ptr->dev == dev)
+                       ptr->cached = true;
+}
+
+static int bch2_data_update_index_update(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct data_update *m =
+               container_of(op, struct data_update, op);
+       struct keylist *keys = &op->insert_keys;
+       struct bkey_buf _new, _insert;
+       int ret = 0;
+
+       bch2_bkey_buf_init(&_new);
+       bch2_bkey_buf_init(&_insert);
+       bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+       bch2_trans_iter_init(&trans, &iter, m->btree_id,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       while (1) {
+               struct bkey_s_c k;
+               struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+               struct bkey_i *insert;
+               struct bkey_i_extent *new;
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+               struct bpos next_pos;
+               bool did_work = false;
+               bool should_check_enospc;
+               s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+               unsigned i;
+
+               bch2_trans_begin(&trans);
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+               if (!bch2_extents_match(k, old))
+                       goto nomatch;
+
+               bkey_reassemble(_insert.k, k);
+               insert = _insert.k;
+
+               bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+               new = bkey_i_to_extent(_new.k);
+               bch2_cut_front(iter.pos, &new->k_i);
+
+               bch2_cut_front(iter.pos,        insert);
+               bch2_cut_back(new->k.p,         insert);
+               bch2_cut_back(insert->k.p,      &new->k_i);
+
+               /*
+                * @old: extent that we read from
+                * @insert: key that we're going to update, initialized from
+                * extent currently in btree - same as @old unless we raced with
+                * other updates
+                * @new: extent with new pointers that we'll be adding to @insert
+                *
+                * First, drop rewrite_ptrs from @new:
+                */
+               i = 0;
+               bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+                       if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+                           bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) {
+                               /*
+                                * If we're going to be adding a pointer to the
+                                * same device, we have to drop the old one -
+                                * otherwise, we can just mark it cached:
+                                */
+                               if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev))
+                                       bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev);
+                               else
+                                       bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev);
+                       }
+                       i++;
+               }
+
+
+               /* Add new ptrs: */
+               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
+                       if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
+                               /*
+                                * raced with another move op? extent already
+                                * has a pointer to the device we just wrote
+                                * data to
+                                */
+                               continue;
+                       }
+
+                       bch2_extent_ptr_decoded_append(insert, &p);
+                       did_work = true;
+               }
+
+               if (!did_work)
+                       goto nomatch;
+
+               bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+               bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+               ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
+                                                &should_check_enospc,
+                                                &i_sectors_delta,
+                                                &disk_sectors_delta);
+               if (ret)
+                       goto err;
+
+               if (disk_sectors_delta > (s64) op->res.sectors) {
+                       ret = bch2_disk_reservation_add(c, &op->res,
+                                               disk_sectors_delta - op->res.sectors,
+                                               !should_check_enospc
+                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
+                       if (ret)
+                               goto out;
+               }
+
+               next_pos = insert->k.p;
+
+               ret   = insert_snapshot_whiteouts(&trans, m->btree_id,
+                                                 k.k->p, insert->k.p) ?:
+                       bch2_trans_update(&trans, &iter, insert,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+                       bch2_trans_commit(&trans, &op->res,
+                               op_journal_seq(op),
+                               BTREE_INSERT_NOFAIL|
+                               m->data_opts.btree_insert_flags);
+               if (!ret) {
+                       bch2_btree_iter_set_pos(&iter, next_pos);
+
+                       this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+                       trace_move_extent_finish(&new->k);
+               }
+err:
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       ret = 0;
+               if (ret)
+                       break;
+next:
+               while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
+                       bch2_keylist_pop_front(keys);
+                       if (bch2_keylist_empty(keys))
+                               goto out;
+               }
+               continue;
+nomatch:
+               if (m->ctxt) {
+                       BUG_ON(k.k->p.offset <= iter.pos.offset);
+                       atomic64_inc(&m->ctxt->stats->keys_raced);
+                       atomic64_add(k.k->p.offset - iter.pos.offset,
+                                    &m->ctxt->stats->sectors_raced);
+               }
+
+               this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size);
+               trace_move_extent_race(&new->k);
+
+               bch2_btree_iter_advance(&iter);
+               goto next;
+       }
+out:
+       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&_insert, c);
+       bch2_bkey_buf_exit(&_new, c);
+       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+       return ret;
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+                               struct bch_extent_crc_unpacked crc,
+                               struct closure *cl)
+{
+       /* write bio must own pages: */
+       BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+       m->op.crc = crc;
+       m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+       closure_call(&m->op.cl, bch2_write, NULL, cl);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+       struct bch_fs *c = update->op.c;
+
+       bch2_bkey_buf_exit(&update->k, c);
+       bch2_disk_reservation_put(c, &update->op.res);
+       bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
+int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
+                         struct write_point_specifier wp,
+                         struct bch_io_opts io_opts,
+                         struct data_update_opts data_opts,
+                         enum btree_id btree_id,
+                         struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+       int ret;
+
+       bch2_bkey_buf_init(&m->k);
+       bch2_bkey_buf_reassemble(&m->k, c, k);
+       m->btree_id     = btree_id;
+       m->data_opts    = data_opts;
+
+       bch2_write_op_init(&m->op, c, io_opts);
+       m->op.pos       = bkey_start_pos(k.k);
+       m->op.version   = k.k->version;
+       m->op.target    = data_opts.target;
+       m->op.write_point = wp;
+       m->op.flags     |= BCH_WRITE_PAGES_STABLE|
+               BCH_WRITE_PAGES_OWNED|
+               BCH_WRITE_DATA_ENCODED|
+               BCH_WRITE_FROM_INTERNAL|
+               m->data_opts.write_flags;
+       m->op.compression_type =
+               bch2_compression_opt_to_type[io_opts.background_compression ?:
+                                            io_opts.compression];
+       if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+               m->op.alloc_reserve = RESERVE_movinggc;
+       m->op.index_update_fn   = bch2_data_update_index_update;
+
+       i = 0;
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+                   p.ptr.cached)
+                       BUG();
+
+               if (!((1U << i) & m->data_opts.rewrite_ptrs))
+                       bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+
+               if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+                   crc_is_compressed(p.crc))
+                       reserve_sectors += k.k->size;
+
+               /*
+                * op->csum_type is normally initialized from the fs/file's
+                * current options - but if an extent is encrypted, we require
+                * that it stays encrypted:
+                */
+               if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+                       m->op.nonce     = p.crc.nonce + p.crc.offset;
+                       m->op.csum_type = p.crc.csum_type;
+               }
+
+               if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+                       m->op.incompressible = true;
+
+               i++;
+       }
+
+       if (reserve_sectors) {
+               ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+                               m->data_opts.extra_replicas
+                               ? 0
+                               : BCH_DISK_RESERVATION_NOFAIL);
+               if (ret)
+                       return ret;
+       }
+
+       m->op.nr_replicas = m->op.nr_replicas_required =
+               hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas;
+
+       BUG_ON(!m->op.nr_replicas);
+       return 0;
+}
+
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+       unsigned i = 0;
+
+       bkey_for_each_ptr(ptrs, ptr) {
+               if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
+                       opts->kill_ptrs |= 1U << i;
+                       opts->rewrite_ptrs ^= 1U << i;
+               }
+
+               i++;
+       }
+}
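
In data_update_opts, rewrite_ptrs and kill_ptrs are bitmasks indexed by pointer position within the extent; bch2_data_update_opts_normalize() moves cached pointers from the rewrite set to the kill set, since a cached copy is simply dropped rather than rewritten. A toy model of that bit manipulation (the extent layout below is invented):

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct toy_ptr { bool cached; };

struct toy_opts {
        unsigned rewrite_ptrs;
        unsigned kill_ptrs;
};

static void toy_normalize(const struct toy_ptr *ptrs, unsigned nr,
                          struct toy_opts *opts)
{
        for (unsigned i = 0; i < nr; i++) {
                if ((opts->rewrite_ptrs & (1U << i)) && ptrs[i].cached) {
                        opts->kill_ptrs    |= 1U << i;  /* drop it instead... */
                        opts->rewrite_ptrs ^= 1U << i;  /* ...of rewriting it */
                }
        }
}

int main(void)
{
        struct toy_ptr ptrs[3] = { { false }, { true }, { false } };
        struct toy_opts opts = { .rewrite_ptrs = 0x7, .kill_ptrs = 0 };

        toy_normalize(ptrs, 3, &opts);

        /* pointer 1 is cached: rewrite becomes 0x5, kill becomes 0x2 */
        printf("rewrite_ptrs=0x%x kill_ptrs=0x%x\n",
               opts.rewrite_ptrs, opts.kill_ptrs);
        return 0;
}
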
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
new file mode 100644 (file)
index 0000000..6793aa5
--- /dev/null
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_types.h"
+
+struct moving_context;
+
+struct data_update_opts {
+       unsigned        rewrite_ptrs;
+       unsigned        kill_ptrs;
+       u16             target;
+       u8              extra_replicas;
+       unsigned        btree_insert_flags;
+       unsigned        write_flags;
+};
+
+struct data_update {
+       /* extent being updated: */
+       enum btree_id           btree_id;
+       struct bkey_buf         k;
+       struct data_update_opts data_opts;
+       struct moving_context   *ctxt;
+       struct bch_write_op     op;
+};
+
+void bch2_data_update_read_done(struct data_update *,
+                               struct bch_extent_crc_unpacked,
+                               struct closure *);
+
+void bch2_data_update_exit(struct data_update *);
+int bch2_data_update_init(struct bch_fs *, struct data_update *,
+                         struct write_point_specifier,
+                         struct bch_io_opts, struct data_update_opts,
+                         enum btree_id, struct bkey_s_c);
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
index ee5b7f6967965fa47f31136edb834af6d31e9264..57602c8e6c34a6ec64da4f2934ce420f2e25a4a0 100644 (file)
@@ -11,6 +11,7 @@
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_iter.h"
+#include "btree_locking.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "debug.h"
@@ -24,6 +25,7 @@
 #include <linux/console.h>
 #include <linux/debugfs.h>
 #include <linux/module.h>
+#include <linux/pretty-printers.h>
 #include <linux/random.h>
 #include <linux/seq_file.h>
 
@@ -43,11 +45,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
        if (!bch2_dev_get_ioref(ca, READ))
                return false;
 
-       bio = bio_alloc_bioset(GFP_NOIO,
-                       buf_pages(n_sorted, btree_bytes(c)),
-                       &c->btree_bio);
-       bio_set_dev(bio, ca->disk_sb.bdev);
-       bio->bi_opf             = REQ_OP_READ|REQ_META;
+       bio = bio_alloc_bioset(ca->disk_sb.bdev,
+                              buf_pages(n_sorted, btree_bytes(c)),
+                              REQ_OP_READ|REQ_META,
+                              GFP_NOIO,
+                              &c->btree_bio);
        bio->bi_iter.bi_sector  = pick.ptr.offset;
        bch2_bio_map(bio, n_sorted, btree_bytes(c));
 
@@ -169,10 +171,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
                failed |= bch2_btree_verify_replica(c, b, p);
 
        if (failed) {
-               char buf[200];
+               struct printbuf buf = PRINTBUF;
 
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
-               bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf);
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+               bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+               printbuf_exit(&buf);
        }
 out:
        mutex_unlock(&c->verify_lock);
@@ -184,23 +187,24 @@ out:
 /* XXX: bch_fs refcounting */
 
 struct dump_iter {
-       struct bpos             from;
-       struct bch_fs   *c;
+       struct bch_fs           *c;
        enum btree_id           id;
+       struct bpos             from;
+       struct bpos             prev_node;
+       u64                     iter;
 
-       char                    buf[1 << 12];
-       size_t                  bytes;  /* what's currently in buf */
+       struct printbuf         buf;
 
        char __user             *ubuf;  /* destination user buffer */
        size_t                  size;   /* size of requested read */
        ssize_t                 ret;    /* bytes read so far */
 };
 
-static int flush_buf(struct dump_iter *i)
+static ssize_t flush_buf(struct dump_iter *i)
 {
-       if (i->bytes) {
-               size_t bytes = min(i->bytes, i->size);
-               int err = copy_to_user(i->ubuf, i->buf, bytes);
+       if (i->buf.pos) {
+               size_t bytes = min_t(size_t, i->buf.pos, i->size);
+               int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
 
                if (err)
                        return err;
@@ -208,11 +212,11 @@ static int flush_buf(struct dump_iter *i)
                i->ret   += bytes;
                i->ubuf  += bytes;
                i->size  -= bytes;
-               i->bytes -= bytes;
-               memmove(i->buf, i->buf + bytes, i->bytes);
+               i->buf.pos -= bytes;
+               memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
        }
 
-       return 0;
+       return i->size ? 0 : i->ret;
 }
 
 static int bch2_dump_open(struct inode *inode, struct file *file)
@@ -226,15 +230,20 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
 
        file->private_data = i;
        i->from = POS_MIN;
+       i->iter = 0;
        i->c    = container_of(bd, struct bch_fs, btree_debug[bd->id]);
        i->id   = bd->id;
+       i->buf  = PRINTBUF;
 
        return 0;
 }
 
 static int bch2_dump_release(struct inode *inode, struct file *file)
 {
-       kfree(file->private_data);
+       struct dump_iter *i = file->private_data;
+
+       printbuf_exit(&i->buf);
+       kfree(i);
        return 0;
 }
 
@@ -245,48 +254,33 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       int err;
+       ssize_t ret;
 
        i->ubuf = buf;
        i->size = size;
        i->ret  = 0;
 
-       err = flush_buf(i);
-       if (err)
-               return err;
-
-       if (!i->size)
-               return i->ret;
-
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       bch2_trans_iter_init(&trans, &iter, i->id, i->from,
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(&iter);
-
-       while (k.k && !(err = bkey_err(k))) {
-               bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
-               i->bytes = strlen(i->buf);
-               BUG_ON(i->bytes >= sizeof(i->buf));
-               i->buf[i->bytes] = '\n';
-               i->bytes++;
-
-               k = bch2_btree_iter_next(&iter);
-               i->from = iter.pos;
-
-               err = flush_buf(i);
-               if (err)
+       ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+                                 BTREE_ITER_PREFETCH|
+                                 BTREE_ITER_ALL_SNAPSHOTS, k, ({
+               ret = flush_buf(i);
+               if (ret)
                        break;
 
-               if (!i->size)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+               bch2_bkey_val_to_text(&i->buf, i->c, k);
+               prt_newline(&i->buf);
+               0;
+       }));
+       i->from = iter.pos;
+
+       if (!ret)
+               ret = flush_buf(i);
 
        bch2_trans_exit(&trans);
 
-       return err < 0 ? err : i->ret;
+       return ret ?: i->ret;
 }
 
 static const struct file_operations btree_debug_ops = {
@@ -303,44 +297,39 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
        struct btree_trans trans;
        struct btree_iter iter;
        struct btree *b;
-       int err;
+       ssize_t ret;
 
        i->ubuf = buf;
        i->size = size;
        i->ret  = 0;
 
-       err = flush_buf(i);
-       if (err)
-               return err;
+       ret = flush_buf(i);
+       if (ret)
+               return ret;
 
-       if (!i->size || !bpos_cmp(SPOS_MAX, i->from))
+       if (!bpos_cmp(SPOS_MAX, i->from))
                return i->ret;
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
-               bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
-               i->bytes = strlen(i->buf);
-               err = flush_buf(i);
-               if (err)
+       for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+               ret = flush_buf(i);
+               if (ret)
                        break;
 
-               /*
-                * can't easily correctly restart a btree node traversal across
-                * all nodes, meh
-                */
+               bch2_btree_node_to_text(&i->buf, i->c, b);
                i->from = bpos_cmp(SPOS_MAX, b->key.k.p)
                        ? bpos_successor(b->key.k.p)
                        : b->key.k.p;
-
-               if (!i->size)
-                       break;
        }
        bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
-       return err < 0 ? err : i->ret;
+       if (!ret)
+               ret = flush_buf(i);
+
+       return ret ?: i->ret;
 }
 
 static const struct file_operations btree_format_debug_ops = {
@@ -357,75 +346,398 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct btree *prev_node = NULL;
-       int err;
+       ssize_t ret;
 
        i->ubuf = buf;
        i->size = size;
        i->ret  = 0;
 
-       err = flush_buf(i);
-       if (err)
-               return err;
-
-       if (!i->size)
-               return i->ret;
+       ret = flush_buf(i);
+       if (ret)
+               return ret;
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       bch2_trans_iter_init(&trans, &iter, i->id, i->from,
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       while ((k = bch2_btree_iter_peek(&iter)).k &&
-              !(err = bkey_err(k))) {
+       ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+                                 BTREE_ITER_PREFETCH|
+                                 BTREE_ITER_ALL_SNAPSHOTS, k, ({
                struct btree_path_level *l = &iter.path->l[0];
                struct bkey_packed *_k =
                        bch2_btree_node_iter_peek(&l->iter, l->b);
 
-               if (l->b != prev_node) {
-                       bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b);
-                       i->bytes = strlen(i->buf);
-                       err = flush_buf(i);
-                       if (err)
-                               break;
+               ret = flush_buf(i);
+               if (ret)
+                       break;
+
+               if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) {
+                       bch2_btree_node_to_text(&i->buf, i->c, l->b);
+                       i->prev_node = l->b->key.k.p;
                }
-               prev_node = l->b;
 
-               bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k);
-               i->bytes = strlen(i->buf);
+               bch2_bfloat_to_text(&i->buf, l->b, _k);
+               0;
+       }));
+       i->from = iter.pos;
+
+       bch2_trans_exit(&trans);
+
+       if (!ret)
+               ret = flush_buf(i);
+
+       return ret ?: i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_read_bfloat_failed,
+};
+
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+                                          struct btree *b)
+{
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 32);
+
+       prt_printf(out, "%px btree=%s l=%u ",
+              b,
+              bch2_btree_ids[b->c.btree_id],
+              b->c.level);
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+
+       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+       prt_newline(out);
+
+       prt_printf(out, "flags: ");
+       prt_tab(out);
+       prt_bitflags(out, bch2_btree_node_flags, b->flags);
+       prt_newline(out);
+
+       prt_printf(out, "pcpu read locks: ");
+       prt_tab(out);
+       prt_printf(out, "%u", b->c.lock.readers != NULL);
+       prt_newline(out);
+
+       prt_printf(out, "written:");
+       prt_tab(out);
+       prt_printf(out, "%u", b->written);
+       prt_newline(out);
+
+       prt_printf(out, "writes blocked:");
+       prt_tab(out);
+       prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
+       prt_newline(out);
+
+       prt_printf(out, "will make reachable:");
+       prt_tab(out);
+       prt_printf(out, "%lx", b->will_make_reachable);
+       prt_newline(out);
+
+       prt_printf(out, "journal pin %px:", &b->writes[0].journal);
+       prt_tab(out);
+       prt_printf(out, "%llu", b->writes[0].journal.seq);
+       prt_newline(out);
+
+       prt_printf(out, "journal pin %px:", &b->writes[1].journal);
+       prt_tab(out);
+       prt_printf(out, "%llu", b->writes[1].journal.seq);
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+                                           size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct bch_fs *c = i->c;
+       bool done = false;
+       ssize_t ret = 0;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       do {
+               struct bucket_table *tbl;
+               struct rhash_head *pos;
+               struct btree *b;
+
+               ret = flush_buf(i);
+               if (ret)
+                       return ret;
+
+               rcu_read_lock();
+               i->buf.atomic++;
+               tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+                                         &c->btree_cache.table);
+               if (i->iter < tbl->size) {
+                       rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+                               bch2_cached_btree_node_to_text(&i->buf, c, b);
+                       i->iter++;
+               } else {
+                       done = true;
+               }
+               --i->buf.atomic;
+               rcu_read_unlock();
+       } while (!done);
+
+       if (i->buf.allocation_failure)
+               ret = -ENOMEM;
+
+       if (!ret)
+               ret = flush_buf(i);
+
+       return ret ?: i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_cached_btree_nodes_read,
+};
+
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+                                           size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct bch_fs *c = i->c;
+       struct btree_trans *trans;
+       ssize_t ret = 0;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       mutex_lock(&c->btree_trans_lock);
+       list_for_each_entry(trans, &c->btree_trans_list, list) {
+               if (trans->locking_wait.task->pid <= i->iter)
+                       continue;
+
+               ret = flush_buf(i);
+               if (ret)
+                       return ret;
+
+               bch2_btree_trans_to_text(&i->buf, trans);
+
+               prt_printf(&i->buf, "backtrace:");
+               prt_newline(&i->buf);
+               printbuf_indent_add(&i->buf, 2);
+               bch2_prt_backtrace(&i->buf, trans->locking_wait.task);
+               printbuf_indent_sub(&i->buf, 2);
+               prt_newline(&i->buf);
+
+               i->iter = trans->locking_wait.task->pid;
+       }
+       mutex_unlock(&c->btree_trans_lock);
+
+       if (i->buf.allocation_failure)
+               ret = -ENOMEM;
+
+       if (!ret)
+               ret = flush_buf(i);
+
+       return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_btree_transactions_read,
+};
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+                                     size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct bch_fs *c = i->c;
+       bool done = false;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       do {
                err = flush_buf(i);
                if (err)
+                       return err;
+
+               if (!i->size)
                        break;
 
-               bch2_btree_iter_advance(&iter);
-               i->from = iter.pos;
+               done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+               i->iter++;
+       } while (!done);
+
+       if (i->buf.allocation_failure)
+               return -ENOMEM;
+
+       return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_journal_pins_read,
+};
+
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+       struct bch_fs *c = inode->i_private;
+       struct dump_iter *i;
+
+       i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+
+       if (!i)
+               return -ENOMEM;
+
+       i->iter = 0;
+       i->c    = c;
+       i->buf  = PRINTBUF;
+       file->private_data = i;
+
+       return 0;
+}
+
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+       struct dump_iter *i = file->private_data;
+
+       printbuf_exit(&i->buf);
+       kfree(i);
+
+       return 0;
+}
+
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+                                     size_t size, loff_t *ppos)
+{
+       struct dump_iter        *i = file->private_data;
+       struct bch_fs *c = i->c;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       while (1) {
+               struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
 
                err = flush_buf(i);
                if (err)
-                       break;
+                       return err;
 
                if (!i->size)
                        break;
+
+               if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+                   !bch2_btree_transaction_fns[i->iter])
+                       break;
+
+               prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
+               prt_newline(&i->buf);
+               printbuf_indent_add(&i->buf, 2);
+
+               mutex_lock(&s->lock);
+
+               prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+               prt_newline(&i->buf);
+
+               if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+                       prt_printf(&i->buf, "Lock hold times:");
+                       prt_newline(&i->buf);
+
+                       printbuf_indent_add(&i->buf, 2);
+                       bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+                       printbuf_indent_sub(&i->buf, 2);
+               }
+
+               if (s->max_paths_text) {
+                       prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
+                       prt_newline(&i->buf);
+
+                       printbuf_indent_add(&i->buf, 2);
+                       prt_str_indented(&i->buf, s->max_paths_text);
+                       printbuf_indent_sub(&i->buf, 2);
+               }
+
+               mutex_unlock(&s->lock);
+
+               printbuf_indent_sub(&i->buf, 2);
+               prt_newline(&i->buf);
+               i->iter++;
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
-       bch2_trans_exit(&trans);
+       if (i->buf.allocation_failure)
+               return -ENOMEM;
 
-       return err < 0 ? err : i->ret;
+       return i->ret;
 }
 
-static const struct file_operations bfloat_failed_debug_ops = {
+static const struct file_operations lock_held_stats_op = {
+       .owner = THIS_MODULE,
+       .open = lock_held_stats_open,
+       .release = lock_held_stats_release,
+       .read = lock_held_stats_read,
+};
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+                                           size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct bch_fs *c = i->c;
+       struct btree_trans *trans;
+       ssize_t ret = 0;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       if (i->iter)
+               goto out;
+
+       mutex_lock(&c->btree_trans_lock);
+       list_for_each_entry(trans, &c->btree_trans_list, list) {
+               if (trans->locking_wait.task->pid <= i->iter)
+                       continue;
+
+               ret = flush_buf(i);
+               if (ret)
+                       return ret;
+
+               bch2_check_for_deadlock(trans, &i->buf);
+
+               i->iter = trans->locking_wait.task->pid;
+       }
+       mutex_unlock(&c->btree_trans_lock);
+out:
+       if (i->buf.allocation_failure)
+               ret = -ENOMEM;
+
+       if (!ret)
+               ret = flush_buf(i);
+
+       return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
        .owner          = THIS_MODULE,
        .open           = bch2_dump_open,
        .release        = bch2_dump_release,
-       .read           = bch2_read_bfloat_failed,
+       .read           = bch2_btree_deadlock_read,
 };
 
 void bch2_fs_debug_exit(struct bch_fs *c)
 {
-       if (!IS_ERR_OR_NULL(c->debug))
-               debugfs_remove_recursive(c->debug);
+       if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+               debugfs_remove_recursive(c->fs_debug_dir);
 }
 
 void bch2_fs_debug_init(struct bch_fs *c)
@@ -437,29 +749,48 @@ void bch2_fs_debug_init(struct bch_fs *c)
                return;
 
        snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
-       c->debug = debugfs_create_dir(name, bch_debug);
-       if (IS_ERR_OR_NULL(c->debug))
+       c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+       if (IS_ERR_OR_NULL(c->fs_debug_dir))
+               return;
+
+       debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+                           c->btree_debug, &cached_btree_nodes_ops);
+
+       debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+                           c->btree_debug, &btree_transactions_ops);
+
+       debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+                           c->btree_debug, &journal_pins_ops);
+
+       debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+                           c, &lock_held_stats_op);
+
+       debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+                           c->btree_debug, &btree_deadlock_ops);
+
+       c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+       if (IS_ERR_OR_NULL(c->btree_debug_dir))
                return;
 
        for (bd = c->btree_debug;
             bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
             bd++) {
                bd->id = bd - c->btree_debug;
-               bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
-                                               0400, c->debug, bd,
-                                               &btree_debug_ops);
+               debugfs_create_file(bch2_btree_ids[bd->id],
+                                   0400, c->btree_debug_dir, bd,
+                                   &btree_debug_ops);
 
                snprintf(name, sizeof(name), "%s-formats",
                         bch2_btree_ids[bd->id]);
 
-               bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
-                                                      &btree_format_debug_ops);
+               debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+                                   &btree_format_debug_ops);
 
                snprintf(name, sizeof(name), "%s-bfloat-failed",
                         bch2_btree_ids[bd->id]);
 
-               bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
-                                                &bfloat_failed_debug_ops);
+               debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+                                   &bfloat_failed_debug_ops);
        }
 }
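
The debug.c hunks above converge on one pattern: format into a struct printbuf, then drain it to the reader with the reworked flush_buf(), which copies out as much as fits, keeps the tail for the next call, and returns nonzero once the destination is full so the enclosing loop stops. Below is a minimal userspace sketch of that pattern; sketchbuf, sketch_iter and the fixed 4 KiB buffer are invented stand-ins, and memcpy() replaces copy_to_user(), which in the kernel can additionally fail.

        /*
         * Userspace sketch (not part of the patch) of the flush_buf() pattern:
         * format into an in-memory buffer, hand the reader as much as was
         * requested, keep the remainder for the next read.
         */
        #include <stdio.h>
        #include <string.h>

        struct sketchbuf {
                char    buf[4096];
                size_t  pos;            /* formatted bytes waiting to go out */
        };

        struct sketch_iter {
                struct sketchbuf buf;
                char            *ubuf;  /* destination buffer */
                size_t          size;   /* space remaining in destination */
                size_t          ret;    /* bytes copied so far */
        };

        /* Returns 0 while the destination still has room, i->ret once it is full. */
        static size_t flush_buf(struct sketch_iter *i)
        {
                if (i->buf.pos) {
                        size_t bytes = i->buf.pos < i->size ? i->buf.pos : i->size;

                        memcpy(i->ubuf, i->buf.buf, bytes);
                        i->ret     += bytes;
                        i->ubuf    += bytes;
                        i->size    -= bytes;
                        i->buf.pos -= bytes;
                        memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
                }
                return i->size ? 0 : i->ret;
        }

        int main(void)
        {
                char out[16];
                struct sketch_iter i = { .ubuf = out, .size = sizeof(out) };

                i.buf.pos = (size_t) snprintf(i.buf.buf, sizeof(i.buf.buf),
                                              "formatted debug output line\n");

                while (i.buf.pos && !flush_buf(&i))
                        ;       /* real code would refill i.buf between flushes */

                fwrite(out, 1, i.ret, stdout);
                return 0;
        }
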
 
index 6f699b736b348e366b6c1e43567cf7a65c78b8a4..288f46b55876b72bf0ca004904a6d67452956a90 100644 (file)
@@ -83,38 +83,58 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
        .is_visible     = dirent_is_visible,
 };
 
-const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                       int rw, struct printbuf *err)
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
        unsigned len;
 
-       if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
-               return "value too small";
+       if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*d.v));
+               return -EINVAL;
+       }
 
        len = bch2_dirent_name_bytes(d);
-       if (!len)
-               return "empty name";
+       if (!len) {
+               prt_printf(err, "empty name");
+               return -EINVAL;
+       }
 
-       if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
-               return "value too big";
+       if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) {
+               prt_printf(err, "value too big (%zu > %u)",
+                      bkey_val_u64s(k.k), dirent_val_u64s(len));
+               return -EINVAL;
+       }
 
-       if (len > BCH_NAME_MAX)
-               return "dirent name too big";
+       if (len > BCH_NAME_MAX) {
+               prt_printf(err, "dirent name too big (%u > %u)",
+                      len, BCH_NAME_MAX);
+               return -EINVAL;
+       }
 
-       if (len == 1 && !memcmp(d.v->d_name, ".", 1))
-               return "invalid name";
+       if (len == 1 && !memcmp(d.v->d_name, ".", 1)) {
+               prt_printf(err, "invalid name");
+               return -EINVAL;
+       }
 
-       if (len == 2 && !memcmp(d.v->d_name, "..", 2))
-               return "invalid name";
+       if (len == 2 && !memcmp(d.v->d_name, "..", 2)) {
+               prt_printf(err, "invalid name");
+               return -EINVAL;
+       }
 
-       if (memchr(d.v->d_name, '/', len))
-               return "invalid name";
+       if (memchr(d.v->d_name, '/', len)) {
+               prt_printf(err, "invalid name");
+               return -EINVAL;
+       }
 
        if (d.v->d_type != DT_SUBVOL &&
-           le64_to_cpu(d.v->d_inum) == d.k->p.inode)
-               return "dirent points to own directory";
+           le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
+               prt_printf(err, "dirent points to own directory");
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
@@ -122,9 +142,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
 
-       bch_scnmemcpy(out, d.v->d_name,
-                     bch2_dirent_name_bytes(d));
-       pr_buf(out, " -> %llu type %s",
+       prt_printf(out, "%.*s -> %llu type %s",
+              bch2_dirent_name_bytes(d),
+              d.v->d_name,
               d.v->d_type != DT_SUBVOL
               ? le64_to_cpu(d.v->d_inum)
               : le32_to_cpu(d.v->d_child_subvol),
@@ -451,7 +471,7 @@ retry:
 
        ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
                                          name, inum, 0);
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (!ret)
                bch2_trans_iter_exit(&trans, &iter);
@@ -470,16 +490,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
        if (ret)
                return ret;
 
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
-                          SPOS(dir.inum, 0, snapshot), 0, k, ret) {
-               if (k.k->p.inode > dir.inum)
-                       break;
-
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+                          SPOS(dir.inum, 0, snapshot),
+                          POS(dir.inum, U64_MAX), 0, k, ret)
                if (k.k->type == KEY_TYPE_dirent) {
                        ret = -ENOTEMPTY;
                        break;
                }
-       }
        bch2_trans_iter_exit(trans, &iter);
 
        return ret;
@@ -503,11 +520,9 @@ retry:
        if (ret)
                goto err;
 
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
-                          SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
-               if (k.k->p.inode > inum.inum)
-                       break;
-
+       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+                          SPOS(inum.inum, ctx->pos, snapshot),
+                          POS(inum.inum, U64_MAX), 0, k, ret) {
                if (k.k->type != KEY_TYPE_dirent)
                        continue;
 
@@ -541,7 +556,7 @@ retry:
        }
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
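
The dirent.c changes above are part of the tree-wide move from key validators that return a static error string (or NULL on success) to validators that take a struct printbuf and return an errno, so the message can carry the offending values. A small userspace sketch of the new shape, with an invented dirent_val type and a plain char buffer standing in for struct printbuf:

        /* Sketch only: names and limits below are illustrative, not bcachefs's. */
        #include <errno.h>
        #include <stdio.h>
        #include <string.h>

        struct errbuf {
                char    buf[256];       /* stands in for struct printbuf */
        };

        struct dirent_val {
                unsigned        name_len;
                char            name[64];
        };

        #define NAME_MAX_LEN 32         /* stands in for BCH_NAME_MAX */

        static int dirent_invalid(const struct dirent_val *d, struct errbuf *err)
        {
                if (!d->name_len) {
                        snprintf(err->buf, sizeof(err->buf), "empty name");
                        return -EINVAL;
                }

                if (d->name_len > NAME_MAX_LEN) {
                        snprintf(err->buf, sizeof(err->buf),
                                 "dirent name too big (%u > %u)",
                                 d->name_len, NAME_MAX_LEN);
                        return -EINVAL;
                }

                if (memchr(d->name, '/', d->name_len)) {
                        snprintf(err->buf, sizeof(err->buf), "invalid name");
                        return -EINVAL;
                }

                return 0;
        }

        int main(void)
        {
                struct dirent_val d = { .name_len = 40, .name = "x" };
                struct errbuf err = { "" };

                if (dirent_invalid(&d, &err))
                        printf("invalid dirent: %s\n", err.buf);
                return 0;
        }
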
index 1bb4d802bc1db1ea8bb79a454074a51afb595324..b1466932c76873c2d326f9d87507195e26fcb769 100644 (file)
@@ -6,7 +6,7 @@
 
 extern const struct bch_hash_desc bch2_dirent_hash_desc;
 
-const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
 void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_dirent (struct bkey_ops) {       \
index 6c84297ef265f0810f71f3288a8d8fbab69e936e..6b81f35861aca327dd6b50e42b2194edf8fc7cec 100644 (file)
@@ -39,13 +39,13 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
                g = BCH_MEMBER_GROUP(m) - 1;
 
                if (g >= nr_groups) {
-                       pr_buf(err, "disk %u has invalid label %u (have %u)",
+                       prt_printf(err, "disk %u has invalid label %u (have %u)",
                               i, g, nr_groups);
                        return -EINVAL;
                }
 
                if (BCH_GROUP_DELETED(&groups->entries[g])) {
-                       pr_buf(err, "disk %u has deleted label %u", i, g);
+                       prt_printf(err, "disk %u has deleted label %u", i, g);
                        return -EINVAL;
                }
        }
@@ -61,7 +61,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
 
                len = strnlen(g->label, sizeof(g->label));
                if (!len) {
-                       pr_buf(err, "label %u empty", i);
+                       prt_printf(err, "label %u empty", i);
                        return -EINVAL;
                }
        }
@@ -76,8 +76,9 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
        for (g = sorted; g + 1 < sorted + nr_groups; g++)
                if (!BCH_GROUP_DELETED(g) &&
                    !group_cmp(&g[0], &g[1])) {
-                       pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g));
-                       bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label)));
+                       prt_printf(err, "duplicate label %llu.%.*s",
+                              BCH_GROUP_PARENT(g),
+                              (int) sizeof(g->label), g->label);
                        goto err;
                }
 
@@ -100,12 +101,12 @@ static void bch2_sb_disk_groups_to_text(struct printbuf *out,
             g < groups->entries + nr_groups;
             g++) {
                if (g != groups->entries)
-                       pr_buf(out, " ");
+                       prt_printf(out, " ");
 
                if (BCH_GROUP_DELETED(g))
-                       pr_buf(out, "[deleted]");
+                       prt_printf(out, "[deleted]");
                else
-                       pr_buf(out, "[parent %llu name %s]",
+                       prt_printf(out, "[parent %llu name %s]",
                               BCH_GROUP_PARENT(g), g->label);
        }
 }
@@ -275,7 +276,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
 
                groups = bch2_sb_resize_disk_groups(sb, u64s);
                if (!groups)
-                       return -ENOSPC;
+                       return -BCH_ERR_ENOSPC_disk_label_add;
 
                nr_groups = disk_groups_nr(groups);
        }
@@ -342,12 +343,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
        return v;
 }
 
-void bch2_disk_path_to_text(struct printbuf *out,
-                           struct bch_sb_handle *sb,
-                           unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
 {
        struct bch_sb_field_disk_groups *groups =
-               bch2_sb_get_disk_groups(sb->sb);
+               bch2_sb_get_disk_groups(sb);
        struct bch_disk_group *g;
        unsigned nr = 0;
        u16 path[32];
@@ -376,43 +375,43 @@ void bch2_disk_path_to_text(struct printbuf *out,
                v = path[--nr];
                g = groups->entries + v;
 
-               bch_scnmemcpy(out, g->label,
-                             strnlen(g->label, sizeof(g->label)));
-
+               prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
                if (nr)
-                       pr_buf(out, ".");
+                       prt_printf(out, ".");
        }
        return;
 inval:
-       pr_buf(out, "invalid group %u", v);
+       prt_printf(out, "invalid label %u", v);
 }
 
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
 {
        struct bch_member *mi;
-       int v = -1;
-       int ret = 0;
-
-       mutex_lock(&c->sb_lock);
+       int ret, v = -1;
 
        if (!strlen(name) || !strcmp(name, "none"))
-               goto write_sb;
+               return 0;
 
        v = bch2_disk_path_find_or_create(&c->disk_sb, name);
-       if (v < 0) {
-               mutex_unlock(&c->sb_lock);
+       if (v < 0)
                return v;
-       }
 
        ret = bch2_sb_disk_groups_to_cpu(c);
        if (ret)
-               goto unlock;
-write_sb:
+               return ret;
+
        mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
        SET_BCH_MEMBER_GROUP(mi, v + 1);
+       return 0;
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+       int ret;
 
-       bch2_write_super(c);
-unlock:
+       mutex_lock(&c->sb_lock);
+       ret = __bch2_dev_group_set(c, ca, name) ?:
+               bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
        return ret;
@@ -448,41 +447,57 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
        return -EINVAL;
 }
 
-void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
+void bch2_opt_target_to_text(struct printbuf *out,
+                            struct bch_fs *c,
+                            struct bch_sb *sb,
+                            u64 v)
 {
        struct target t = target_decode(v);
 
        switch (t.type) {
        case TARGET_NULL:
-               pr_buf(out, "none");
+               prt_printf(out, "none");
                break;
-       case TARGET_DEV: {
-               struct bch_dev *ca;
-
-               rcu_read_lock();
-               ca = t.dev < c->sb.nr_devices
-                       ? rcu_dereference(c->devs[t.dev])
-                       : NULL;
-
-               if (ca && percpu_ref_tryget(&ca->io_ref)) {
-                       char b[BDEVNAME_SIZE];
-
-                       pr_buf(out, "/dev/%s",
-                            bdevname(ca->disk_sb.bdev, b));
-                       percpu_ref_put(&ca->io_ref);
-               } else if (ca) {
-                       pr_buf(out, "offline device %u", t.dev);
+       case TARGET_DEV:
+               if (c) {
+                       struct bch_dev *ca;
+
+                       rcu_read_lock();
+                       ca = t.dev < c->sb.nr_devices
+                               ? rcu_dereference(c->devs[t.dev])
+                               : NULL;
+
+                       if (ca && percpu_ref_tryget(&ca->io_ref)) {
+                               prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+                               percpu_ref_put(&ca->io_ref);
+                       } else if (ca) {
+                               prt_printf(out, "offline device %u", t.dev);
+                       } else {
+                               prt_printf(out, "invalid device %u", t.dev);
+                       }
+
+                       rcu_read_unlock();
                } else {
-                       pr_buf(out, "invalid device %u", t.dev);
+                       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+                       struct bch_member *m = mi->members + t.dev;
+
+                       if (bch2_dev_exists(sb, mi, t.dev)) {
+                               prt_printf(out, "Device ");
+                               pr_uuid(out, m->uuid.b);
+                               prt_printf(out, " (%u)", t.dev);
+                       } else {
+                               prt_printf(out, "Bad device %u", t.dev);
+                       }
                }
-
-               rcu_read_unlock();
                break;
-       }
        case TARGET_GROUP:
-               mutex_lock(&c->sb_lock);
-               bch2_disk_path_to_text(out, &c->disk_sb, t.group);
-               mutex_unlock(&c->sb_lock);
+               if (c) {
+                       mutex_lock(&c->sb_lock);
+                       bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
+                       mutex_unlock(&c->sb_lock);
+               } else {
+                       bch2_disk_path_to_text(out, sb, t.group);
+               }
                break;
        default:
                BUG();
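
bch2_dev_group_set() above now chains its steps with GNU C's a ?: b, which evaluates a once and only falls back to b when a is zero, so the first failing step's error code is what gets returned. A tiny standalone illustration with invented step functions:

        /* Sketch of the a ?: b error-chaining idiom (GNU C extension). */
        #include <stdio.h>

        static int step_one(void) { return 0; }        /* succeeds */
        static int step_two(void) { return -5; }       /* fails */

        int main(void)
        {
                int ret = step_one() ?: step_two();

                printf("ret = %d\n", ret);      /* prints -5: first nonzero result */
                return 0;
        }
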
index 3d84f23c34ed42ab2b13e7be4beafddceb3f7fbd..e4470c357a66b8ee6ae2d72526a558e23edee249 100644 (file)
@@ -75,14 +75,14 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
 /* Exported for userspace bcachefs-tools: */
 int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
 
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
-                           unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
 
 int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
 
 int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
 
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
 int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
 
 const char *bch2_sb_validate_disk_groups(struct bch_sb *,
index 9b45640e75dc1ed6194b969cc1eca9ecccdf3465..dfe37965d5165c6d823b99a253a04b9f6870528c 100644 (file)
@@ -4,6 +4,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "backpointers.h"
 #include "bkey_buf.h"
 #include "bset.h"
 #include "btree_gc.h"
@@ -102,24 +103,34 @@ struct ec_bio {
 
 /* Stripes btree keys: */
 
-const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                       int rw, struct printbuf *err)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
 
-       if (!bkey_cmp(k.k->p, POS_MIN))
-               return "stripe at pos 0";
+       if (!bkey_cmp(k.k->p, POS_MIN)) {
+               prt_printf(err, "stripe at POS_MIN");
+               return -EINVAL;
+       }
 
-       if (k.k->p.inode)
-               return "invalid stripe key";
+       if (k.k->p.inode) {
+               prt_printf(err, "nonzero inode field");
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) < sizeof(*s))
-               return "incorrect value size";
+       if (bkey_val_bytes(k.k) < sizeof(*s)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*s));
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) < sizeof(*s) ||
-           bkey_val_u64s(k.k) < stripe_val_u64s(s))
-               return "incorrect value size";
+       if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
+               prt_printf(err, "incorrect value size (%zu < %u)",
+                      bkey_val_u64s(k.k), stripe_val_u64s(s));
+               return -EINVAL;
+       }
 
-       return bch2_bkey_ptrs_invalid(c, k);
+       return bch2_bkey_ptrs_invalid(c, k, rw, err);
 }
 
 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -128,7 +139,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned i;
 
-       pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+       prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
               s->algorithm,
               le16_to_cpu(s->sectors),
               s->nr_blocks - s->nr_redundant,
@@ -137,7 +148,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
               1U << s->csum_granularity_bits);
 
        for (i = 0; i < s->nr_blocks; i++)
-               pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
+               prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev,
                       (u64) s->ptrs[i].offset,
                       stripe_blockcount_get(s, i));
 }
@@ -286,14 +297,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                        struct bch_csum got = ec_block_checksum(buf, i, offset);
 
                        if (bch2_crc_cmp(want, got)) {
-                               char buf2[200];
+                               struct printbuf buf2 = PRINTBUF;
 
-                               bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+                               bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i));
 
                                bch_err_ratelimited(c,
                                        "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
                                        (void *) _RET_IP_, i, j, v->csum_type,
-                                       want.lo, got.lo, buf2);
+                                       want.lo, got.lo, buf2.buf);
+                               printbuf_exit(&buf2);
                                clear_bit(i, buf->valid);
                                break;
                        }
@@ -401,7 +413,10 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                                   nr_iovecs << PAGE_SHIFT);
                struct ec_bio *ec_bio;
 
-               ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
+               ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+                                                      nr_iovecs,
+                                                      rw,
+                                                      GFP_KERNEL,
                                                       &c->ec_bioset),
                                      struct ec_bio, bio);
 
@@ -409,9 +424,6 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                ec_bio->buf                     = buf;
                ec_bio->idx                     = idx;
 
-               bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
-               bio_set_op_attrs(&ec_bio->bio, rw, 0);
-
                ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
                ec_bio->bio.bi_end_io           = ec_block_endio;
                ec_bio->bio.bi_private          = cl;
@@ -561,18 +573,14 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
                               struct btree_iter *iter)
 {
        size_t idx = iter->pos.offset;
-       int ret = 0;
 
        if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
-               return ret;
+               return 0;
 
        bch2_trans_unlock(trans);
-       ret = -EINTR;
 
-       if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
-               return ret;
-
-       return -ENOMEM;
+       return   __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?:
+               bch2_trans_relock(trans);
 }
 
 static ssize_t stripe_idx_to_delete(struct bch_fs *c)
@@ -715,7 +723,7 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans,
        struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
        int ret;
 
-       for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos,
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
                        if (start_pos.offset) {
@@ -724,17 +732,18 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans,
                                continue;
                        }
 
-                       ret = -ENOSPC;
+                       ret = -BCH_ERR_ENOSPC_stripe_create;
                        break;
                }
 
                if (bkey_deleted(k.k))
-                       goto found_slot;
+                       break;
        }
 
-       goto err;
-found_slot:
-       start_pos = iter.pos;
+       c->ec_stripe_hint = iter.pos.offset;
+
+       if (ret)
+               goto err;
 
        ret = ec_stripe_mem_alloc(trans, &iter);
        if (ret)
@@ -743,8 +752,6 @@ found_slot:
        stripe->k.p = iter.pos;
 
        ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0);
-
-       c->ec_stripe_hint = start_pos.offset;
 err:
        bch2_trans_iter_exit(trans, &iter);
 
@@ -811,78 +818,111 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
        };
 }
 
-static int ec_stripe_update_ptrs(struct bch_fs *c,
-                                struct ec_stripe_buf *s,
-                                struct bkey *pos)
+static int ec_stripe_update_extent(struct btree_trans *trans,
+                                  struct btree_iter *iter,
+                                  struct bkey_s_c k,
+                                  struct ec_stripe_buf *s)
 {
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_s_extent e;
-       struct bkey_buf sk;
-       struct bpos next_pos;
-       int ret = 0, dev, block;
+       const struct bch_extent_ptr *ptr_c;
+       struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+       struct bkey_i *n;
+       int ret, dev, block;
+
+       if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+               return 0;
+
+       ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
+       /*
+        * It doesn't generally make sense to erasure code cached ptrs:
+        * XXX: should we be incrementing a counter?
+        */
+       if (!ptr_c || ptr_c->cached)
+               return 0;
+
+       dev = s->key.v.ptrs[block].dev;
+
+       n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+       ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               return ret;
+
+       bkey_reassemble(n, k);
 
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+       bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+       ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev);
+       BUG_ON(!ec_ptr);
 
-       /* XXX this doesn't support the reflink btree */
+       extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            bkey_start_pos(pos),
-                            BTREE_ITER_INTENT);
+       return bch2_trans_update(trans, iter, n, 0);
+}
+
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+                                  unsigned block)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_extent_ptr bucket = s->key.v.ptrs[block];
+       struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+       struct bch_backpointer bp;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 bp_offset = 0;
+       int ret = 0;
 retry:
-       while (bch2_trans_begin(&trans),
-              (k = bch2_btree_iter_peek(&iter)).k &&
-              !(ret = bkey_err(k)) &&
-              bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
-               const struct bch_extent_ptr *ptr_c;
-               struct bch_extent_ptr *ptr, *ec_ptr = NULL;
-
-               if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
-                       bch2_btree_iter_advance(&iter);
-                       continue;
+       while (1) {
+               bch2_trans_begin(trans);
+
+               ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen,
+                                               &bp_offset, &bp,
+                                               BTREE_ITER_CACHED);
+               if (ret)
+                       break;
+               if (bp_offset == U64_MAX)
+                       break;
+
+               if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) {
+                       ret = -EIO;
+                       break;
                }
 
-               ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
-               /*
-                * It doesn't generally make sense to erasure code cached ptrs:
-                * XXX: should we be incrementing a counter?
-                */
-               if (!ptr_c || ptr_c->cached) {
-                       bch2_btree_iter_advance(&iter);
+               k = bch2_backpointer_get_key(trans, &iter, bucket_pos, bp_offset, bp);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+               if (!k.k)
                        continue;
-               }
 
-               dev = s->key.v.ptrs[block].dev;
+               ret = ec_stripe_update_extent(trans, &iter, k, s);
+               bch2_trans_iter_exit(trans, &iter);
+               if (ret)
+                       break;
 
-               bch2_bkey_buf_reassemble(&sk, c, k);
-               e = bkey_i_to_s_extent(sk.k);
+               bp_offset++;
+       }
 
-               bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
-               ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
-               BUG_ON(!ec_ptr);
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
 
-               extent_stripe_ptr_add(e, s, ec_ptr, block);
+       return ret;
+}
 
-               bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
-               next_pos = sk.k->k.p;
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+       struct btree_trans trans;
+       struct bch_stripe *v = &s->key.v;
+       unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+       int ret = 0;
 
-               ret   = bch2_btree_iter_traverse(&iter) ?:
-                       bch2_trans_update(&trans, &iter, sk.k, 0) ?:
-                       bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL);
-               if (!ret)
-                       bch2_btree_iter_set_pos(&iter, next_pos);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for (i = 0; i < nr_data; i++) {
+               ret = ec_stripe_update_bucket(&trans, s, i);
                if (ret)
                        break;
        }
-       if (ret == -EINTR)
-               goto retry;
-       bch2_trans_iter_exit(&trans, &iter);
+
 
        bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -894,7 +934,6 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 {
        struct bch_fs *c = s->c;
        struct open_bucket *ob;
-       struct bkey_i *k;
        struct stripe *m;
        struct bch_stripe *v = &s->new_stripe.key.v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
@@ -928,7 +967,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        BUG_ON(!s->allocated);
 
-       if (!percpu_ref_tryget(&c->writes))
+       if (!percpu_ref_tryget_live(&c->writes))
                goto err;
 
        ec_generate_ec(&s->new_stripe);
@@ -954,13 +993,10 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                goto err_put_writes;
        }
 
-       for_each_keylist_key(&s->keys, k) {
-               ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
-               if (ret) {
-                       bch_err(c, "error creating stripe: error %i updating pointers", ret);
-                       break;
-               }
-       }
+       ret = ec_stripe_update_extents(c, &s->new_stripe);
+       if (ret)
+               bch_err(c, "error creating stripe: error updating pointers: %s",
+                       bch2_err_str(ret));
 
        spin_lock(&c->ec_stripes_heap_lock);
        m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset);
@@ -985,8 +1021,6 @@ err:
                        }
                }
 
-       bch2_keylist_free(&s->keys, s->inline_keys);
-
        ec_stripe_buf_exit(&s->existing_stripe);
        ec_stripe_buf_exit(&s->new_stripe);
        closure_debug_destroy(&s->iodone);
@@ -1069,30 +1103,6 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
        return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
 }
 
-void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob,
-                            struct bkey *k)
-{
-       struct ec_stripe_new *ec = ob->ec;
-
-       if (!ec)
-               return;
-
-       mutex_lock(&ec->lock);
-
-       if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
-                                ARRAY_SIZE(ec->inline_keys),
-                                BKEY_U64s)) {
-               BUG();
-       }
-
-       bkey_init(&ec->keys.top->k);
-       ec->keys.top->k.p       = k->p;
-       ec->keys.top->k.size    = k->size;
-       bch2_keylist_push(&ec->keys);
-
-       mutex_unlock(&ec->lock);
-}
-
 static int unsigned_cmp(const void *_l, const void *_r)
 {
        unsigned l = *((const unsigned *) _l);
@@ -1185,8 +1195,6 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
                                BCH_BKEY_PTRS_MAX) - h->redundancy;
        s->nr_parity    = h->redundancy;
 
-       bch2_keylist_init(&s->keys, s->inline_keys);
-
        ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
                           s->nr_parity, h->blocksize);
 
@@ -1294,9 +1302,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
        BUG_ON(nr_have_data     > h->s->nr_data);
        BUG_ON(nr_have_parity   > h->s->nr_parity);
 
-       percpu_down_read(&c->mark_lock);
-       rcu_read_lock();
-
        buckets.nr = 0;
        if (nr_have_parity < h->s->nr_parity) {
                ret = bch2_bucket_alloc_set(c, &buckets,
@@ -1306,8 +1311,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                                            &nr_have_parity,
                                            &have_cache,
                                            h->copygc
-                                           ? RESERVE_MOVINGGC
-                                           : RESERVE_NONE,
+                                           ? RESERVE_movinggc
+                                           : RESERVE_none,
                                            0,
                                            cl);
 
@@ -1323,7 +1328,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                }
 
                if (ret)
-                       goto err;
+                       return ret;
        }
 
        buckets.nr = 0;
@@ -1335,8 +1340,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                                            &nr_have_data,
                                            &have_cache,
                                            h->copygc
-                                           ? RESERVE_MOVINGGC
-                                           : RESERVE_NONE,
+                                           ? RESERVE_movinggc
+                                           : RESERVE_none,
                                            0,
                                            cl);
 
@@ -1351,12 +1356,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                }
 
                if (ret)
-                       goto err;
+                       return ret;
        }
-err:
-       rcu_read_unlock();
-       percpu_up_read(&c->mark_lock);
-       return ret;
+
+       return 0;
 }
 
 /* XXX: doesn't obey target: */
@@ -1402,10 +1405,8 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
        int ret;
 
        idx = get_existing_stripe(c, h);
-       if (idx < 0) {
-               bch_err(c, "failed to find an existing stripe");
-               return -ENOSPC;
-       }
+       if (idx < 0)
+               return -BCH_ERR_ENOSPC_stripe_reuse;
 
        h->s->have_existing_stripe = true;
        ret = get_stripe_key(c, idx, &h->s->existing_stripe);
@@ -1443,21 +1444,9 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
 static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
                                                        struct ec_stripe_head *h)
 {
-       int ret;
-
-       ret = bch2_disk_reservation_get(c, &h->s->res,
-                       h->blocksize,
-                       h->s->nr_parity, 0);
-
-       if (ret) {
-               /*
-                * This means we need to wait for copygc to
-                * empty out buckets from existing stripes:
-                */
-               bch_err(c, "failed to reserve stripe");
-       }
-
-       return ret;
+       return bch2_disk_reservation_get(c, &h->s->res,
+                                        h->blocksize,
+                                        h->s->nr_parity, 0);
 }
 
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
@@ -1499,8 +1488,10 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                ret = __bch2_ec_stripe_head_reserve(c, h);
        if (ret && needs_stripe_new)
                ret = __bch2_ec_stripe_head_reuse(c, h);
-       if (ret)
+       if (ret) {
+               bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret));
                goto err;
+       }
 
        if (!h->s->allocated) {
                ret = new_stripe_alloc_buckets(c, h, cl);
@@ -1616,7 +1607,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
        for (i = 0; i < min_t(size_t, h->used, 20); i++) {
                m = genradix_ptr(&c->stripes, h->data[i].idx);
 
-               pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+               prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx,
                       h->data[i].blocks_nonempty,
                       m->nr_blocks - m->nr_redundant,
                       m->nr_redundant);
@@ -1631,11 +1622,11 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-               pr_buf(out, "target %u algo %u redundancy %u:\n",
+               prt_printf(out, "target %u algo %u redundancy %u:\n",
                       h->target, h->algo, h->redundancy);
 
                if (h->s)
-                       pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+                       prt_printf(out, "\tpending: blocks %u+%u allocated %u\n",
                               h->s->nr_data, h->s->nr_parity,
                               bitmap_weight(h->s->blocks_allocated,
                                             h->s->nr_data));
@@ -1644,7 +1635,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list) {
-               pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+               prt_printf(out, "\tin flight: blocks %u+%u pin %u\n",
                       s->nr_data, s->nr_parity,
                       atomic_read(&s->pin));
        }
@@ -1676,11 +1667,14 @@ void bch2_fs_ec_exit(struct bch_fs *c)
        bioset_exit(&c->ec_bioset);
 }
 
-int bch2_fs_ec_init(struct bch_fs *c)
+void bch2_fs_ec_init_early(struct bch_fs *c)
 {
        INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
        INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
 
+int bch2_fs_ec_init(struct bch_fs *c)
+{
        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
 }
index 78d468c7680a2f167070297392dbcabba1204f95..3e2b22c00a3e7adf19f166b1254452c79af001d7 100644 (file)
@@ -4,9 +4,9 @@
 
 #include "ec_types.h"
 #include "buckets_types.h"
-#include "keylist_types.h"
 
-const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+                       int rw, struct printbuf *);
 void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
                         struct bkey_s_c);
 
@@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
        .key_invalid    = bch2_stripe_invalid,          \
        .val_to_text    = bch2_stripe_to_text,          \
        .swab           = bch2_ptr_swab,                \
+       .trans_trigger  = bch2_trans_mark_stripe,       \
+       .atomic_trigger = bch2_mark_stripe,             \
 }
 
 static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
@@ -163,9 +165,6 @@ struct ec_stripe_new {
        open_bucket_idx_t       blocks[BCH_BKEY_PTRS_MAX];
        struct disk_reservation res;
 
-       struct keylist          keys;
-       u64                     inline_keys[BKEY_U64s * 8];
-
        struct ec_stripe_buf    new_stripe;
        struct ec_stripe_buf    existing_stripe;
 };
@@ -193,8 +192,6 @@ struct ec_stripe_head {
 int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
 
 void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *,
-                            struct bkey *);
 
 void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
 void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
@@ -221,6 +218,7 @@ void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
 void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
 int bch2_fs_ec_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_EC_H */
diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c
new file mode 100644 (file)
index 0000000..cc9ce0b
--- /dev/null
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+       BCH_ERRCODES()
+#undef x
+       NULL
+};
+
+#define BCH_ERR_0      0
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+       BCH_ERRCODES()
+#undef x
+};
+
+const char *bch2_err_str(int err)
+{
+       const char *errstr;
+       err = abs(err);
+
+       BUG_ON(err >= BCH_ERR_MAX);
+
+       if (err >= BCH_ERR_START)
+               errstr = bch2_errcode_strs[err - BCH_ERR_START];
+       else if (err)
+               errstr = errname(err);
+       else
+               errstr = "(No error)";
+       return errstr ?: "(Invalid error)";
+}
+
+bool __bch2_err_matches(int err, int class)
+{
+       err     = abs(err);
+       class   = abs(class);
+
+       BUG_ON(err      >= BCH_ERR_MAX);
+       BUG_ON(class    >= BCH_ERR_MAX);
+
+       while (err >= BCH_ERR_START && err != class)
+               err = bch2_errcode_parents[err - BCH_ERR_START];
+
+       return err == class;
+}
+
+int __bch2_err_class(int err)
+{
+       err = -err;
+       BUG_ON((unsigned) err >= BCH_ERR_MAX);
+
+       while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
+               err = bch2_errcode_parents[err - BCH_ERR_START];
+
+       return -err;
+}
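
Both tables are generated from the same BCH_ERRCODES() x-macro (defined in errcode.h below), so every private error code gets a printable name and a parent class. Hand-expanding a single entry such as x(BCH_ERR_transaction_restart, transaction_restart_relock) for illustration:

	/* in bch2_errcode_strs[]: */
	[BCH_ERR_transaction_restart_relock - BCH_ERR_START] = "transaction_restart_relock",

	/* in bch2_errcode_parents[]: */
	[BCH_ERR_transaction_restart_relock - BCH_ERR_START] = BCH_ERR_transaction_restart,

bch2_err_str() indexes the first table, while __bch2_err_matches() and __bch2_err_class() walk the second until they reach either the requested class or a plain errno.
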
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index f7d12915c1ccda5f7674dcb9fca7098a030f2bba..9f293040b25384d6823c4ccedc669ae63fbd4b71 100644 (file)
@@ -2,11 +2,96 @@
 #ifndef _BCACHEFS_ERRCODE_H
 #define _BCACHEFS_ERRCODE_H
 
-enum {
-       /* Bucket allocator: */
-       OPEN_BUCKETS_EMPTY =    2048,
-       FREELIST_EMPTY,         /* Allocator thread not keeping up */
-       INSUFFICIENT_DEVICES,
+#define BCH_ERRCODES()                                                         \
+       x(ENOSPC,                       ENOSPC_disk_reservation)                \
+       x(ENOSPC,                       ENOSPC_bucket_alloc)                    \
+       x(ENOSPC,                       ENOSPC_disk_label_add)                  \
+       x(ENOSPC,                       ENOSPC_stripe_create)                   \
+       x(ENOSPC,                       ENOSPC_stripe_reuse)                    \
+       x(ENOSPC,                       ENOSPC_inode_create)                    \
+       x(ENOSPC,                       ENOSPC_str_hash_create)                 \
+       x(ENOSPC,                       ENOSPC_snapshot_create)                 \
+       x(ENOSPC,                       ENOSPC_subvolume_create)                \
+       x(ENOSPC,                       ENOSPC_sb)                              \
+       x(ENOSPC,                       ENOSPC_sb_journal)                      \
+       x(ENOSPC,                       ENOSPC_sb_quota)                        \
+       x(ENOSPC,                       ENOSPC_sb_replicas)                     \
+       x(ENOSPC,                       ENOSPC_sb_members)                      \
+       x(0,                            open_buckets_empty)                     \
+       x(0,                            freelist_empty)                         \
+       x(BCH_ERR_freelist_empty,       no_buckets_found)                       \
+       x(0,                            insufficient_devices)                   \
+       x(0,                            transaction_restart)                    \
+       x(BCH_ERR_transaction_restart,  transaction_restart_fault_inject)       \
+       x(BCH_ERR_transaction_restart,  transaction_restart_relock)             \
+       x(BCH_ERR_transaction_restart,  transaction_restart_relock_path)        \
+       x(BCH_ERR_transaction_restart,  transaction_restart_relock_path_intent) \
+       x(BCH_ERR_transaction_restart,  transaction_restart_relock_after_fill)  \
+       x(BCH_ERR_transaction_restart,  transaction_restart_too_many_iters)     \
+       x(BCH_ERR_transaction_restart,  transaction_restart_lock_node_reused)   \
+       x(BCH_ERR_transaction_restart,  transaction_restart_fill_relock)        \
+       x(BCH_ERR_transaction_restart,  transaction_restart_fill_mem_alloc_fail)\
+       x(BCH_ERR_transaction_restart,  transaction_restart_mem_realloced)      \
+       x(BCH_ERR_transaction_restart,  transaction_restart_in_traverse_all)    \
+       x(BCH_ERR_transaction_restart,  transaction_restart_would_deadlock)     \
+       x(BCH_ERR_transaction_restart,  transaction_restart_would_deadlock_write)\
+       x(BCH_ERR_transaction_restart,  transaction_restart_deadlock_recursion_limit)\
+       x(BCH_ERR_transaction_restart,  transaction_restart_upgrade)            \
+       x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_upgrade)  \
+       x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_fill)     \
+       x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_raced)    \
+       x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_realloced)\
+       x(BCH_ERR_transaction_restart,  transaction_restart_journal_preres_get) \
+       x(BCH_ERR_transaction_restart,  transaction_restart_split_race)         \
+       x(BCH_ERR_transaction_restart,  transaction_restart_nested)             \
+       x(0,                            no_btree_node)                          \
+       x(BCH_ERR_no_btree_node,        no_btree_node_relock)                   \
+       x(BCH_ERR_no_btree_node,        no_btree_node_upgrade)                  \
+       x(BCH_ERR_no_btree_node,        no_btree_node_drop)                     \
+       x(BCH_ERR_no_btree_node,        no_btree_node_lock_root)                \
+       x(BCH_ERR_no_btree_node,        no_btree_node_up)                       \
+       x(BCH_ERR_no_btree_node,        no_btree_node_down)                     \
+       x(BCH_ERR_no_btree_node,        no_btree_node_init)                     \
+       x(BCH_ERR_no_btree_node,        no_btree_node_cached)                   \
+       x(0,                            backpointer_to_overwritten_btree_node)  \
+       x(0,                            lock_fail_root_changed)                 \
+       x(0,                            journal_reclaim_would_deadlock)         \
+       x(0,                            fsck)                                   \
+       x(BCH_ERR_fsck,                 fsck_fix)                               \
+       x(BCH_ERR_fsck,                 fsck_ignore)                            \
+       x(BCH_ERR_fsck,                 fsck_errors_not_fixed)                  \
+       x(BCH_ERR_fsck,                 fsck_repair_unimplemented)              \
+       x(BCH_ERR_fsck,                 fsck_repair_impossible)                 \
+       x(0,                            need_snapshot_cleanup)                  \
+       x(0,                            need_topology_repair)
+
+enum bch_errcode {
+       BCH_ERR_START           = 2048,
+#define x(class, err) BCH_ERR_##err,
+       BCH_ERRCODES()
+#undef x
+       BCH_ERR_MAX
 };
 
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+       return err && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class)                 \
+({                                                     \
+       BUILD_BUG_ON(!__builtin_constant_p(_class));    \
+       _bch2_err_matches(_err, _class);                \
+})
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+       return err < 0 ? __bch2_err_class(err) : err;
+}
+
 #endif /* _BCACHFES_ERRCODE_H */
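
With that hierarchy, callers can test for a whole class of errors and convert private codes back to their root errno before handing them to code outside bcachefs. A hedged usage sketch (the helper, label and variable names are illustrative only, not taken from this patch):

	int ret = some_btree_op(trans);		/* hypothetical helper returning -BCH_ERR_* */

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;			/* matches any transaction_restart_* descendant */

	if (ret)
		bch_err(c, "operation failed: %s", bch2_err_str(ret));

	return bch2_err_class(ret);		/* e.g. -BCH_ERR_ENOSPC_bucket_alloc becomes -ENOSPC */
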
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 8279a9ba76a5c5e91524512d31e966e566cb240e..2fb5102ee31d16da84e83a37b32a0a0872254284 100644 (file)
@@ -68,103 +68,138 @@ void bch2_io_error(struct bch_dev *ca)
 #include "tools-util.h"
 #endif
 
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
-                               const char *fmt, ...)
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
 {
-       struct fsck_err_state *s = NULL;
-       va_list args;
-       bool fix = false, print = true, suppressing = false;
-       char _buf[sizeof(s->buf)], *buf = _buf;
-
-       if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
-               va_start(args, fmt);
-               vprintk(fmt, args);
-               va_end(args);
-
-               if (c->opts.errors == BCH_ON_ERROR_continue) {
-                       bch_err(c, "fixing");
-                       return FSCK_ERR_FIX;
-               } else {
-                       bch2_inconsistent_error(c);
-                       return FSCK_ERR_EXIT;
-               }
-       }
+       struct fsck_err_state *s;
 
-       mutex_lock(&c->fsck_error_lock);
+       if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+               return NULL;
 
        list_for_each_entry(s, &c->fsck_errors, list)
-               if (s->fmt == fmt)
-                       goto found;
+               if (s->fmt == fmt) {
+                       /*
+                        * move it to the head of the list: repeated fsck errors
+                        * are common
+                        */
+                       list_move(&s->list, &c->fsck_errors);
+                       return s;
+               }
 
        s = kzalloc(sizeof(*s), GFP_NOFS);
        if (!s) {
                if (!c->fsck_alloc_err)
                        bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
                c->fsck_alloc_err = true;
-               buf = _buf;
-               goto print;
+               return NULL;
        }
 
        INIT_LIST_HEAD(&s->list);
        s->fmt = fmt;
-found:
-       list_move(&s->list, &c->fsck_errors);
-       s->nr++;
-       if (c->opts.ratelimit_errors &&
-           !(flags & FSCK_NO_RATELIMIT) &&
-           s->nr >= FSCK_ERR_RATELIMIT_NR) {
-               if (s->nr == FSCK_ERR_RATELIMIT_NR)
-                       suppressing = true;
-               else
-                       print = false;
+       s->buf = PRINTBUF;
+       list_add(&s->list, &c->fsck_errors);
+       return s;
+}
+
+int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+{
+       struct fsck_err_state *s = NULL;
+       va_list args;
+       bool print = true, suppressing = false, inconsistent = false;
+       struct printbuf buf = PRINTBUF, *out = &buf;
+       int ret = -BCH_ERR_fsck_ignore;
+
+       mutex_lock(&c->fsck_error_lock);
+       s = fsck_err_get(c, fmt);
+       if (s) {
+               if (c->opts.ratelimit_errors &&
+                   !(flags & FSCK_NO_RATELIMIT) &&
+                   s->nr >= FSCK_ERR_RATELIMIT_NR) {
+                       if (s->nr == FSCK_ERR_RATELIMIT_NR)
+                               suppressing = true;
+                       else
+                               print = false;
+               }
+
+               printbuf_reset(&s->buf);
+               out = &s->buf;
+               s->nr++;
        }
-       buf             = s->buf;
-print:
+
+       if (!strncmp(fmt, "bcachefs:", 9))
+               prt_printf(out, bch2_log_msg(c, ""));
+
        va_start(args, fmt);
-       vscnprintf(buf, sizeof(_buf), fmt, args);
+       prt_vprintf(out, fmt, args);
        va_end(args);
 
-       if (c->opts.fix_errors == FSCK_OPT_EXIT) {
-               bch_err(c, "%s, exiting", buf);
+       if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+               if (c->opts.errors != BCH_ON_ERROR_continue ||
+                   !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+                       prt_str(out, ", shutting down");
+                       inconsistent = true;
+                       ret = -BCH_ERR_fsck_errors_not_fixed;
+               } else if (flags & FSCK_CAN_FIX) {
+                       prt_str(out, ", fixing");
+                       ret = -BCH_ERR_fsck_fix;
+               } else {
+                       prt_str(out, ", continuing");
+                       ret = -BCH_ERR_fsck_ignore;
+               }
+       } else if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+               prt_str(out, ", exiting");
+               ret = -BCH_ERR_fsck_errors_not_fixed;
        } else if (flags & FSCK_CAN_FIX) {
                if (c->opts.fix_errors == FSCK_OPT_ASK) {
-                       printk(KERN_ERR "%s: fix?", buf);
-                       fix = ask_yn();
+                       prt_str(out, ": fix?");
+                       bch2_print_string_as_lines(KERN_ERR, out->buf);
+                       print = false;
+                       ret = ask_yn()
+                               ? -BCH_ERR_fsck_fix
+                               : -BCH_ERR_fsck_ignore;
                } else if (c->opts.fix_errors == FSCK_OPT_YES ||
                           (c->opts.nochanges &&
                            !(flags & FSCK_CAN_IGNORE))) {
-                       if (print)
-                               bch_err(c, "%s, fixing", buf);
-                       fix = true;
+                       prt_str(out, ", fixing");
+                       ret = -BCH_ERR_fsck_fix;
                } else {
-                       if (print)
-                               bch_err(c, "%s, not fixing", buf);
-                       fix = false;
+                       prt_str(out, ", not fixing");
                }
        } else if (flags & FSCK_NEED_FSCK) {
-               if (print)
-                       bch_err(c, "%s (run fsck to correct)", buf);
+               prt_str(out, " (run fsck to correct)");
        } else {
-               if (print)
-                       bch_err(c, "%s (repair unimplemented)", buf);
+               prt_str(out, " (repair unimplemented)");
        }
 
-       if (suppressing)
+       if (ret == -BCH_ERR_fsck_ignore &&
+           (c->opts.fix_errors == FSCK_OPT_EXIT ||
+            !(flags & FSCK_CAN_IGNORE)))
+               ret = -BCH_ERR_fsck_errors_not_fixed;
+
+       if (print)
+               bch2_print_string_as_lines(KERN_ERR, out->buf);
+
+       if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+           (ret != -BCH_ERR_fsck_fix &&
+            ret != -BCH_ERR_fsck_ignore))
+               bch_err(c, "Unable to continue, halting");
+       else if (suppressing)
                bch_err(c, "Ratelimiting new instances of previous error");
 
        mutex_unlock(&c->fsck_error_lock);
 
-       if (fix) {
+       printbuf_exit(&buf);
+
+       if (inconsistent)
+               bch2_inconsistent_error(c);
+
+       if (ret == -BCH_ERR_fsck_fix) {
                set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
-               return FSCK_ERR_FIX;
        } else {
                set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
                set_bit(BCH_FS_ERROR, &c->flags);
-               return c->opts.fix_errors == FSCK_OPT_EXIT ||
-                       !(flags & FSCK_CAN_IGNORE)
-                       ? FSCK_ERR_EXIT
-                       : FSCK_ERR_IGNORE;
        }
+
+       return ret;
 }
 
 void bch2_flush_fsck_errs(struct bch_fs *c)
@@ -175,9 +210,10 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
 
        list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
                if (s->ratelimited)
-                       bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->buf);
+                       bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->buf.buf);
 
                list_del(&s->list);
+               printbuf_exit(&s->buf);
                kfree(s);
        }
 
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 986938298adc4d4e8e5e35c2a7555e2f338741c6..bbf9b6d85b4dcf55d794afd88ed52d228e40eba8 100644 (file)
@@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *);
 
 #define bch2_fs_inconsistent_on(cond, c, ...)                          \
 ({                                                                     \
-       int _ret = !!(cond);                                            \
+       bool _ret = unlikely(!!(cond));                                 \
                                                                        \
        if (_ret)                                                       \
                bch2_fs_inconsistent(c, __VA_ARGS__);                   \
@@ -59,26 +59,38 @@ do {                                                                        \
 
 #define bch2_dev_inconsistent_on(cond, ca, ...)                                \
 ({                                                                     \
-       int _ret = !!(cond);                                            \
+       bool _ret = unlikely(!!(cond));                                 \
                                                                        \
        if (_ret)                                                       \
                bch2_dev_inconsistent(ca, __VA_ARGS__);                 \
        _ret;                                                           \
 })
 
+/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...)                            \
+({                                                                     \
+       bch_err(trans->c, __VA_ARGS__);                                 \
+       bch2_inconsistent_error(trans->c);                              \
+       bch2_dump_trans_updates(trans);                                 \
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...)                   \
+({                                                                     \
+       bool _ret = unlikely(!!(cond));                                 \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_trans_inconsistent(trans, __VA_ARGS__);            \
+       _ret;                                                           \
+})
+
 /*
  * Fsck errors: inconsistency errors we detect at mount time, and should ideally
  * be able to repair:
  */
 
-enum {
-       BCH_FSCK_OK                     = 0,
-       BCH_FSCK_ERRORS_NOT_FIXED       = 1,
-       BCH_FSCK_REPAIR_UNIMPLEMENTED   = 2,
-       BCH_FSCK_REPAIR_IMPOSSIBLE      = 3,
-       BCH_FSCK_UNKNOWN_VERSION        = 4,
-};
-
 enum fsck_err_opts {
        FSCK_OPT_EXIT,
        FSCK_OPT_YES,
@@ -86,19 +98,12 @@ enum fsck_err_opts {
        FSCK_OPT_ASK,
 };
 
-enum fsck_err_ret {
-       FSCK_ERR_IGNORE = 0,
-       FSCK_ERR_FIX    = 1,
-       FSCK_ERR_EXIT   = 2,
-       FSCK_ERR_START_TOPOLOGY_REPAIR = 3,
-};
-
 struct fsck_err_state {
        struct list_head        list;
        const char              *fmt;
        u64                     nr;
        bool                    ratelimited;
-       char                    buf[512];
+       struct printbuf         buf;
 };
 
 #define FSCK_CAN_FIX           (1 << 0)
@@ -107,21 +112,20 @@ struct fsck_err_state {
 #define FSCK_NO_RATELIMIT      (1 << 3)
 
 __printf(3, 4) __cold
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
-                               unsigned, const char *, ...);
+int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
 void bch2_flush_fsck_errs(struct bch_fs *);
 
 #define __fsck_err(c, _flags, msg, ...)                                        \
 ({                                                                     \
-       int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
+       int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);        \
                                                                        \
-       if (_fix == FSCK_ERR_EXIT) {                                    \
-               bch_err(c, "Unable to continue, halting");              \
-               ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
+       if (_ret != -BCH_ERR_fsck_fix &&                                \
+           _ret != -BCH_ERR_fsck_ignore) {                             \
+               ret = _ret;                                             \
                goto fsck_err;                                          \
        }                                                               \
                                                                        \
-       _fix;                                                           \
+       _ret == -BCH_ERR_fsck_fix;                                      \
 })
 
 /* These macros return true if error should be fixed: */
@@ -129,7 +133,7 @@ void bch2_flush_fsck_errs(struct bch_fs *);
 /* XXX: mark in superblock that filesystem contains errors, if we ignore: */
 
 #define __fsck_err_on(cond, c, _flags, ...)                            \
-       ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+       (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
 
 #define need_fsck_err_on(cond, c, ...)                                 \
        __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
@@ -164,7 +168,7 @@ do {                                                                        \
 
 #define bch2_fs_fatal_err_on(cond, c, ...)                             \
 ({                                                                     \
-       int _ret = !!(cond);                                            \
+       bool _ret = unlikely(!!(cond));                                 \
                                                                        \
        if (_ret)                                                       \
                bch2_fs_fatal_error(c, __VA_ARGS__);                    \
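
With the new return codes, __fsck_err() evaluates to true exactly when the caller should repair the inconsistency, and anything other than fix/ignore sets ret and jumps to the caller's fsck_err label. A minimal sketch of the calling convention these macros assume (the check and the repair step are placeholders):

	int ret = 0;

	if (__fsck_err_on(key_looks_wrong(k), c,
			  FSCK_CAN_FIX|FSCK_CAN_IGNORE,
			  "describe the inconsistency here")) {
		/* reached only for -BCH_ERR_fsck_fix: repair the key */
	}
	/* -BCH_ERR_fsck_ignore falls through without repairing */
fsck_err:
	return ret;	/* other codes (e.g. -BCH_ERR_fsck_errors_not_fixed) land here */
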
diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c
index 58b2c96f450c9ba8a4787431665e5e42b7b9833f..2fd5d9672a44287b42a10acfcdc81c5865830c07 100644 (file)
@@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       unsigned ret = 0;
+       unsigned ret = 0, lru = 0;
 
        bkey_extent_entry_for_each(ptrs, entry) {
                switch (__extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
+                       /* Might also be updating LRU btree */
+                       if (entry->ptr.cached)
+                               lru++;
+
+                       fallthrough;
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        ret++;
                }
        }
 
-       return ret;
+       /*
+        * Updating keys in the alloc btree may also update keys in the
+        * freespace or discard btrees:
+        */
+       return lru + ret * 2;
 }
 
 static int count_iters_for_insert(struct btree_trans *trans,
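
As a worked example of the reservation above (numbers chosen purely for illustration): a key with two dirty pointers, one cached pointer and one stripe pointer gives ret = 4 extent entries and lru = 1, so the function returns lru + ret * 2 = 1 + 4 * 2 = 9 — two iterators per entry for the alloc key and a possible freespace/discard key, plus one for the cached pointer's LRU update.
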
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 44c584e9adaa8691a6335e10b70dab66e9a2bd0d..9e2a4ed48b42bedf70f373c564452d3a66c3b4f8 100644 (file)
@@ -26,6 +26,8 @@
 
 #include <trace/events/bcachefs.h>
 
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+
 static unsigned bch2_crc_field_size_max[] = {
        [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
        [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -156,12 +158,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 
 /* KEY_TYPE_btree_ptr: */
 
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          int rw, struct printbuf *err)
 {
-       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
-               return "value too big";
+       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
+               prt_printf(err, "value too big (%zu > %u)",
+                      bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+               return -EINVAL;
+       }
 
-       return bch2_bkey_ptrs_invalid(c, k);
+       return bch2_bkey_ptrs_invalid(c, k, rw, err);
 }
 
 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -170,35 +176,45 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                             int rw, struct printbuf *err)
 {
        struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-       if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
-               return "value too small";
+       if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) {
+               prt_printf(err, "value too small (%zu <= %zu)",
+                      bkey_val_bytes(k.k), sizeof(*bp.v));
+               return -EINVAL;
+       }
 
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
-               return "value too big";
+       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
+               prt_printf(err, "value too big (%zu > %zu)",
+                      bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+               return -EINVAL;
+       }
 
        if (c->sb.version < bcachefs_metadata_version_snapshot &&
-           bp.v->min_key.snapshot)
-               return "invalid min_key.snapshot";
+           bp.v->min_key.snapshot) {
+               prt_printf(err, "invalid min_key.snapshot (%u != 0)",
+                      bp.v->min_key.snapshot);
+               return -EINVAL;
+       }
 
-       return bch2_bkey_ptrs_invalid(c, k);
+       return bch2_bkey_ptrs_invalid(c, k, rw, err);
 }
 
 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
+                              struct bkey_s_c k)
 {
        struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-       pr_buf(out, "seq %llx written %u min_key %s",
+       prt_printf(out, "seq %llx written %u min_key %s",
               le64_to_cpu(bp.v->seq),
               le16_to_cpu(bp.v->sectors_written),
               BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
 
        bch2_bpos_to_text(out, bp.v->min_key);
-       pr_buf(out, " ");
+       prt_printf(out, " ");
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
@@ -220,17 +236,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
 
 /* KEY_TYPE_extent: */
 
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
-       return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
-                        struct bkey_s_c k)
-{
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
 bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 {
        struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
@@ -287,7 +292,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
                    lp.crc.uncompressed_size) {
                        /* can use left extent's crc entry */
-               } else if (lp.crc.live_size <= rp.crc.offset ) {
+               } else if (lp.crc.live_size <= rp.crc.offset) {
                        /* can use right extent's crc entry */
                } else {
                        /* check if checksums can be merged: */
@@ -305,8 +310,20 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                            lp.crc.uncompressed_size +
                            rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
                                return false;
+               }
+
+               en_l = extent_entry_next(en_l);
+               en_r = extent_entry_next(en_r);
+       }
 
-                       if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
+       en_l = l_ptrs.start;
+       en_r = r_ptrs.start;
+       while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+               if (extent_entry_is_crc(en_l)) {
+                       struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+                       struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+                       if (crc_l.uncompressed_size + crc_r.uncompressed_size >
                            bch2_crc_field_size_max[extent_entry_type(en_l)])
                                return false;
                }
@@ -334,7 +351,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                        if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
                            crc_l.uncompressed_size) {
                                /* can use left extent's crc entry */
-                       } else if (crc_l.live_size <= crc_r.offset ) {
+                       } else if (crc_l.live_size <= crc_r.offset) {
                                /* can use right extent's crc entry */
                                crc_r.offset -= crc_l.live_size;
                                bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
@@ -363,17 +380,24 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 
 /* KEY_TYPE_reservation: */
 
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                            int rw, struct printbuf *err)
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
-               return "incorrect value size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(*r.v));
+               return -EINVAL;
+       }
 
-       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
-               return "invalid nr_replicas";
+       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
+               prt_printf(err, "invalid nr_replicas (%u)",
+                      r.v->nr_replicas);
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -381,7 +405,7 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
 
-       pr_buf(out, "generation %u replicas %u",
+       prt_printf(out, "generation %u replicas %u",
               le32_to_cpu(r.v->generation),
               r.v->nr_replicas);
 }
@@ -666,37 +690,6 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
        return durability;
 }
 
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
-                                   unsigned target,
-                                   unsigned nr_desired_replicas)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
-       if (target && extra > 0)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int n = bch2_extent_ptr_durability(c, p);
-
-                       if (n && n <= extra &&
-                           !bch2_dev_in_target(c, p.ptr.dev, target)) {
-                               entry->ptr.cached = true;
-                               extra -= n;
-                       }
-               }
-
-       if (extra > 0)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int n = bch2_extent_ptr_durability(c, p);
-
-                       if (n && n <= extra) {
-                               entry->ptr.cached = true;
-                               extra -= n;
-                       }
-               }
-}
-
 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
 {
        union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
@@ -800,8 +793,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
-                                            struct bch_extent_ptr *ptr)
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+                                          struct bch_extent_ptr *ptr)
 {
        struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
        union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -873,6 +866,14 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
        bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
 }
 
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+       struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev);
+
+       if (ptr)
+               __bch2_bkey_drop_ptr(k, ptr);
+}
+
 const struct bch_extent_ptr *
 bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
 {
@@ -917,6 +918,44 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
        return false;
 }
 
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+       struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+       struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+       const union bch_extent_entry *entry1, *entry2;
+       struct extent_ptr_decoded p1, p2;
+
+       bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+               bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+                       if (p1.ptr.dev          == p2.ptr.dev &&
+                           p1.ptr.gen          == p2.ptr.gen &&
+                           (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+                           (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+                               return true;
+
+       return false;
+}
+
+bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
+                        struct bkey_s_c k2)
+{
+       struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+       const union bch_extent_entry *entry2;
+       struct extent_ptr_decoded p2;
+
+       bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+               if (p1.ptr.dev          == p2.ptr.dev &&
+                   p1.ptr.gen          == p2.ptr.gen &&
+                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+                       return true;
+
+       return false;
+}
+
 /*
  * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
  *
@@ -949,27 +988,37 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 
        bkey_extent_entry_for_each(ptrs, entry) {
                if (!first)
-                       pr_buf(out, " ");
+                       prt_printf(out, " ");
 
                switch (__extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
                        ptr = entry_to_ptr(entry);
-                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                       ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
                                ? bch_dev_bkey_exists(c, ptr->dev)
                                : NULL;
 
-                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
-                              (u64) ptr->offset, ptr->gen,
-                              ptr->cached ? " cached" : "",
-                              ca && ptr_stale(ca, ptr)
-                              ? " stale" : "");
+                       if (!ca) {
+                               prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+                                      (u64) ptr->offset, ptr->gen,
+                                      ptr->cached ? " cached" : "");
+                       } else {
+                               u32 offset;
+                               u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+                               prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev,
+                                      b, offset, ptr->gen,
+                                      ptr->cached ? " cached" : "");
+
+                               if (ca && ptr_stale(ca, ptr))
+                                       prt_printf(out, " stale");
+                       }
                        break;
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
                case BCH_EXTENT_ENTRY_crc128:
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+                       prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
                               crc.compressed_size,
                               crc.uncompressed_size,
                               crc.offset, crc.nonce,
@@ -979,11 +1028,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        ec = &entry->stripe_ptr;
 
-                       pr_buf(out, "ec: idx %llu block %u",
+                       prt_printf(out, "ec: idx %llu block %u",
                               (u64) ec->idx, ec->block);
                        break;
                default:
-                       pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+                       prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
                        return;
                }
 
@@ -991,69 +1040,88 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-static const char *extent_ptr_invalid(const struct bch_fs *c,
-                                     struct bkey_s_c k,
-                                     const struct bch_extent_ptr *ptr,
-                                     unsigned size_ondisk,
-                                     bool metadata)
+static int extent_ptr_invalid(const struct bch_fs *c,
+                             struct bkey_s_c k,
+                             const struct bch_extent_ptr *ptr,
+                             unsigned size_ondisk,
+                             bool metadata,
+                             struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr2;
+       u64 bucket;
+       u32 bucket_offset;
        struct bch_dev *ca;
 
-       if (!bch2_dev_exists2(c, ptr->dev))
-               return "pointer to invalid device";
+       if (!bch2_dev_exists2(c, ptr->dev)) {
+               prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
+               return -EINVAL;
+       }
 
        ca = bch_dev_bkey_exists(c, ptr->dev);
-       if (!ca)
-               return "pointer to invalid device";
-
        bkey_for_each_ptr(ptrs, ptr2)
-               if (ptr != ptr2 && ptr->dev == ptr2->dev)
-                       return "multiple pointers to same device";
+               if (ptr != ptr2 && ptr->dev == ptr2->dev) {
+                       prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
+                       return -EINVAL;
+               }
 
-       if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
-               return "offset past end of device";
+       bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
 
-       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
-               return "offset before first bucket";
+       if (bucket >= ca->mi.nbuckets) {
+               prt_printf(err, "pointer past last bucket (%llu > %llu)",
+                      bucket, ca->mi.nbuckets);
+               return -EINVAL;
+       }
 
-       if (bucket_remainder(ca, ptr->offset) +
-           size_ondisk > ca->mi.bucket_size)
-               return "spans multiple buckets";
+       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
+               prt_printf(err, "pointer before first bucket (%llu < %u)",
+                      bucket, ca->mi.first_bucket);
+               return -EINVAL;
+       }
 
-       return NULL;
+       if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
+               prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+                      bucket_offset, size_ondisk, ca->mi.bucket_size);
+               return -EINVAL;
+       }
+
+       return 0;
 }
 
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          int rw, struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_devs_list devs;
        const union bch_extent_entry *entry;
        struct bch_extent_crc_unpacked crc;
        unsigned size_ondisk = k.k->size;
-       const char *reason;
        unsigned nonce = UINT_MAX;
-       unsigned i;
+       unsigned nr_ptrs = 0;
+       int ret;
 
-       if (k.k->type == KEY_TYPE_btree_ptr ||
-           k.k->type == KEY_TYPE_btree_ptr_v2)
+       if (bkey_is_btree_ptr(k.k))
                size_ondisk = btree_sectors(c);
 
        bkey_extent_entry_for_each(ptrs, entry) {
-               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-                       return "invalid extent entry type";
+               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
+                       prt_printf(err, "invalid extent entry type (got %u, max %u)",
+                              __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+                       return -EINVAL;
+               }
 
-               if (k.k->type == KEY_TYPE_btree_ptr &&
-                   !extent_entry_is_ptr(entry))
-                       return "has non ptr field";
+               if (bkey_is_btree_ptr(k.k) &&
+                   !extent_entry_is_ptr(entry)) {
+                       prt_printf(err, "has non ptr field");
+                       return -EINVAL;
+               }
 
                switch (extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
-                       reason = extent_ptr_invalid(c, k, &entry->ptr,
-                                                   size_ondisk, false);
-                       if (reason)
-                               return reason;
+                       ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk,
+                                                false, err);
+                       if (ret)
+                               return ret;
+                       nr_ptrs++;
                        break;
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
@@ -1061,22 +1129,30 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
                        if (crc.offset + crc.live_size >
-                           crc.uncompressed_size)
-                               return "checksum offset + key size > uncompressed size";
+                           crc.uncompressed_size) {
+                               prt_printf(err, "checksum offset + key size > uncompressed size");
+                               return -EINVAL;
+                       }
 
                        size_ondisk = crc.compressed_size;
 
-                       if (!bch2_checksum_type_valid(c, crc.csum_type))
-                               return "invalid checksum type";
+                       if (!bch2_checksum_type_valid(c, crc.csum_type)) {
+                               prt_printf(err, "invalid checksum type");
+                               return -EINVAL;
+                       }
 
-                       if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
-                               return "invalid compression type";
+                       if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
+                               prt_printf(err, "invalid compression type");
+                               return -EINVAL;
+                       }
 
                        if (bch2_csum_type_is_encryption(crc.csum_type)) {
                                if (nonce == UINT_MAX)
                                        nonce = crc.offset + crc.nonce;
-                               else if (nonce != crc.offset + crc.nonce)
-                                       return "incorrect nonce";
+                               else if (nonce != crc.offset + crc.nonce) {
+                                       prt_printf(err, "incorrect nonce");
+                                       return -EINVAL;
+                               }
                        }
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
@@ -1084,13 +1160,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
                }
        }
 
-       devs = bch2_bkey_devs(k);
-       bubble_sort(devs.devs, devs.nr, u8_cmp);
-       for (i = 0; i + 1 < devs.nr; i++)
-               if (devs.devs[i] == devs.devs[i + 1])
-                       return "multiple ptrs to same device";
+       if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
+               prt_str(err, "too many ptrs");
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_ptr_swab(struct bkey_s k)
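
Throughout this patch the .key_invalid hooks switch from returning an error string to returning an int and appending the reason to a caller-supplied printbuf. A minimal sketch of the new calling convention, using bch2_btree_ptr_invalid() as declared in extents.h below; the surrounding caller is illustrative, and rw is assumed to take the kernel's READ/WRITE values:

	struct printbuf err = PRINTBUF;
	int ret = bch2_btree_ptr_invalid(c, k, READ, &err);

	if (ret)
		bch_err(c, "invalid bkey: %s", err.buf);
	printbuf_exit(&err);
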
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 9c2567274a2b8d286707d6b1b3594b3d04007ac3..3c17b81130bbfbdba9130dc3c224597115e4cf4d 100644 (file)
@@ -367,13 +367,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
 
 /* KEY_TYPE_btree_ptr: */
 
-const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
 void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
-                           struct bkey_s_c);
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
                              int, struct bkey_s);
 
@@ -381,6 +380,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
        .key_invalid    = bch2_btree_ptr_invalid,               \
        .val_to_text    = bch2_btree_ptr_to_text,               \
        .swab           = bch2_ptr_swab,                        \
+       .trans_trigger  = bch2_trans_mark_extent,               \
+       .atomic_trigger = bch2_mark_extent,                     \
 }
 
 #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) {         \
@@ -388,25 +389,28 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
        .val_to_text    = bch2_btree_ptr_v2_to_text,            \
        .swab           = bch2_ptr_swab,                        \
        .compat         = bch2_btree_ptr_v2_compat,             \
+       .trans_trigger  = bch2_trans_mark_extent,               \
+       .atomic_trigger = bch2_mark_extent,                     \
 }
 
 /* KEY_TYPE_extent: */
 
-const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 
 #define bch2_bkey_ops_extent (struct bkey_ops) {               \
-       .key_invalid    = bch2_extent_invalid,                  \
-       .val_to_text    = bch2_extent_to_text,                  \
+       .key_invalid    = bch2_bkey_ptrs_invalid,               \
+       .val_to_text    = bch2_bkey_ptrs_to_text,               \
        .swab           = bch2_ptr_swab,                        \
        .key_normalize  = bch2_extent_normalize,                \
        .key_merge      = bch2_extent_merge,                    \
+       .trans_trigger  = bch2_trans_mark_extent,               \
+       .atomic_trigger = bch2_mark_extent,                     \
 }
 
 /* KEY_TYPE_reservation: */
 
-const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+                            int, struct printbuf *);
 void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 
@@ -414,6 +418,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
        .key_invalid    = bch2_reservation_invalid,             \
        .val_to_text    = bch2_reservation_to_text,             \
        .key_merge      = bch2_reservation_merge,               \
+       .trans_trigger  = bch2_trans_mark_reservation,          \
+       .atomic_trigger = bch2_mark_reservation,                \
 }
 
 /* Extent checksum entries: */
@@ -571,15 +577,10 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
 
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
-                                   unsigned, unsigned);
-
 void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
 void bch2_extent_ptr_decoded_append(struct bkey_i *,
                                    struct extent_ptr_decoded *);
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
-                                            struct bch_extent_ptr *);
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
                                           struct bch_extent_ptr *);
 
@@ -601,16 +602,20 @@ do {                                                                      \
 } while (0)
 
 void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
 const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
 bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
 
 bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
                           struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+                          int, struct printbuf *);
 
 void bch2_ptr_swab(struct bkey_s);
 
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index d543480be111796702d9f5c5f2ced168f4815e16..1f2e1fc4f6b22292e4932b9f82a45767460647a4 100644 (file)
@@ -204,12 +204,19 @@ int bch2_link_trans(struct btree_trans *trans,
                goto err;
 
        inode_u->bi_ctime = now;
-       bch2_inode_nlink_inc(inode_u);
+       ret = bch2_inode_nlink_inc(inode_u);
+       if (ret)
+               return ret;
 
        ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
+       if (bch2_reinherit_attrs(inode_u, dir_u)) {
+               ret = -EXDEV;
+               goto err;
+       }
+
        dir_u->bi_mtime = dir_u->bi_ctime = now;
 
        dir_hash = bch2_hash_info_init(c, dir_u);
@@ -297,7 +304,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
                if (ret)
                        goto err;
        } else {
-               bch2_inode_nlink_dec(inode_u);
+               bch2_inode_nlink_dec(trans, inode_u);
        }
 
        if (inode_u->bi_dir             == dirent_iter.pos.inode &&
@@ -462,7 +469,7 @@ int bch2_rename_trans(struct btree_trans *trans,
        }
 
        if (mode == BCH_RENAME_OVERWRITE)
-               bch2_inode_nlink_dec(dst_inode_u);
+               bch2_inode_nlink_dec(trans, dst_inode_u);
 
        src_dir_u->bi_mtime             = now;
        src_dir_u->bi_ctime             = now;
@@ -480,11 +487,11 @@ int bch2_rename_trans(struct btree_trans *trans,
        ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
                (src_dir.inum != dst_dir.inum
                 ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
-                : 0 ) ?:
+                : 0) ?:
                bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
                (dst_inum.inum
                 ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
-                : 0 );
+                : 0);
 err:
        bch2_trans_iter_exit(trans, &dst_inode_iter);
        bch2_trans_iter_exit(trans, &src_inode_iter);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1d0871f63e4e71402874373a982262d74b00c24c..706180b97a7711138bfb9af59952c748fad6c7a3 100644 (file)
 #include <trace/events/bcachefs.h>
 #include <trace/events/writeback.h>
 
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+       if (bio->bi_vcnt >= bio->bi_max_vecs)
+               return true;
+       if (bio->bi_iter.bi_size > UINT_MAX - len)
+               return true;
+       return false;
+}
+
 static inline struct address_space *faults_disabled_mapping(void)
 {
        return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
@@ -142,7 +151,7 @@ static void bch2_quota_reservation_put(struct bch_fs *c,
 static int bch2_quota_reservation_add(struct bch_fs *c,
                                      struct bch_inode_info *inode,
                                      struct quota_res *res,
-                                     unsigned sectors,
+                                     u64 sectors,
                                      bool check_enospc)
 {
        int ret;
@@ -223,7 +232,10 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
                return;
 
        mutex_lock(&inode->ei_quota_lock);
-       BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+       bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+                               "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+                               inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+                               inode->ei_inode.bi_sectors);
        inode->v.i_blocks += sectors;
 
 #ifdef CONFIG_BCACHEFS_QUOTA
@@ -397,7 +409,7 @@ retry:
        offset = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        bch2_trans_exit(&trans);
 
@@ -422,22 +434,20 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode,
 {
        pgoff_t index = start >> PAGE_SECTORS_SHIFT;
        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct pagevec pvec;
+       struct folio_batch fbatch;
+       unsigned i, j;
 
        if (end <= start)
                return;
 
-       pagevec_init(&pvec);
+       folio_batch_init(&fbatch);
 
-       do {
-               unsigned nr_pages, i, j;
-
-               nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
-                                               &index, end_index);
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-                       u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
-                       u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+       while (filemap_get_folios(inode->v.i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+                       u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
+                       u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
                        unsigned pg_offset = max(start, pg_start) - pg_start;
                        unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
                        struct bch_page_state *s;
@@ -446,8 +456,8 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode,
                        BUG_ON(pg_offset >= PAGE_SECTORS);
                        BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
 
-                       lock_page(page);
-                       s = bch2_page_state(page);
+                       folio_lock(folio);
+                       s = bch2_page_state(&folio->page);
 
                        if (s) {
                                spin_lock(&s->lock);
@@ -456,10 +466,11 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode,
                                spin_unlock(&s->lock);
                        }
 
-                       unlock_page(page);
+                       folio_unlock(folio);
                }
-               pagevec_release(&pvec);
-       } while (index <= end_index);
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
 }
 
 static void mark_pagecache_reserved(struct bch_inode_info *inode,
@@ -468,23 +479,21 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        pgoff_t index = start >> PAGE_SECTORS_SHIFT;
        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct pagevec pvec;
+       struct folio_batch fbatch;
        s64 i_sectors_delta = 0;
+       unsigned i, j;
 
        if (end <= start)
                return;
 
-       pagevec_init(&pvec);
-
-       do {
-               unsigned nr_pages, i, j;
+       folio_batch_init(&fbatch);
 
-               nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
-                                               &index, end_index);
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-                       u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
-                       u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+       while (filemap_get_folios(inode->v.i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+                       u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
+                       u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
                        unsigned pg_offset = max(start, pg_start) - pg_start;
                        unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
                        struct bch_page_state *s;
@@ -493,8 +502,8 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode,
                        BUG_ON(pg_offset >= PAGE_SECTORS);
                        BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
 
-                       lock_page(page);
-                       s = bch2_page_state(page);
+                       folio_lock(folio);
+                       s = bch2_page_state(&folio->page);
 
                        if (s) {
                                spin_lock(&s->lock);
@@ -513,10 +522,11 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode,
                                spin_unlock(&s->lock);
                        }
 
-                       unlock_page(page);
+                       folio_unlock(folio);
                }
-               pagevec_release(&pvec);
-       } while (index <= end_index);
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
 
        i_sectors_acct(c, inode, NULL, i_sectors_delta);
 }
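Both page cache walks above are converted from the old pagevec/pagevec_lookup_range() loop to folio batches. The skeleton they share, shown as a standalone sketch with a hypothetical helper name (not code from this commit):

    /* Hypothetical illustration only -- not part of this patch. */
    static void example_folio_walk(struct address_space *mapping,
                                   pgoff_t index, pgoff_t end_index)
    {
            struct folio_batch fbatch;
            unsigned i;

            folio_batch_init(&fbatch);

            while (filemap_get_folios(mapping, &index, end_index, &fbatch)) {
                    for (i = 0; i < folio_batch_count(&fbatch); i++) {
                            struct folio *folio = fbatch.folios[i];

                            folio_lock(folio);
                            /* ... per-folio work ... */
                            folio_unlock(folio);
                    }
                    folio_batch_release(&fbatch);
                    cond_resched();
            }
    }
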
@@ -596,7 +606,7 @@ static void bch2_page_reservation_put(struct bch_fs *c,
 static int bch2_page_reservation_get(struct bch_fs *c,
                        struct bch_inode_info *inode, struct page *page,
                        struct bch2_page_reservation *res,
-                       unsigned offset, unsigned len, bool check_enospc)
+                       unsigned offset, unsigned len)
 {
        struct bch_page_state *s = bch2_page_state_create(page, 0);
        unsigned i, disk_sectors = 0, quota_sectors = 0;
@@ -616,19 +626,14 @@ static int bch2_page_reservation_get(struct bch_fs *c,
        }
 
        if (disk_sectors) {
-               ret = bch2_disk_reservation_add(c, &res->disk,
-                                               disk_sectors,
-                                               !check_enospc
-                                               ? BCH_DISK_RESERVATION_NOFAIL
-                                               : 0);
+               ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
                if (unlikely(ret))
                        return ret;
        }
 
        if (quota_sectors) {
                ret = bch2_quota_reservation_add(c, inode, &res->quota,
-                                                quota_sectors,
-                                                check_enospc);
+                                                quota_sectors, true);
                if (unlikely(ret)) {
                        struct disk_reservation tmp = {
                                .sectors = disk_sectors
@@ -812,7 +817,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
                }
        }
 
-       if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
+       if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) {
                unlock_page(page);
                ret = VM_FAULT_SIGBUS;
                goto out;
@@ -830,47 +835,22 @@ out:
        return ret;
 }
 
-void bch2_invalidatepage(struct page *page, unsigned int offset,
-                        unsigned int length)
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
 {
-       if (offset || length < PAGE_SIZE)
+       if (offset || length < folio_size(folio))
                return;
 
-       bch2_clear_page_bits(page);
+       bch2_clear_page_bits(&folio->page);
 }
 
-int bch2_releasepage(struct page *page, gfp_t gfp_mask)
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
 {
-       if (PageDirty(page))
-               return 0;
-
-       bch2_clear_page_bits(page);
-       return 1;
-}
-
-#ifdef CONFIG_MIGRATION
-int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
-                     struct page *page, enum migrate_mode mode)
-{
-       int ret;
-
-       EBUG_ON(!PageLocked(page));
-       EBUG_ON(!PageLocked(newpage));
-
-       ret = migrate_page_move_mapping(mapping, newpage, page, 0);
-       if (ret != MIGRATEPAGE_SUCCESS)
-               return ret;
+       if (folio_test_dirty(folio) || folio_test_writeback(folio))
+               return false;
 
-       if (PagePrivate(page))
-               attach_page_private(newpage, detach_page_private(page));
-
-       if (mode != MIGRATE_SYNC_NO_COPY)
-               migrate_page_copy(newpage, page);
-       else
-               migrate_page_states(newpage, page);
-       return MIGRATEPAGE_SUCCESS;
+       bch2_clear_page_bits(&folio->page);
+       return true;
 }
-#endif
 
 /* readpage(s): */
 
@@ -1034,10 +1014,9 @@ retry:
                 * read_extent -> io_time_reset may cause a transaction restart
                 * without returning an error, we need to check for that here:
                 */
-               if (!bch2_trans_relock(trans)) {
-                       ret = -EINTR;
+               ret = bch2_trans_relock(trans);
+               if (ret)
                        break;
-               }
 
                bch2_btree_iter_set_pos(&iter,
                                POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -1090,7 +1069,7 @@ retry:
 err:
        bch2_trans_iter_exit(trans, &iter);
 
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        if (ret) {
@@ -1127,12 +1106,12 @@ void bch2_readahead(struct readahead_control *ractl)
                                   readpages_iter.idx,
                                   BIO_MAX_VECS);
                struct bch_read_bio *rbio =
-                       rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+                                                  GFP_NOFS, &c->bio_read),
                                  opts);
 
                readpages_iter.idx++;
 
-               bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
                rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
                rbio->bio.bi_end_io = bch2_readpages_end_io;
                BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
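The bio allocations in this file move to the newer bio_alloc_bioset() signature, which takes the target block device, the op/flags and the gfp mask up front, so the separate bio_set_op_attrs() call disappears. A one-function sketch of the new call shape; the wrapper is hypothetical and not code from this commit:

    /* Hypothetical illustration only -- not part of this patch. */
    static struct bio *example_alloc_read_bio(struct bch_fs *c, unsigned nr_vecs)
    {
            /* op and gfp flags are now passed at allocation time */
            return bio_alloc_bioset(NULL, nr_vecs, REQ_OP_READ, GFP_NOFS,
                                    &c->bio_read);
    }
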
@@ -1164,20 +1143,6 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
        bch2_trans_exit(&trans);
 }
 
-int bch2_readpage(struct file *file, struct page *page)
-{
-       struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
-       struct bch_read_bio *rbio;
-
-       rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
-       rbio->bio.bi_end_io = bch2_readpages_end_io;
-
-       __bchfs_readpage(c, rbio, inode_inum(inode), page);
-       return 0;
-}
-
 static void bch2_read_single_page_end_io(struct bio *bio)
 {
        complete(bio->bi_private);
@@ -1192,7 +1157,7 @@ static int bch2_read_single_page(struct page *page,
        int ret;
        DECLARE_COMPLETION_ONSTACK(done);
 
-       rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
+       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
                         io_opts(c, &inode->ei_inode));
        rbio->bio.bi_private = &done;
        rbio->bio.bi_end_io = bch2_read_single_page_end_io;
@@ -1210,6 +1175,16 @@ static int bch2_read_single_page(struct page *page,
        return 0;
 }
 
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+       struct page *page = &folio->page;
+       int ret;
+
+       ret = bch2_read_single_page(page, page->mapping);
+       folio_unlock(folio);
+       return bch2_err_class(ret);
+}
+
 /* writepages: */
 
 struct bch_writepage_state {
@@ -1243,8 +1218,6 @@ static void bch2_writepage_io_done(struct closure *cl)
        struct bio_vec *bvec;
        unsigned i;
 
-       up(&io->op.c->io_in_flight);
-
        if (io->op.error) {
                set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
 
@@ -1278,7 +1251,7 @@ static void bch2_writepage_io_done(struct closure *cl)
         * racing with fallocate can cause us to add fewer sectors than
         * expected - but we shouldn't add more sectors than expected:
         */
-       WARN_ON(io->op.i_sectors_delta > 0);
+       WARN_ON_ONCE(io->op.i_sectors_delta > 0);
 
        /*
         * (error (due to going RO) halfway through a page can screw that up
@@ -1307,8 +1280,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
 {
        struct bch_writepage_io *io = w->io;
 
-       down(&io->op.c->io_in_flight);
-
        w->io = NULL;
        closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
        continue_at(&io->cl, bch2_writepage_io_done, NULL);
@@ -1327,7 +1298,9 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
 {
        struct bch_write_op *op;
 
-       w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS,
+       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+                                             REQ_OP_WRITE,
+                                             GFP_NOFS,
                                              &c->writepage_bioset),
                             struct bch_writepage_io, op.wbio.bio);
 
@@ -1464,8 +1437,8 @@ do_io:
                                     sectors << 9, offset << 9));
 
                /* Check for writing past i_size: */
-               WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
-                       round_up(i_size, block_bytes(c)));
+               WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+                            round_up(i_size, block_bytes(c)));
 
                w->io->op.res.sectors += reserved_sectors;
                w->io->op.i_sectors_delta -= dirty_sectors;
@@ -1493,27 +1466,13 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
        if (w.io)
                bch2_writepage_do_io(&w);
        blk_finish_plug(&plug);
-       return ret;
-}
-
-int bch2_writepage(struct page *page, struct writeback_control *wbc)
-{
-       struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
-       struct bch_writepage_state w =
-               bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
-       int ret;
-
-       ret = __bch2_writepage(page, wbc, &w);
-       if (w.io)
-               bch2_writepage_do_io(&w);
-
-       return ret;
+       return bch2_err_class(ret);
 }
 
 /* buffered writes: */
 
 int bch2_write_begin(struct file *file, struct address_space *mapping,
-                    loff_t pos, unsigned len, unsigned flags,
+                    loff_t pos, unsigned len,
                     struct page **pagep, void **fsdata)
 {
        struct bch_inode_info *inode = to_bch_ei(mapping->host);
@@ -1533,7 +1492,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
 
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
 
-       page = grab_cache_page_write_begin(mapping, index, flags);
+       page = grab_cache_page_write_begin(mapping, index);
        if (!page)
                goto err_unlock;
 
@@ -1563,11 +1522,10 @@ out:
        if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
                ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
                if (ret)
-                       goto out;
+                       goto err;
        }
 
-       ret = bch2_page_reservation_get(c, inode, page, res,
-                                       offset, len, true);
+       ret = bch2_page_reservation_get(c, inode, page, res, offset, len);
        if (ret) {
                if (!PageUptodate(page)) {
                        /*
@@ -1592,7 +1550,7 @@ err_unlock:
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
        kfree(res);
        *fsdata = NULL;
-       return ret;
+       return bch2_err_class(ret);
 }
 
 int bch2_write_end(struct file *file, struct address_space *mapping,
@@ -1664,7 +1622,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
        bch2_page_reservation_init(c, inode, &res);
 
        for (i = 0; i < nr_pages; i++) {
-               pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
+               pages[i] = grab_cache_page_write_begin(mapping, index + i);
                if (!pages[i]) {
                        nr_pages = i;
                        if (!i) {
@@ -1709,7 +1667,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                }
 
                ret = bch2_page_reservation_get(c, inode, page, &res,
-                                               pg_offset, pg_len, true);
+                                               pg_offset, pg_len);
                if (ret)
                        goto out;
 
@@ -1726,7 +1684,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                unsigned pg_len = min_t(unsigned, len - copied,
                                        PAGE_SIZE - pg_offset);
                unsigned pg_copied = copy_page_from_iter_atomic(page,
-                                               pg_offset, pg_len,iter);
+                                               pg_offset, pg_len, iter);
 
                if (!pg_copied)
                        break;
@@ -1808,11 +1766,11 @@ again:
                 * to check that the address is actually valid, when atomic
                 * usercopies are used, below.
                 */
-               if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
                        bytes = min_t(unsigned long, iov_iter_count(iter),
                                      PAGE_SIZE - offset);
 
-                       if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
                                ret = -EFAULT;
                                break;
                        }
@@ -1870,7 +1828,7 @@ static void bch2_dio_read_complete(struct closure *cl)
 {
        struct dio_read *dio = container_of(cl, struct dio_read, cl);
 
-       dio->req->ki_complete(dio->req, dio->ret, 0);
+       dio->req->ki_complete(dio->req, dio->ret);
        bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
 }
 
@@ -1918,8 +1876,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
        shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
        iter->count -= shorten;
 
-       bio = bio_alloc_bioset(GFP_KERNEL,
-                              iov_iter_npages(iter, BIO_MAX_VECS),
+       bio = bio_alloc_bioset(NULL,
+                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                              REQ_OP_READ,
+                              GFP_KERNEL,
                               &c->dio_read_bioset);
 
        bio->bi_end_io = bch2_direct_IO_read_endio;
@@ -1953,8 +1913,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 
        goto start;
        while (iter->count) {
-               bio = bio_alloc_bioset(GFP_KERNEL,
-                                      iov_iter_npages(iter, BIO_MAX_VECS),
+               bio = bio_alloc_bioset(NULL,
+                                      bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                                      REQ_OP_READ,
+                                      GFP_KERNEL,
                                       &c->bio_read);
                bio->bi_end_io          = bch2_direct_IO_read_split_endio;
 start:
@@ -2012,7 +1974,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                                        iocb->ki_pos,
                                        iocb->ki_pos + count - 1);
                if (ret < 0)
-                       return ret;
+                       goto out;
 
                file_accessed(file);
 
@@ -2027,8 +1989,8 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                ret = generic_file_read_iter(iocb, iter);
                bch2_pagecache_add_put(&inode->ei_pagecache_lock);
        }
-
-       return ret;
+out:
+       return bch2_err_class(ret);
 }
 
 /* O_DIRECT writes */
@@ -2070,7 +2032,7 @@ retry:
        offset = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (err == -EINTR)
+       if (bch2_err_matches(err, BCH_ERR_transaction_restart))
                goto retry;
        bch2_trans_exit(&trans);
 
@@ -2096,12 +2058,10 @@ static long bch2_dio_write_loop(struct dio_write *dio)
        if (dio->loop)
                goto loop;
 
-       down(&c->io_in_flight);
-
        while (1) {
                iter_count = dio->iter.count;
 
-               if (kthread)
+               if (kthread && dio->mm)
                        kthread_use_mm(dio->mm);
                BUG_ON(current->faults_disabled_mapping);
                current->faults_disabled_mapping = mapping;
@@ -2111,7 +2071,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                dropped_locks = fdm_dropped_locks();
 
                current->faults_disabled_mapping = NULL;
-               if (kthread)
+               if (kthread && dio->mm)
                        kthread_unuse_mm(dio->mm);
 
                /*
@@ -2177,8 +2137,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                        struct iovec *iov = dio->inline_vecs;
 
                        if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
-                               iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
-                                             GFP_KERNEL);
+                               iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+                                                   GFP_KERNEL);
                                if (unlikely(!iov)) {
                                        dio->sync = sync = true;
                                        goto do_io;
@@ -2222,13 +2182,12 @@ loop:
                if (!dio->iter.count)
                        break;
 
-               bio_reset(bio);
+               bio_reset(bio, NULL, REQ_OP_WRITE);
                reinit_completion(&dio->done);
        }
 
        ret = dio->op.error ?: ((long) dio->written << 9);
 err:
-       up(&c->io_in_flight);
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
        bch2_quota_reservation_put(c, inode, &dio->quota_res);
 
@@ -2243,8 +2202,11 @@ err:
        /* inode->i_dio_count is our ref on inode and thus bch_fs */
        inode_dio_end(&inode->v);
 
+       if (ret < 0)
+               ret = bch2_err_class(ret);
+
        if (!sync) {
-               req->ki_complete(req, ret, 0);
+               req->ki_complete(req, ret);
                ret = -EIOCBQUEUED;
        }
        return ret;
@@ -2303,10 +2265,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
                locked = false;
        }
 
-       bio = bio_alloc_bioset(GFP_KERNEL,
-                              iov_iter_is_bvec(iter)
-                              ? 0
-                              : iov_iter_npages(iter, BIO_MAX_VECS),
+       bio = bio_alloc_bioset(NULL,
+                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                              REQ_OP_WRITE,
+                              GFP_KERNEL,
                               &c->dio_write_bioset);
        dio = container_of(bio, struct dio_write, op.wbio.bio);
        init_completion(&dio->done);
@@ -2349,8 +2311,10 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct bch_inode_info *inode = file_bch_inode(file);
        ssize_t ret;
 
-       if (iocb->ki_flags & IOCB_DIRECT)
-               return bch2_direct_write(iocb, from);
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               ret = bch2_direct_write(iocb, from);
+               goto out;
+       }
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(&inode->v);
@@ -2377,8 +2341,8 @@ unlock:
 
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
-
-       return ret;
+out:
+       return bch2_err_class(ret);
 }
 
 /* fsync: */
@@ -2412,7 +2376,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        ret2 = sync_inode_metadata(&inode->v, 1);
        ret3 = bch2_flush_inode(c, inode_inum(inode));
 
-       return ret ?: ret2 ?: ret3;
+       return bch2_err_class(ret ?: ret2 ?: ret3);
 }
 
 /* truncate: */
@@ -2446,7 +2410,7 @@ retry:
        start = iter.pos;
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
@@ -2703,9 +2667,11 @@ int bch2_truncate(struct user_namespace *mnt_userns,
                        U64_MAX, &i_sectors_delta);
        i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-       WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
-               !bch2_journal_error(&c->journal));
-
+       bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+                               !bch2_journal_error(&c->journal), c,
+                               "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+                               inode->v.i_ino, (u64) inode->v.i_blocks,
+                               inode->ei_inode.bi_sectors);
        if (unlikely(ret))
                goto err;
 
@@ -2716,7 +2682,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
        ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
 err:
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       return ret;
+       return bch2_err_class(ret);
 }
 
 /* fallocate: */
@@ -2747,7 +2713,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
 
        truncate_pagecache_range(&inode->v, offset, end - 1);
 
-       if (block_start < block_end ) {
+       if (block_start < block_end) {
                s64 i_sectors_delta = 0;
 
                ret = bch2_fpunch(c, inode_inum(inode),
@@ -2834,7 +2800,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        bch2_trans_copy_iter(&dst, &src);
        bch2_trans_copy_iter(&del, &src);
 
-       while (ret == 0 || ret == -EINTR) {
+       while (ret == 0 ||
+              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
                struct bkey_i delete;
@@ -2902,13 +2869,7 @@ reassemble:
 
                next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
 
-               if (copy.k->k.size == k.k->size) {
-                       /*
-                        * If we're moving the entire extent, we can skip
-                        * running triggers:
-                        */
-                       trigger_flags |= BTREE_TRIGGER_NORUN;
-               } else {
+               if (copy.k->k.size != k.k->size) {
                        /* We might end up splitting compressed extents: */
                        unsigned nr_ptrs =
                                bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
@@ -3042,14 +3003,14 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
                bch2_disk_reservation_put(c, &disk_res);
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        ret = 0;
        }
 
        bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
        mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
 
-       if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+       if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
                struct quota_res quota_res = { 0 };
                s64 i_sectors_delta = 0;
 
@@ -3100,7 +3061,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
         * so that the VFS cache i_size is consistent with the btree i_size:
         */
        if (ret &&
-           !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+           !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
                return ret;
 
        if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
@@ -3128,13 +3089,17 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        long ret;
 
-       if (!percpu_ref_tryget(&c->writes))
+       if (!percpu_ref_tryget_live(&c->writes))
                return -EROFS;
 
        inode_lock(&inode->v);
        inode_dio_wait(&inode->v);
        bch2_pagecache_block_get(&inode->ei_pagecache_lock);
 
+       ret = file_modified(file);
+       if (ret)
+               goto err;
+
        if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
                ret = bchfs_fallocate(inode, mode, offset, len);
        else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
@@ -3145,13 +3110,61 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
                ret = bchfs_fcollapse_finsert(inode, offset, len, false);
        else
                ret = -EOPNOTSUPP;
-
-
+err:
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
        inode_unlock(&inode->v);
        percpu_ref_put(&c->writes);
 
-       return ret;
+       return bch2_err_class(ret);
+}
+
+static int quota_reserve_range(struct bch_inode_info *inode,
+                              struct quota_res *res,
+                              u64 start, u64 end)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u32 snapshot;
+       u64 sectors = end - start;
+       u64 pos = start;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            SPOS(inode->v.i_ino, pos, snapshot), 0);
+
+       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+              (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
+              !(ret = bkey_err(k))) {
+               if (bkey_extent_is_allocation(k.k)) {
+                       u64 s = min(end, k.k->p.offset) -
+                               max(start, bkey_start_offset(k.k));
+                       BUG_ON(s > sectors);
+                       sectors -= s;
+               }
+               bch2_btree_iter_advance(&iter);
+       }
+       pos = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+
+       bch2_trans_exit(&trans);
+
+       if (ret)
+               return ret;
+
+       return bch2_quota_reservation_add(c, inode, res, sectors, true);
 }
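The new quota_reserve_range() above walks the extents btree over the destination range and reserves quota only for sectors that are not already allocated. A condensed sketch of how the reflink path below consumes it, folded into one hypothetical helper (not code from this commit):

    /* Hypothetical illustration only -- not part of this patch. */
    static int example_remap_quota(struct bch_fs *c, struct bch_inode_info *dst,
                                   loff_t pos_dst, u64 aligned_len,
                                   s64 i_sectors_delta)
    {
            struct quota_res quota_res = { 0 };
            int ret;

            /* reserve for the whole destination range before moving extents */
            ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
                                      (pos_dst + aligned_len) >> 9);
            if (ret)
                    return ret;

            /* charge the i_blocks delta against the reservation just taken */
            i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
            bch2_quota_reservation_put(c, dst, &quota_res);
            return 0;
    }
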
 
 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
@@ -3161,6 +3174,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
        struct bch_inode_info *src = file_bch_inode(file_src);
        struct bch_inode_info *dst = file_bch_inode(file_dst);
        struct bch_fs *c = src->v.i_sb->s_fs_info;
+       struct quota_res quota_res = { 0 };
        s64 i_sectors_delta = 0;
        u64 aligned_len;
        loff_t ret = 0;
@@ -3181,8 +3195,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
        bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
 
-       file_update_time(file_dst);
-
        inode_dio_wait(&src->v);
        inode_dio_wait(&dst->v);
 
@@ -3199,6 +3211,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
        if (ret)
                goto err;
 
+       ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+                                 (pos_dst + aligned_len) >> 9);
+       if (ret)
+               goto err;
+
+       file_update_time(file_dst);
+
        mark_pagecache_unallocated(src, pos_src >> 9,
                                   (pos_src + aligned_len) >> 9);
 
@@ -3215,8 +3234,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
         */
        ret = min((u64) ret << 9, (u64) len);
 
-       /* XXX get a quota reservation */
-       i_sectors_acct(c, dst, NULL, i_sectors_delta);
+       i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
 
        spin_lock(&dst->v.i_lock);
        if (pos_dst + ret > dst->v.i_size)
@@ -3227,9 +3245,10 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
            IS_SYNC(file_inode(file_dst)))
                ret = bch2_flush_inode(c, inode_inum(dst));
 err:
+       bch2_quota_reservation_put(c, dst, &quota_res);
        bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
 
-       return ret;
+       return bch2_err_class(ret);
 }
 
 /* fseek: */
@@ -3251,36 +3270,40 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
                                       loff_t start_offset,
                                       loff_t end_offset)
 {
-       struct address_space *mapping = vinode->i_mapping;
-       struct page *page;
+       struct folio_batch fbatch;
        pgoff_t start_index     = start_offset >> PAGE_SHIFT;
        pgoff_t end_index       = end_offset >> PAGE_SHIFT;
        pgoff_t index           = start_index;
+       unsigned i;
        loff_t ret;
        int offset;
 
-       while (index <= end_index) {
-               if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
-                       lock_page(page);
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(vinode->i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
 
-                       offset = page_data_offset(page,
-                                       page->index == start_index
+                       folio_lock(folio);
+
+                       offset = page_data_offset(&folio->page,
+                                       folio->index == start_index
                                        ? start_offset & (PAGE_SIZE - 1)
                                        : 0);
                        if (offset >= 0) {
-                               ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
+                               ret = clamp(((loff_t) folio->index << PAGE_SHIFT) +
                                            offset,
                                            start_offset, end_offset);
-                               unlock_page(page);
-                               put_page(page);
+                               folio_unlock(folio);
+                               folio_batch_release(&fbatch);
                                return ret;
                        }
 
-                       unlock_page(page);
-                       put_page(page);
-               } else {
-                       break;
+                       folio_unlock(folio);
                }
+               folio_batch_release(&fbatch);
+               cond_resched();
        }
 
        return end_offset;
@@ -3322,7 +3345,7 @@ retry:
        }
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
@@ -3437,7 +3460,7 @@ retry:
        }
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
@@ -3452,18 +3475,26 @@ err:
 
 loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
 {
+       loff_t ret;
+
        switch (whence) {
        case SEEK_SET:
        case SEEK_CUR:
        case SEEK_END:
-               return generic_file_llseek(file, offset, whence);
+               ret = generic_file_llseek(file, offset, whence);
+               break;
        case SEEK_DATA:
-               return bch2_seek_data(file, offset);
+               ret = bch2_seek_data(file, offset);
+               break;
        case SEEK_HOLE:
-               return bch2_seek_hole(file, offset);
+               ret = bch2_seek_hole(file, offset);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
        }
 
-       return -EINVAL;
+       return bch2_err_class(ret);
 }
 
 void bch2_fs_fsio_exit(struct bch_fs *c)
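The other pattern running through this file: every VFS-facing entry point now returns through bch2_err_class(), which folds bcachefs-private error codes back into standard errnos before they escape to userspace. Minimal sketch, using a hypothetical wrapper rather than code from the commit:

    /* Hypothetical illustration only -- not part of this patch. */
    static long example_vfs_op(void)
    {
            long ret = -EROFS;      /* ... may also be a private -BCH_ERR_* code ... */

            return bch2_err_class(ret);
    }
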
index b24efeaf343e75279507009c6f9147916ad5c5c6..a8835298613a95954e4311b2ecec4088d7d47e91 100644 (file)
@@ -15,14 +15,13 @@ int __must_check bch2_write_inode_size(struct bch_fs *,
                                       struct bch_inode_info *,
                                       loff_t, unsigned);
 
-int bch2_writepage(struct page *, struct writeback_control *);
-int bch2_readpage(struct file *, struct page *);
+int bch2_read_folio(struct file *, struct folio *);
 
 int bch2_writepages(struct address_space *, struct writeback_control *);
 void bch2_readahead(struct readahead_control *);
 
 int bch2_write_begin(struct file *, struct address_space *, loff_t,
-                    unsigned, unsigned, struct page **, void **);
+                    unsigned, struct page **, void **);
 int bch2_write_end(struct file *, struct address_space *, loff_t,
                   unsigned, unsigned, struct page *, void *);
 
@@ -42,10 +41,8 @@ loff_t bch2_llseek(struct file *, loff_t, int);
 
 vm_fault_t bch2_page_fault(struct vm_fault *);
 vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
-int bch2_releasepage(struct page *, gfp_t);
-int bch2_migrate_page(struct address_space *, struct page *,
-                     struct page *, enum migrate_mode);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
 
 void bch2_fs_fsio_exit(struct bch_fs *);
 int bch2_fs_fsio_init(struct bch_fs *);
index 9f329a624c1270628aa4732b0de530bdd1b57056..2bb680827b44763783f663e6c2a5b223852430e4 100644 (file)
@@ -26,6 +26,9 @@ struct flags_set {
        unsigned                flags;
 
        unsigned                projid;
+
+       bool                    set_projinherit;
+       bool                    projinherit;
 };
 
 static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -50,6 +53,11 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
            (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
                return -EINVAL;
 
+       if (s->set_projinherit) {
+               bi->bi_fields_set &= ~(1 << Inode_opt_project);
+               bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+       }
+
        bi->bi_flags &= ~s->mask;
        bi->bi_flags |= newflags;
 
@@ -107,6 +115,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
        struct fsxattr fa = { 0 };
 
        fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+
+       if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+               fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
        fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
 
        return copy_to_user(arg, &fa, sizeof(fa));
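The fsxattr hunks here add FS_XFLAG_PROJINHERIT support: on get, the Inode_opt_project bit of bi_fields_set is reported as the xflag; on set, the xflag is peeled off into set_projinherit/projinherit before the remaining flags are mapped, since it has no BCH_INODE_* counterpart. A condensed sketch of the get side, with a hypothetical helper name (not code from this commit):

    /* Hypothetical illustration only -- not part of this patch. */
    static unsigned example_xflags(struct bch_inode_unpacked *bi)
    {
            unsigned xflags = map_flags(bch_flags_to_xflags, bi->bi_flags);

            if (bi->bi_fields_set & (1 << Inode_opt_project))
                    xflags |= FS_XFLAG_PROJINHERIT;

            return xflags;
    }
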
@@ -138,6 +150,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
        if (copy_from_user(&fa, arg, sizeof(fa)))
                return -EFAULT;
 
+       s.set_projinherit = true;
+       s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+       fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
        s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
        if (fa.fsx_xflags)
                return -EOPNOTSUPP;
@@ -455,51 +471,67 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       long ret;
 
        switch (cmd) {
        case FS_IOC_GETFLAGS:
-               return bch2_ioc_getflags(inode, (int __user *) arg);
+               ret = bch2_ioc_getflags(inode, (int __user *) arg);
+               break;
 
        case FS_IOC_SETFLAGS:
-               return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+               ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+               break;
 
        case FS_IOC_FSGETXATTR:
-               return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+               ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+               break;
+
        case FS_IOC_FSSETXATTR:
-               return bch2_ioc_fssetxattr(c, file, inode,
-                                          (void __user *) arg);
+               ret = bch2_ioc_fssetxattr(c, file, inode,
+                                         (void __user *) arg);
+               break;
 
        case BCHFS_IOC_REINHERIT_ATTRS:
-               return bch2_ioc_reinherit_attrs(c, file, inode,
-                                               (void __user *) arg);
+               ret = bch2_ioc_reinherit_attrs(c, file, inode,
+                                              (void __user *) arg);
+               break;
 
        case FS_IOC_GETVERSION:
-               return -ENOTTY;
+               ret = -ENOTTY;
+               break;
+
        case FS_IOC_SETVERSION:
-               return -ENOTTY;
+               ret = -ENOTTY;
+               break;
 
        case FS_IOC_GOINGDOWN:
-               return bch2_ioc_goingdown(c, (u32 __user *) arg);
+               ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
+               break;
 
        case BCH_IOCTL_SUBVOLUME_CREATE: {
                struct bch_ioctl_subvolume i;
 
-               if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
-                       return -EFAULT;
-               return bch2_ioctl_subvolume_create(c, file, i);
+               ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+                       ? -EFAULT
+                       : bch2_ioctl_subvolume_create(c, file, i);
+               break;
        }
 
        case BCH_IOCTL_SUBVOLUME_DESTROY: {
                struct bch_ioctl_subvolume i;
 
-               if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
-                       return -EFAULT;
-               return bch2_ioctl_subvolume_destroy(c, file, i);
+               ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+                       ? -EFAULT
+                       : bch2_ioctl_subvolume_destroy(c, file, i);
+               break;
        }
 
        default:
-               return bch2_fs_ioctl(c, cmd, (void __user *) arg);
+               ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
+               break;
        }
+
+       return bch2_err_class(ret);
 }
 
 #ifdef CONFIG_COMPAT
index 91fa1897db98358c8d2f9f246002463d13df0822..186faa54b590f1e736c1b660d5c83d1bc56cdbda 100644 (file)
@@ -8,6 +8,7 @@
 #include "buckets.h"
 #include "chardev.h"
 #include "dirent.h"
+#include "errcode.h"
 #include "extents.h"
 #include "fs.h"
 #include "fs-common.h"
@@ -30,6 +31,7 @@
 #include <linux/pagemap.h>
 #include <linux/posix_acl.h>
 #include <linux/random.h>
+#include <linux/seq_file.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
 #include <linux/xattr.h>
@@ -152,7 +154,7 @@ retry:
 
        bch2_trans_iter_exit(&trans, &iter);
 
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
@@ -322,7 +324,7 @@ retry:
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
 err_before_quota:
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
                goto err_trans;
        }
@@ -417,7 +419,7 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
                              (subvol_inum) { 0 }, 0);
 
        if (IS_ERR(inode))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
        d_instantiate(dentry, &inode->v);
        return 0;
@@ -442,7 +444,7 @@ static int __bch2_link(struct bch_fs *c,
        mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                        bch2_link_trans(&trans,
                                        inode_inum(dir),   &dir_u,
                                        inode_inum(inode), &inode_u,
@@ -491,7 +493,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = __bch2_trans_do(&trans, NULL, NULL,
+       ret = commit_do(&trans, NULL, NULL,
                              BTREE_INSERT_NOFAIL,
                        bch2_unlink_trans(&trans,
                                          inode_inum(dir), &dir_u,
@@ -526,8 +528,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
 
        inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-       if (unlikely(IS_ERR(inode)))
-               return PTR_ERR(inode);
+       if (IS_ERR(inode))
+               return bch2_err_class(PTR_ERR(inode));
 
        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
@@ -613,7 +615,7 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
                        goto err;
        }
 
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                        bch2_rename_trans(&trans,
                                          inode_inum(src_dir), &src_dir_u,
                                          inode_inum(dst_dir), &dst_dir_u,
@@ -753,7 +755,7 @@ retry:
 btree_err:
        bch2_trans_iter_exit(&trans, &inode_iter);
 
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err_trans;
@@ -767,7 +769,7 @@ err_trans:
 err:
        mutex_unlock(&inode->ei_update_lock);
 
-       return ret;
+       return bch2_err_class(ret);
 }
 
 static int bch2_getattr(struct user_namespace *mnt_userns,
@@ -836,7 +838,7 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
        if (IS_ERR(inode))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
        d_mark_tmpfile(dentry, &inode->v);
        d_instantiate(dentry, &inode->v);
@@ -933,9 +935,9 @@ retry:
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                             SPOS(ei->v.i_ino, start, snapshot), 0);
 
-       while ((k = bch2_btree_iter_peek(&iter)).k &&
-              !(ret = bkey_err(k)) &&
-              bkey_cmp(iter.pos, end) < 0) {
+       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+              (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+              !(ret = bkey_err(k))) {
                enum btree_id data_btree = BTREE_ID_extents;
 
                if (!bkey_extent_is_data(k.k) &&
@@ -984,7 +986,7 @@ retry:
        start = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        if (!ret && have_extent)
@@ -1111,18 +1113,17 @@ static const struct inode_operations bch_special_inode_operations = {
 };
 
 static const struct address_space_operations bch_address_space_operations = {
-       .writepage      = bch2_writepage,
-       .readpage       = bch2_readpage,
+       .read_folio     = bch2_read_folio,
        .writepages     = bch2_writepages,
        .readahead      = bch2_readahead,
-       .set_page_dirty = __set_page_dirty_nobuffers,
+       .dirty_folio    = filemap_dirty_folio,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
-       .invalidatepage = bch2_invalidatepage,
-       .releasepage    = bch2_releasepage,
+       .invalidate_folio = bch2_invalidate_folio,
+       .release_folio  = bch2_release_folio,
        .direct_IO      = noop_direct_IO,
 #ifdef CONFIG_MIGRATION
-       .migratepage    = bch2_migrate_page,
+       .migrate_folio  = filemap_migrate_folio,
 #endif
        .error_remove_page = generic_error_remove_page,
 };
@@ -1335,7 +1336,7 @@ found:
        memcpy(name, d.v->d_name, name_len);
        name[name_len] = '\0';
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_iter_exit(&trans, &iter1);
@@ -1452,7 +1453,7 @@ static int bch2_vfs_write_inode(struct inode *vinode,
                               ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
-       return ret;
+       return bch2_err_class(ret);
 }
 
 static void bch2_evict_inode(struct inode *vinode)
@@ -1476,7 +1477,7 @@ static void bch2_evict_inode(struct inode *vinode)
 }
 
 void bch2_evict_subvolume_inodes(struct bch_fs *c,
-                                struct snapshot_id_list *s)
+                                snapshot_id_list *s)
 {
        struct super_block *sb = c->vfs_sb;
        struct inode *inode;
@@ -1556,6 +1557,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int bch2_sync_fs(struct super_block *sb, int wait)
 {
        struct bch_fs *c = sb->s_fs_info;
+       int ret;
 
        if (c->opts.journal_flush_disabled)
                return 0;
@@ -1565,7 +1567,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
 
-       return bch2_journal_flush(&c->journal);
+       ret = bch2_journal_flush(&c->journal);
+       return bch2_err_class(ret);
 }
 
 static struct bch_fs *bch2_path_to_fs(const char *path)
@@ -1621,7 +1624,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
 
        ret = bch2_parse_mount_opts(c, &opts, data);
        if (ret)
-               return ret;
+               goto err;
 
        if (opts.read_only != c->opts.read_only) {
                down_write(&c->state_lock);
@@ -1635,7 +1638,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
                        if (ret) {
                                bch_err(c, "error going rw: %i", ret);
                                up_write(&c->state_lock);
-                               return -EINVAL;
+                               ret = -EINVAL;
+                               goto err;
                        }
 
                        sb->s_flags &= ~SB_RDONLY;
@@ -1648,8 +1652,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
 
        if (opts.errors >= 0)
                c->opts.errors = opts.errors;
-
-       return ret;
+err:
+       return bch2_err_class(ret);
 }
 
 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
@@ -1674,7 +1678,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct bch_fs *c = root->d_sb->s_fs_info;
        enum bch_opt_id i;
-       char buf[512];
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
 
        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
@@ -1686,13 +1691,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;
 
-               bch2_opt_to_text(&PBUF(buf), c, opt, v,
+               printbuf_reset(&buf);
+               bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
                                 OPT_SHOW_MOUNT_STYLE);
                seq_putc(seq, ',');
-               seq_puts(seq, buf);
+               seq_puts(seq, buf.buf);
        }
 
-       return 0;
+       if (buf.allocation_failure)
+               ret = -ENOMEM;
+       printbuf_exit(&buf);
+       return ret;
 }
 
 static void bch2_put_super(struct super_block *sb)
@@ -1837,7 +1846,7 @@ got_sb:
        sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
        sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
        c->vfs_sb               = sb;
-       strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+       strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 
        ret = super_setup_bdi(sb);
        if (ret)
@@ -1865,10 +1874,9 @@ got_sb:
        sb->s_shrink.seeks = 0;
 
        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
-       if (IS_ERR(vinode)) {
-               bch_err(c, "error mounting: error getting root inode %i",
-                       (int) PTR_ERR(vinode));
-               ret = PTR_ERR(vinode);
+       ret = PTR_ERR_OR_ZERO(vinode);
+       if (ret) {
+               bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
                goto err_put_super;
        }
 
@@ -1909,8 +1917,7 @@ MODULE_ALIAS_FS("bcachefs");
 void bch2_vfs_exit(void)
 {
        unregister_filesystem(&bcache_fs_type);
-       if (bch2_inode_cache)
-               kmem_cache_destroy(bch2_inode_cache);
+       kmem_cache_destroy(bch2_inode_cache);
 }
 
 int __init bch2_vfs_init(void)
index b2211ec7f3028600a941014057440dc3dc056c7a..9f4b57e30e2a7d14e8c66598c2203cbf970d319e 100644 (file)
@@ -191,7 +191,7 @@ int bch2_setattr_nonsize(struct user_namespace *,
                         struct iattr *);
 int __bch2_unlink(struct inode *, struct dentry *, bool);
 
-void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
@@ -199,7 +199,7 @@ int bch2_vfs_init(void);
 #else
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
-                                              struct snapshot_id_list *s) {}
+                                              snapshot_id_list *s) {}
 static inline void bch2_vfs_exit(void) {}
 static inline int bch2_vfs_init(void) { return 0; }
 
index ced4d671eb8d707e49b8600a5bfa607c4711751f..ca95d85b73488ef849c54ab26e044d72a7e76e32 100644 (file)
@@ -3,6 +3,7 @@
 #include "bcachefs.h"
 #include "bkey_buf.h"
 #include "btree_update.h"
+#include "darray.h"
 #include "dirent.h"
 #include "error.h"
 #include "fs-common.h"
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
+/*
+ * XXX: this is handling transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
+ */
 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
                                    u32 snapshot)
 {
@@ -135,9 +140,9 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
 
        ret = bch2_inode_unpack(k, inode);
 err:
-       if (ret && ret != -EINTR)
-               bch_err(trans->c, "error %i fetching inode %llu",
-                       ret, inode_nr);
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(trans->c, "error fetching inode %llu: %s",
+                       inode_nr, bch2_err_str(ret));
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -163,9 +168,9 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
        if (!ret)
                *snapshot = iter.pos.snapshot;
 err:
-       if (ret && ret != -EINTR)
-               bch_err(trans->c, "error %i fetching inode %llu:%u",
-                       ret, inode_nr, *snapshot);
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(trans->c, "error fetching inode %llu:%u: %s",
+                       inode_nr, *snapshot, bch2_err_str(ret));
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -219,35 +224,39 @@ static int write_inode(struct btree_trans *trans,
                       struct bch_inode_unpacked *inode,
                       u32 snapshot)
 {
-       int ret = __bch2_trans_do(trans, NULL, NULL,
+       int ret = commit_do(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_LAZY_RW,
                                  __write_inode(trans, inode, snapshot));
        if (ret)
-               bch_err(trans->c, "error in fsck: error %i updating inode", ret);
+               bch_err(trans->c, "error in fsck: error updating inode: %s",
+                       bch2_err_str(ret));
        return ret;
 }
 
 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
        struct bch_inode_unpacked inode_u;
        struct bkey_s_c k;
        int ret;
 
-       ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-                                             SPOS(inum, 0, snapshot),
-                                             SPOS(inum, U64_MAX, snapshot),
-                                             0, NULL) ?:
-               bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
-                                             SPOS(inum, 0, snapshot),
-                                             SPOS(inum, U64_MAX, snapshot),
-                                             0, NULL) ?:
-               bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
-                                             SPOS(inum, 0, snapshot),
-                                             SPOS(inum, U64_MAX, snapshot),
-                                             0, NULL);
+       do {
+               ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+                                                     SPOS(inum, 0, snapshot),
+                                                     SPOS(inum, U64_MAX, snapshot),
+                                                     0, NULL) ?:
+                       bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+                                                     SPOS(inum, 0, snapshot),
+                                                     SPOS(inum, U64_MAX, snapshot),
+                                                     0, NULL) ?:
+                       bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+                                                     SPOS(inum, 0, snapshot),
+                                                     SPOS(inum, U64_MAX, snapshot),
+                                                     0, NULL);
+       } while (ret == -BCH_ERR_transaction_restart_nested);
        if (ret)
                goto err;
 retry:
@@ -262,7 +271,7 @@ retry:
                goto err;
 
        if (!bkey_is_inode(k.k)) {
-               bch2_fs_inconsistent(trans->c,
+               bch2_fs_inconsistent(c,
                                     "inode %llu:%u not found when deleting",
                                     inum, snapshot);
                ret = -EIO;
@@ -272,11 +281,8 @@ retry:
        bch2_inode_unpack(k, &inode_u);
 
        /* Subvolume root? */
-       if (inode_u.bi_subvol) {
-               ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
-               if (ret)
-                       goto err;
-       }
+       if (inode_u.bi_subvol)
+               bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
 
        bkey_inode_generation_init(&delete.k_i);
        delete.k.p = iter.pos;
@@ -287,10 +293,10 @@ retry:
                                BTREE_INSERT_NOFAIL);
 err:
        bch2_trans_iter_exit(trans, &iter);
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       return ret;
+       return ret ?: -BCH_ERR_transaction_restart_nested;
 }
 
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
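[Note] fsck_inode_rm() above also shows the nested-restart convention referenced by the XXX comment near the top of this file: a helper that commits transactions internally retries -BCH_ERR_transaction_restart_nested itself, then returns that code to its caller so the outer iteration knows its transaction was restarted underneath it. A hedged sketch of the shape, with do_inner_work() as a hypothetical stand-in:

	static int helper_that_commits(struct btree_trans *trans)
	{
		int ret;

		do {
			ret = do_inner_work(trans);	/* hypothetical inner deletes/commits */
		} while (ret == -BCH_ERR_transaction_restart_nested);

		/* tell the caller the transaction was restarted under it: */
		return ret ?: -BCH_ERR_transaction_restart_nested;
	}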
@@ -303,15 +309,19 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 
        ret = lookup_first_inode(trans, pos.inode, &dir_inode);
        if (ret)
-               return ret;
+               goto err;
 
        dir_hash_info = bch2_hash_info_init(c, &dir_inode);
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
 
        ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                 &dir_hash_info, &iter, 0);
+                                 &dir_hash_info, &iter,
+                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
        bch2_trans_iter_exit(trans, &iter);
+err:
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -346,8 +356,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
                goto create_lostfound;
        }
 
-       if (ret && ret != -EINTR)
-               bch_err(c, "error looking up lost+found: %i", ret);
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
        if (ret)
                return ret;
 
@@ -369,8 +379,8 @@ create_lostfound:
                                lostfound, &lostfound_str,
                                0, 0, S_IFDIR|0700, 0, NULL, NULL,
                                (subvol_inum) { }, 0);
-       if (ret && ret != -EINTR)
-               bch_err(c, "error creating lost+found: %i", ret);
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -429,13 +439,13 @@ static int reattach_inode(struct btree_trans *trans,
                          struct bch_inode_unpacked *inode,
                          u32 inode_snapshot)
 {
-       int ret = __bch2_trans_do(trans, NULL, NULL,
+       int ret = commit_do(trans, NULL, NULL,
                                  BTREE_INSERT_LAZY_RW|
                                  BTREE_INSERT_NOFAIL,
                        __reattach_inode(trans, inode, inode_snapshot));
        if (ret) {
-               bch_err(trans->c, "error %i reattaching inode %llu",
-                       ret, inode->bi_inum);
+               bch_err(trans->c, "error reattaching inode %llu: %s",
+                       inode->bi_inum, bch2_err_str(ret));
                return ret;
        }
 
@@ -466,19 +476,82 @@ out:
        return ret;
 }
 
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+struct snapshots_seen_entry {
+       u32                             id;
+       u32                             equiv;
+};
+
+struct snapshots_seen {
+       struct bpos                     pos;
+       DARRAY(struct snapshots_seen_entry) ids;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+       darray_exit(&s->ids);
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+       memset(s, 0, sizeof(*s));
+}
+
+static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+       struct snapshots_seen_entry *i, n = { id, id };
+       int ret;
+
+       darray_for_each(s->ids, i) {
+               if (n.equiv < i->equiv)
+                       break;
+
+               if (i->equiv == n.equiv) {
+                       bch_err(c, "adding duplicate snapshot in snapshots_seen_add()");
+                       return -EINVAL;
+               }
+       }
+
+       ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+       if (ret)
+               bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+                       s->ids.size);
+       return ret;
+}
+
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+                                enum btree_id btree_id, struct bpos pos)
 {
-       pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+       struct snapshots_seen_entry *i, n = {
+               .id     = pos.snapshot,
+               .equiv  = bch2_snapshot_equiv(c, pos.snapshot),
+       };
+       int ret = 0;
 
        if (bkey_cmp(s->pos, pos))
-               s->nr = 0;
+               s->ids.nr = 0;
+
+       pos.snapshot = n.equiv;
        s->pos = pos;
 
-       /* Might get called multiple times due to lock restarts */
-       if (s->nr && s->d[s->nr - 1] == pos.snapshot)
-               return 0;
+       darray_for_each(s->ids, i)
+               if (i->equiv == n.equiv) {
+                       if (fsck_err_on(i->id != n.id, c,
+                                       "snapshot deletion did not run correctly:\n"
+                                       "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
+                                       bch2_btree_ids[btree_id],
+                                       pos.inode, pos.offset,
+                                       i->id, n.id, n.equiv))
+                               return -BCH_ERR_need_snapshot_cleanup;
+
+                       return 0;
+               }
 
-       return snapshots_seen_add(c, s, pos.snapshot);
+       ret = darray_push(&s->ids, n);
+       if (ret)
+               bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+                       s->ids.size);
+fsck_err:
+       return ret;
 }
 
 /**
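[Note] snapshots_seen, inode_walker and (further down) pathbuf all switch from hand-rolled krealloc-grown arrays to the DARRAY helpers from the new libbcachefs/darray.h. A minimal usage sketch, assuming only the calls and fields visible in these hunks (darray_push, darray_for_each, darray_exit, and the data/nr/size members):

	DARRAY(struct snapshots_seen_entry) ids = { 0 };
	struct snapshots_seen_entry *i, n = { .id = 1, .equiv = 1 };
	int ret = darray_push(&ids, n);		/* append, growing ids.data as needed */

	if (!ret)
		darray_for_each(ids, i)		/* i walks ids.data[0..ids.nr) */
			pr_info("snapshot %u (equiv %u)", i->id, i->equiv);
	darray_exit(&ids);			/* frees ids.data */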
@@ -491,15 +564,15 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
                                    u32 id, u32 ancestor)
 {
        ssize_t i;
+       u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0;
 
        BUG_ON(id > ancestor);
-
-       id              = snapshot_t(c, id)->equiv;
-       ancestor        = snapshot_t(c, ancestor)->equiv;
+       BUG_ON(!bch2_snapshot_is_equiv(c, id));
+       BUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
 
        /* @ancestor should be the snapshot most recently added to @seen */
-       BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
-       BUG_ON(seen->pos.snapshot != ancestor);
+       BUG_ON(ancestor != seen->pos.snapshot);
+       BUG_ON(ancestor != top);
 
        if (id == ancestor)
                return true;
@@ -507,11 +580,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
        if (!bch2_snapshot_is_ancestor(c, id, ancestor))
                return false;
 
-       for (i = seen->nr - 2;
-            i >= 0 && seen->d[i] >= id;
+       for (i = seen->ids.nr - 2;
+            i >= 0 && seen->ids.data[i].equiv >= id;
             --i)
-               if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
-                   bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+               if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) &&
+                   bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor))
                        return false;
 
        return true;
@@ -536,27 +609,27 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
                : bch2_snapshot_is_ancestor(c, src, dst);
 }
 
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)      \
-       for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                              \
+       for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
+            (_i)->snapshot <= (_snapshot); _i++)                                       \
                if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
 
+struct inode_walker_entry {
+       struct bch_inode_unpacked inode;
+       u32                     snapshot;
+       u64                     count;
+};
+
 struct inode_walker {
        bool                            first_this_inode;
        u64                             cur_inum;
 
-       size_t                          nr;
-       size_t                          size;
-       struct inode_walker_entry {
-               struct bch_inode_unpacked inode;
-               u32                     snapshot;
-               u64                     count;
-       } *d;
+       DARRAY(struct inode_walker_entry) inodes;
 };
 
 static void inode_walker_exit(struct inode_walker *w)
 {
-       kfree(w->d);
-       w->d = NULL;
+       darray_exit(&w->inodes);
 }
 
 static struct inode_walker inode_walker_init(void)
@@ -564,43 +637,17 @@ static struct inode_walker inode_walker_init(void)
        return (struct inode_walker) { 0, };
 }
 
-static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
-{
-       if (w->nr == w->size) {
-               size_t new_size = max_t(size_t, 8UL, w->size * 2);
-               void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
-                                  GFP_KERNEL);
-               if (!d) {
-                       bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
-                               new_size);
-                       return -ENOMEM;
-               }
-
-               w->d = d;
-               w->size = new_size;
-       }
-
-       return 0;
-}
-
 static int add_inode(struct bch_fs *c, struct inode_walker *w,
                     struct bkey_s_c inode)
 {
        struct bch_inode_unpacked u;
-       int ret;
-
-       ret = inode_walker_realloc(c, w);
-       if (ret)
-               return ret;
 
        BUG_ON(bch2_inode_unpack(inode, &u));
 
-       w->d[w->nr++] = (struct inode_walker_entry) {
+       return darray_push(&w->inodes, ((struct inode_walker_entry) {
                .inode          = u,
-               .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
-       };
-
-       return 0;
+               .snapshot       = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+       }));
 }
 
 static int __walk_inode(struct btree_trans *trans,
@@ -609,17 +656,18 @@ static int __walk_inode(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
-       unsigned i, ancestor_pos;
+       u32 restart_count = trans->restart_count;
+       unsigned i;
        int ret;
 
-       pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+       pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot);
 
        if (pos.inode == w->cur_inum) {
                w->first_this_inode = false;
                goto lookup_snapshot;
        }
 
-       w->nr = 0;
+       w->inodes.nr = 0;
 
        for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -636,27 +684,33 @@ static int __walk_inode(struct btree_trans *trans,
 
        w->cur_inum             = pos.inode;
        w->first_this_inode     = true;
+
+       if (trans_was_restarted(trans, restart_count))
+               return -BCH_ERR_transaction_restart_nested;
+
 lookup_snapshot:
-       for (i = 0; i < w->nr; i++)
-               if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+       for (i = 0; i < w->inodes.nr; i++)
+               if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
                        goto found;
        return INT_MAX;
 found:
-       BUG_ON(pos.snapshot > w->d[i].snapshot);
+       BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
+
+       if (pos.snapshot != w->inodes.data[i].snapshot) {
+               struct inode_walker_entry e = w->inodes.data[i];
+
+               e.snapshot = pos.snapshot;
+               e.count = 0;
 
-       if (pos.snapshot != w->d[i].snapshot) {
-               ancestor_pos = i;
+               bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
+                        pos.inode, pos.snapshot, w->inodes.data[i].snapshot);
 
-               while (i && w->d[i - 1].snapshot > pos.snapshot)
+               while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
                        --i;
 
-               ret = inode_walker_realloc(c, w);
+               ret = darray_insert_item(&w->inodes, i, e);
                if (ret)
                        return ret;
-
-               array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
-               w->d[i].snapshot = pos.snapshot;
-               w->d[i].count   = 0;
        }
 
        return i;
@@ -672,21 +726,23 @@ static int __get_visible_inodes(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret;
 
-       w->nr = 0;
+       w->inodes.nr = 0;
 
-       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
                if (k.k->p.offset != inum)
                        break;
 
-               if (!bkey_is_inode(k.k))
+               if (!ref_visible(c, s, s->pos.snapshot, equiv))
                        continue;
 
-               if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+               if (bkey_is_inode(k.k))
                        add_inode(c, w, k);
-                       if (k.k->p.snapshot >= s->pos.snapshot)
-                               break;
-               }
+
+               if (equiv >= s->pos.snapshot)
+                       break;
        }
        bch2_trans_iter_exit(trans, &iter);
 
@@ -698,15 +754,16 @@ static int check_key_has_snapshot(struct btree_trans *trans,
                                  struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
-       if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+       if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
                        "key in missing snapshot: %s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
-               return bch2_btree_delete_at(trans, iter,
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+               ret = bch2_btree_delete_at(trans, iter,
                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
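[Note] check_key_has_snapshot() above is the first of several places where a fixed char buf[200] plus PBUF() is replaced by a struct printbuf, which (unlike the on-stack buffer) can grow as text is appended and must be freed with printbuf_exit(). A minimal sketch of the pattern as used in these hunks:

	struct printbuf buf = PRINTBUF;		/* starts empty */

	bch2_bkey_val_to_text(&buf, c, k);	/* append a formatted key */
	bch_err(c, "bad key: %s", buf.buf);	/* buf.buf is the C string */
	printbuf_reset(&buf);			/* reuse for the next message */
	printbuf_exit(&buf);			/* free whatever was allocated */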
 
@@ -715,9 +772,6 @@ static int hash_redo_key(struct btree_trans *trans,
                         struct bch_hash_info *hash_info,
                         struct btree_iter *k_iter, struct bkey_s_c k)
 {
-       bch_err(trans->c, "hash_redo_key() not implemented yet");
-       return -EINVAL;
-#if 0
        struct bkey_i *delete;
        struct bkey_i *tmp;
 
@@ -735,8 +789,14 @@ static int hash_redo_key(struct btree_trans *trans,
        delete->k.p = k_iter->pos;
        return  bch2_btree_iter_traverse(k_iter) ?:
                bch2_trans_update(trans, k_iter, delete, 0) ?:
-               bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-#endif
+               bch2_hash_set_snapshot(trans, desc, hash_info,
+                                      (subvol_inum) { 0, k.k->p.inode },
+                                      k.k->p.snapshot, tmp,
+                                      BCH_HASH_SET_MUST_CREATE,
+                                      BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                                 BTREE_INSERT_NOFAIL|
+                                 BTREE_INSERT_LAZY_RW);
 }
 
 static int hash_check_key(struct btree_trans *trans,
@@ -746,7 +806,7 @@ static int hash_check_key(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter = { NULL };
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        struct bkey_s_c k;
        u64 hash;
        int ret = 0;
@@ -762,16 +822,18 @@ static int hash_check_key(struct btree_trans *trans,
        if (hash_k.k->p.offset < hash)
                goto bad_hash;
 
-       for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
-                          BTREE_ITER_SLOTS, k, ret) {
+       for_each_btree_key_norestart(trans, iter, desc.btree_id,
+                                    POS(hash_k.k->p.inode, hash),
+                                    BTREE_ITER_SLOTS, k, ret) {
                if (!bkey_cmp(k.k->p, hash_k.k->p))
                        break;
 
                if (fsck_err_on(k.k->type == desc.key_type &&
                                !desc.cmp_bkey(k, hash_k), c,
                                "duplicate hash table keys:\n%s",
-                               (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                                      hash_k), buf))) {
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, hash_k),
+                                buf.buf))) {
                        ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
                        break;
                }
@@ -780,49 +842,49 @@ static int hash_check_key(struct btree_trans *trans,
                        bch2_trans_iter_exit(trans, &iter);
                        goto bad_hash;
                }
-
        }
+out:
        bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
        return ret;
 bad_hash:
-       if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
+       if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, "
                     "hashed to %llu\n%s",
-                    desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
-                    (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
-               return 0;
-
-       ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
-       if (ret) {
-               bch_err(c, "hash_redo_key err %i", ret);
-               return ret;
+                    bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+                    (printbuf_reset(&buf),
+                     bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+               ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
+               if (ret) {
+                       bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+                       return ret;
+               }
+               ret = -BCH_ERR_transaction_restart_nested;
        }
-       return -EINTR;
 fsck_err:
-       return ret;
+       goto out;
 }
 
 static int check_inode(struct btree_trans *trans,
                       struct btree_iter *iter,
+                      struct bkey_s_c k,
                       struct bch_inode_unpacked *prev,
+                      struct snapshots_seen *s,
                       bool full)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
        struct bch_inode_unpacked u;
        bool do_update = false;
        int ret;
 
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k)
-               return 0;
-
-       ret = bkey_err(k);
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret < 0)
+               goto err;
        if (ret)
-               return ret;
+               return 0;
 
-       ret = check_key_has_snapshot(trans, iter, k);
+       ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
        if (ret)
-               return ret < 0 ? ret : 0;
+               goto err;
 
        /*
         * if snapshot id isn't a leaf node, skip it - deletion in
@@ -861,8 +923,9 @@ static int check_inode(struct btree_trans *trans,
                bch2_fs_lazy_rw(c);
 
                ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
-               if (ret)
-                       bch_err(c, "error in fsck: error %i while deleting inode", ret);
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       bch_err(c, "error in fsck: error while deleting inode: %s",
+                               bch2_err_str(ret));
                return ret;
        }
 
@@ -885,7 +948,8 @@ static int check_inode(struct btree_trans *trans,
                                POS(u.bi_inum, U64_MAX),
                                0, NULL);
                if (ret) {
-                       bch_err(c, "error in fsck: error %i truncating inode", ret);
+                       bch_err(c, "error in fsck: error truncating inode: %s",
+                               bch2_err_str(ret));
                        return ret;
                }
 
@@ -910,8 +974,8 @@ static int check_inode(struct btree_trans *trans,
 
                sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
                if (sectors < 0) {
-                       bch_err(c, "error in fsck: error %i recounting inode sectors",
-                               (int) sectors);
+                       bch_err(c, "error in fsck: error recounting inode sectors: %s",
+                               bch2_err_str(sectors));
                        return sectors;
                }
 
@@ -928,12 +992,15 @@ static int check_inode(struct btree_trans *trans,
        }
 
        if (do_update) {
-               ret = write_inode(trans, &u, iter->pos.snapshot);
+               ret = __write_inode(trans, &u, iter->pos.snapshot);
                if (ret)
-                       bch_err(c, "error in fsck: error %i "
-                               "updating inode", ret);
+                       bch_err(c, "error in fsck: error updating inode: %s",
+                               bch2_err_str(ret));
        }
+err:
 fsck_err:
+       if (ret)
+               bch_err(c, "error from check_inode(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -943,86 +1010,23 @@ static int check_inodes(struct bch_fs *c, bool full)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bch_inode_unpacked prev = { 0 };
-       int ret;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       do {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_LAZY_RW|
-                                     BTREE_INSERT_NOFAIL,
-                       check_inode(&trans, &iter, &prev, full));
-               if (ret)
-                       break;
-       } while (bch2_btree_iter_advance(&iter));
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-       return ret;
-}
-
-static int check_subvol(struct btree_trans *trans,
-                       struct btree_iter *iter)
-{
+       struct snapshots_seen s;
        struct bkey_s_c k;
-       struct bkey_s_c_subvolume subvol;
-       int ret;
-
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k)
-               return 0;
-
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (k.k->type != KEY_TYPE_subvolume)
-               return 0;
-
-       subvol = bkey_s_c_to_subvolume(k);
-
-       if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
-               ret = bch2_subvolume_delete(trans, iter->pos.offset);
-               if (ret && ret != -EINTR)
-                       bch_err(trans->c, "error deleting subvolume %llu: %i",
-                               iter->pos.offset, ret);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-noinline_for_stack
-static int check_subvols(struct bch_fs *c)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
        int ret;
 
+       snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
-                            POS_MIN,
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH);
-
-       do {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_LAZY_RW|
-                                     BTREE_INSERT_NOFAIL,
-                                     check_subvol(&trans, &iter));
-               if (ret)
-                       break;
-       } while (bch2_btree_iter_advance(&iter));
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+                       POS_MIN,
+                       BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_inode(&trans, &iter, k, &prev, &s, full));
 
        bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
+       if (ret)
+               bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret));
        return ret;
 }
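[Note] check_inodes() above, and check_extents()/check_dirents()/check_xattrs() below, replace the open-coded iterate-then-__bch2_trans_do loops with for_each_btree_key_commit(), which evaluates the per-key expression and commits the transaction for each key visited. A hedged sketch of the call shape, with the argument order copied from these call sites; do_one_key() is a hypothetical per-key body, and the two NULLs are presumably the disk-reservation and journal-seq arguments also seen in commit_do():

	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, POS_MIN,
			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
			NULL, NULL,			/* disk reservation, journal seq */
			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		do_one_key(&trans, &iter, k));		/* hypothetical per-key body */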
 
@@ -1114,7 +1118,7 @@ static int inode_backpointer_exists(struct btree_trans *trans,
                        SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
        ret = bkey_err(d.s_c);
        if (ret)
-               return ret;
+               return ret == -ENOENT ? 0 : ret;
 
        ret = dirent_points_to_inode(d, inode);
        bch2_trans_iter_exit(trans, &iter);
@@ -1125,15 +1129,15 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 {
        struct bch_fs *c = trans->c;
        struct inode_walker_entry *i;
-       int ret = 0, ret2 = 0;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
        s64 count2;
 
-       for (i = w->d; i < w->d + w->nr; i++) {
+       darray_for_each(w->inodes, i) {
                if (i->inode.bi_sectors == i->count)
                        continue;
 
-               count2 = lockrestart_do(trans,
-                       bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+               count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot);
 
                if (i->count != count2) {
                        bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
@@ -1146,53 +1150,55 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
                if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
                            "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
                            w->cur_inum, i->snapshot,
-                           i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
-                       continue;
-
-               i->inode.bi_sectors = i->count;
-               ret = write_inode(trans, &i->inode, i->snapshot);
-               if (ret)
-                       break;
-               ret2 = -EINTR;
+                           i->inode.bi_sectors, i->count)) {
+                       i->inode.bi_sectors = i->count;
+                       ret = write_inode(trans, &i->inode, i->snapshot);
+                       if (ret)
+                               break;
+               }
        }
 fsck_err:
-       return ret ?: ret2;
+       if (ret)
+               bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret));
+       if (!ret && trans_was_restarted(trans, restart_count))
+               ret = -BCH_ERR_transaction_restart_nested;
+       return ret;
 }
 
 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+                       struct bkey_s_c k,
                        struct inode_walker *inode,
                        struct snapshots_seen *s)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
        struct inode_walker_entry *i;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
+       struct bpos equiv;
        int ret = 0;
 
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k)
-               return 0;
-
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
        ret = check_key_has_snapshot(trans, iter, k);
-       if (ret)
-               return ret < 0 ? ret : 0;
+       if (ret) {
+               ret = ret < 0 ? ret : 0;
+               goto out;
+       }
+
+       equiv = k.k->p;
+       equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
 
-       ret = snapshots_seen_update(c, s, k.k->p);
+       ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
        if (ret)
-               return ret;
+               goto err;
 
        if (k.k->type == KEY_TYPE_whiteout)
-               return 0;
+               goto out;
 
        if (inode->cur_inum != k.k->p.inode) {
                ret = check_i_sectors(trans, inode);
                if (ret)
-                       return ret;
+                       goto err;
        }
+
+       BUG_ON(!iter->path->should_be_locked);
 #if 0
        if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
                char buf1[200];
@@ -1201,59 +1207,95 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
                bch2_bkey_val_to_text(&PBUF(buf2), c, k);
 
-               if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
-                       return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+               if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
+                       ret = fix_overlapping_extent(trans, k, prev.k->k.p)
+                               ?: -BCH_ERR_transaction_restart_nested;
+                       goto out;
+               }
        }
 #endif
-       ret = __walk_inode(trans, inode, k.k->p);
+       ret = __walk_inode(trans, inode, equiv);
        if (ret < 0)
-               return ret;
+               goto err;
 
        if (fsck_err_on(ret == INT_MAX, c,
                        "extent in missing inode:\n  %s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
-               return bch2_btree_delete_at(trans, iter,
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, iter,
                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               goto out;
+       }
 
-       if (ret == INT_MAX)
-               return 0;
+       if (ret == INT_MAX) {
+               ret = 0;
+               goto out;
+       }
 
-       i = inode->d + ret;
+       i = inode->inodes.data + ret;
        ret = 0;
 
        if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
                        !S_ISLNK(i->inode.bi_mode), c,
                        "extent in non regular inode mode %o:\n  %s",
                        i->inode.bi_mode,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
-               return bch2_btree_delete_at(trans, iter,
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, iter,
                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               goto out;
+       }
+
+       /*
+        * Check inodes in reverse order, from oldest snapshots to newest, so
+        * that we emit the fewest number of whiteouts necessary:
+        */
+       for (i = inode->inodes.data + inode->inodes.nr - 1;
+            i >= inode->inodes.data;
+            --i) {
+               if (i->snapshot > equiv.snapshot ||
+                   !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+                       continue;
+
+               if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                               k.k->type != KEY_TYPE_reservation &&
+                               k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+                               "extent type past end of inode %llu:%u, i_size %llu\n  %s",
+                               i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+                       struct btree_iter iter2;
+
+                       bch2_trans_copy_iter(&iter2, iter);
+                       bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+                       ret =   bch2_btree_iter_traverse(&iter2) ?:
+                               bch2_btree_delete_at(trans, &iter2,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       bch2_trans_iter_exit(trans, &iter2);
+                       if (ret)
+                               goto err;
 
-       if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
-               for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
-                       if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-                                       k.k->type != KEY_TYPE_reservation &&
-                                       k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
-                                       "extent type %u offset %llu past end of inode %llu, i_size %llu",
-                                       k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
-                               bch2_fs_lazy_rw(c);
-                               return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-                                               SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
-                                                    k.k->p.snapshot),
-                                               POS(k.k->p.inode, U64_MAX),
-                                               0, NULL) ?: -EINTR;
+                       if (i->snapshot != equiv.snapshot) {
+                               ret = snapshots_seen_add(c, s, i->snapshot);
+                               if (ret)
+                                       goto err;
                        }
                }
        }
 
        if (bkey_extent_is_allocation(k.k))
-               for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+               for_each_visible_inode(c, s, inode, equiv.snapshot, i)
                        i->count += k.k->size;
 #if 0
        bch2_bkey_buf_reassemble(&prev, c, k);
 #endif
 
+out:
+err:
 fsck_err:
+       printbuf_exit(&buf);
+
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error from check_extent(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -1268,6 +1310,7 @@ static int check_extents(struct bch_fs *c)
        struct snapshots_seen s;
        struct btree_trans trans;
        struct btree_iter iter;
+       struct bkey_s_c k;
        int ret = 0;
 
 #if 0
@@ -1280,21 +1323,12 @@ static int check_extents(struct bch_fs *c)
 
        bch_verbose(c, "checking extents");
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            POS(BCACHEFS_ROOT_INO, 0),
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       do {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_LAZY_RW|
-                                     BTREE_INSERT_NOFAIL,
-                       check_extent(&trans, &iter, &w, &s));
-               if (ret)
-                       break;
-       } while (bch2_btree_iter_advance(&iter));
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+                       POS(BCACHEFS_ROOT_INO, 0),
+                       BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                       NULL, NULL,
+                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_extent(&trans, &iter, k, &w, &s));
 #if 0
        bch2_bkey_buf_exit(&prev, c);
 #endif
@@ -1302,6 +1336,8 @@ static int check_extents(struct bch_fs *c)
        bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
 
+       if (ret)
+               bch_err(c, "error from check_extents(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -1309,10 +1345,11 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 {
        struct bch_fs *c = trans->c;
        struct inode_walker_entry *i;
-       int ret = 0, ret2 = 0;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
        s64 count2;
 
-       for (i = w->d; i < w->d + w->nr; i++) {
+       darray_for_each(w->inodes, i) {
                if (i->inode.bi_nlink == i->count)
                        continue;
 
@@ -1335,11 +1372,14 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
                        ret = write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
-                       ret2 = -EINTR;
                }
        }
 fsck_err:
-       return ret ?: ret2;
+       if (ret)
+               bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret));
+       if (!ret && trans_was_restarted(trans, restart_count))
+               ret = -BCH_ERR_transaction_restart_nested;
+       return ret;
 }
 
 static int check_dirent_target(struct btree_trans *trans,
@@ -1351,7 +1391,7 @@ static int check_dirent_target(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bkey_i_dirent *n;
        bool backpointer_exists = true;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        if (!target->bi_dir &&
@@ -1377,15 +1417,13 @@ static int check_dirent_target(struct btree_trans *trans,
                                "directory %llu with multiple links",
                                target->bi_inum)) {
                        ret = __remove_dirent(trans, d.k->p);
-                       if (ret)
-                               goto err;
-                       return 0;
+                       goto out;
                }
 
                if (fsck_err_on(backpointer_exists &&
                                !target->bi_nlink, c,
-                               "inode %llu has multiple links but i_nlink 0",
-                               target->bi_inum)) {
+                               "inode %llu type %s has multiple links but i_nlink 0",
+                               target->bi_inum, bch2_d_types[d.v->d_type])) {
                        target->bi_nlink++;
                        target->bi_flags &= ~BCH_INODE_UNLINKED;
 
@@ -1416,18 +1454,19 @@ static int check_dirent_target(struct btree_trans *trans,
                        "incorrect d_type: got %s, should be %s:\n%s",
                        bch2_d_type_str(d.v->d_type),
                        bch2_d_type_str(inode_d_type(target)),
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
                n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
                ret = PTR_ERR_OR_ZERO(n);
                if (ret)
-                       return ret;
+                       goto err;
 
                bkey_reassemble(&n->k_i, d.s_c);
                n->v.d_type = inode_d_type(target);
 
                ret = bch2_trans_update(trans, iter, &n->k_i, 0);
                if (ret)
-                       return ret;
+                       goto err;
 
                d = dirent_i_to_s_c(n);
        }
@@ -1441,94 +1480,110 @@ static int check_dirent_target(struct btree_trans *trans,
                n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
                ret = PTR_ERR_OR_ZERO(n);
                if (ret)
-                       return ret;
+                       goto err;
 
                bkey_reassemble(&n->k_i, d.s_c);
                n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
 
                ret = bch2_trans_update(trans, iter, &n->k_i, 0);
                if (ret)
-                       return ret;
+                       goto err;
 
                d = dirent_i_to_s_c(n);
        }
+out:
 err:
 fsck_err:
+       printbuf_exit(&buf);
+
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error from check_target(): %s", bch2_err_str(ret));
        return ret;
 }
 
 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+                       struct bkey_s_c k,
                        struct bch_hash_info *hash_info,
                        struct inode_walker *dir,
                        struct inode_walker *target,
                        struct snapshots_seen *s)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
        struct bkey_s_c_dirent d;
        struct inode_walker_entry *i;
-       char buf[200];
-       int ret;
-
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k)
-               return 0;
-
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
+       struct printbuf buf = PRINTBUF;
+       struct bpos equiv;
+       int ret = 0;
 
        ret = check_key_has_snapshot(trans, iter, k);
-       if (ret)
-               return ret < 0 ? ret : 0;
+       if (ret) {
+               ret = ret < 0 ? ret : 0;
+               goto out;
+       }
 
-       ret = snapshots_seen_update(c, s, k.k->p);
+       equiv = k.k->p;
+       equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+       ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
        if (ret)
-               return ret;
+               goto err;
 
        if (k.k->type == KEY_TYPE_whiteout)
-               return 0;
+               goto out;
 
        if (dir->cur_inum != k.k->p.inode) {
                ret = check_subdir_count(trans, dir);
                if (ret)
-                       return ret;
+                       goto err;
        }
 
-       ret = __walk_inode(trans, dir, k.k->p);
+       BUG_ON(!iter->path->should_be_locked);
+
+       ret = __walk_inode(trans, dir, equiv);
        if (ret < 0)
-               return ret;
+               goto err;
 
        if (fsck_err_on(ret == INT_MAX, c,
                        "dirent in nonexisting directory:\n%s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
-               return bch2_btree_delete_at(trans, iter,
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, iter,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               goto out;
+       }
 
-       if (ret == INT_MAX)
-               return 0;
+       if (ret == INT_MAX) {
+               ret = 0;
+               goto out;
+       }
 
-       i = dir->d + ret;
+       i = dir->inodes.data + ret;
        ret = 0;
 
        if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
                        "dirent in non directory inode type %s:\n%s",
                        bch2_d_type_str(inode_d_type(&i->inode)),
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
-               return bch2_btree_delete_at(trans, iter, 0);
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, iter, 0);
+               goto out;
+       }
 
        if (dir->first_this_inode)
-               *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
+               *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
 
        ret = hash_check_key(trans, bch2_dirent_hash_desc,
                             hash_info, iter, k);
        if (ret < 0)
-               return ret;
-       if (ret) /* dirent has been deleted */
-               return 0;
+               goto err;
+       if (ret) {
+               /* dirent has been deleted */
+               ret = 0;
+               goto out;
+       }
 
        if (k.k->type != KEY_TYPE_dirent)
-               return 0;
+               goto out;
 
        d = bkey_s_c_to_dirent(k);
 
@@ -1541,24 +1596,27 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                ret = __subvol_lookup(trans, target_subvol,
                                      &target_snapshot, &target_inum);
                if (ret && ret != -ENOENT)
-                       return ret;
+                       goto err;
 
                if (fsck_err_on(ret, c,
                                "dirent points to missing subvolume %llu",
-                               le64_to_cpu(d.v->d_child_subvol)))
-                       return __remove_dirent(trans, d.k->p);
+                               le64_to_cpu(d.v->d_child_subvol))) {
+                       ret = __remove_dirent(trans, d.k->p);
+                       goto err;
+               }
 
                ret = __lookup_inode(trans, target_inum,
                                   &subvol_root, &target_snapshot);
                if (ret && ret != -ENOENT)
-                       return ret;
+                       goto err;
 
                if (fsck_err_on(ret, c,
                                "subvolume %u points to missing subvolume root %llu",
                                target_subvol,
                                target_inum)) {
                        bch_err(c, "repair not implemented yet");
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
 
                if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
@@ -1568,40 +1626,48 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                        subvol_root.bi_subvol = target_subvol;
                        ret = __write_inode(trans, &subvol_root, target_snapshot);
                        if (ret)
-                               return ret;
+                               goto err;
                }
 
                ret = check_dirent_target(trans, iter, d, &subvol_root,
                                          target_snapshot);
                if (ret)
-                       return ret;
+                       goto err;
        } else {
                ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
                if (ret)
-                       return ret;
+                       goto err;
 
-               if (fsck_err_on(!target->nr, c,
-                               "dirent points to missing inode:\n%s",
-                               (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                                      k), buf))) {
+               if (fsck_err_on(!target->inodes.nr, c,
+                               "dirent points to missing inode: (equiv %u)\n%s",
+                               equiv.snapshot,
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, k),
+                                buf.buf))) {
                        ret = __remove_dirent(trans, d.k->p);
                        if (ret)
-                               return ret;
+                               goto err;
                }
 
-               for (i = target->d; i < target->d + target->nr; i++) {
+               darray_for_each(target->inodes, i) {
                        ret = check_dirent_target(trans, iter, d,
                                                  &i->inode, i->snapshot);
                        if (ret)
-                               return ret;
+                               goto err;
                }
        }
 
        if (d.v->d_type == DT_DIR)
-               for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+               for_each_visible_inode(c, s, dir, equiv.snapshot, i)
                        i->count++;
 
+out:
+err:
 fsck_err:
+       printbuf_exit(&buf);
+
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -1618,6 +1684,7 @@ static int check_dirents(struct bch_fs *c)
        struct bch_hash_info hash_info;
        struct btree_trans trans;
        struct btree_iter iter;
+       struct bkey_s_c k;
        int ret = 0;
 
        bch_verbose(c, "checking dirents");
@@ -1625,46 +1692,32 @@ static int check_dirents(struct bch_fs *c)
        snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
-                            POS(BCACHEFS_ROOT_INO, 0),
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       do {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_LAZY_RW|
-                                     BTREE_INSERT_NOFAIL,
-                       check_dirent(&trans, &iter, &hash_info,
-                                    &dir, &target, &s));
-               if (ret)
-                       break;
-       } while (bch2_btree_iter_advance(&iter));
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+                       POS(BCACHEFS_ROOT_INO, 0),
+                       BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+                       k,
+                       NULL, NULL,
+                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
 
        bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
        inode_walker_exit(&dir);
        inode_walker_exit(&target);
+
+       if (ret)
+               bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret));
        return ret;
 }
 
 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+                      struct bkey_s_c k,
                       struct bch_hash_info *hash_info,
                       struct inode_walker *inode)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
        int ret;
 
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k)
-               return 0;
-
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
        ret = check_key_has_snapshot(trans, iter, k);
        if (ret)
                return ret;
@@ -1684,10 +1737,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
        ret = 0;
 
        if (inode->first_this_inode)
-               *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
+               *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
 
        ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
 fsck_err:
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -1701,30 +1756,25 @@ static int check_xattrs(struct bch_fs *c)
        struct bch_hash_info hash_info;
        struct btree_trans trans;
        struct btree_iter iter;
+       struct bkey_s_c k;
        int ret = 0;
 
        bch_verbose(c, "checking xattrs");
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
-                            POS(BCACHEFS_ROOT_INO, 0),
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       do {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_LAZY_RW|
-                                     BTREE_INSERT_NOFAIL,
-                                     check_xattr(&trans, &iter, &hash_info,
-                                                 &inode));
-               if (ret)
-                       break;
-       } while (bch2_btree_iter_advance(&iter));
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+                       POS(BCACHEFS_ROOT_INO, 0),
+                       BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+                       k,
+                       NULL, NULL,
+                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_xattr(&trans, &iter, k, &hash_info, &inode));
 
        bch2_trans_exit(&trans);
+
+       if (ret)
+               bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -1751,12 +1801,12 @@ static int check_root_trans(struct btree_trans *trans)
                root_subvol.v.flags     = 0;
                root_subvol.v.snapshot  = cpu_to_le32(snapshot);
                root_subvol.v.inode     = cpu_to_le64(inum);
-               ret = __bch2_trans_do(trans, NULL, NULL,
+               ret = commit_do(trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
                                      BTREE_INSERT_LAZY_RW,
                        __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
                if (ret) {
-                       bch_err(c, "error writing root subvol: %i", ret);
+                       bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
                        goto err;
                }
 
@@ -1775,7 +1825,7 @@ static int check_root_trans(struct btree_trans *trans)
 
                ret = __write_inode(trans, &root_inode, snapshot);
                if (ret)
-                       bch_err(c, "error writing root inode: %i", ret);
+                       bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
        }
 err:
 fsck_err:
@@ -1794,21 +1844,18 @@ static int check_root(struct bch_fs *c)
                check_root_trans(&trans));
 }
 
-struct pathbuf {
-       size_t          nr;
-       size_t          size;
-
-       struct pathbuf_entry {
-               u64     inum;
-               u32     snapshot;
-       }               *entries;
+struct pathbuf_entry {
+       u64     inum;
+       u32     snapshot;
 };
 
-static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
 {
        struct pathbuf_entry *i;
 
-       for (i = p->entries; i < p->entries + p->nr; i++)
+       darray_for_each(*p, i)
                if (i->inum     == inum &&
                    i->snapshot == snapshot)
                        return true;
@@ -1816,29 +1863,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
        return false;
 }
 
-static int path_down(struct bch_fs *c, struct pathbuf *p,
+static int path_down(struct bch_fs *c, pathbuf *p,
                     u64 inum, u32 snapshot)
 {
-       if (p->nr == p->size) {
-               size_t new_size = max_t(size_t, 256UL, p->size * 2);
-               void *n = krealloc(p->entries,
-                                  new_size * sizeof(p->entries[0]),
-                                  GFP_KERNEL);
-               if (!n) {
-                       bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
-                               new_size);
-                       return -ENOMEM;
-               }
-
-               p->entries = n;
-               p->size = new_size;
-       };
-
-       p->entries[p->nr++] = (struct pathbuf_entry) {
+       int ret = darray_push(p, ((struct pathbuf_entry) {
                .inum           = inum,
                .snapshot       = snapshot,
-       };
-       return 0;
+       }));
+
+       if (ret)
+               bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+                       p->size);
+       return ret;
 }
 
 /*
@@ -1847,14 +1883,14 @@ static int path_down(struct bch_fs *c, struct pathbuf *p,
  * XXX: we should also be verifying that inodes are in the right subvolumes
  */
 static int check_path(struct btree_trans *trans,
-                     struct pathbuf *p,
+                     pathbuf *p,
                      struct bch_inode_unpacked *inode,
                      u32 snapshot)
 {
        struct bch_fs *c = trans->c;
        int ret = 0;
 
-       snapshot = snapshot_t(c, snapshot)->equiv;
+       snapshot = bch2_snapshot_equiv(c, snapshot);
        p->nr = 0;
 
        while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
@@ -1921,14 +1957,14 @@ static int check_path(struct btree_trans *trans,
                        /* XXX print path */
                        bch_err(c, "directory structure loop");
 
-                       for (i = p->entries; i < p->entries + p->nr; i++)
+                       darray_for_each(*p, i)
                                pr_err("%llu:%u", i->inum, i->snapshot);
                        pr_err("%llu:%u", inode->bi_inum, snapshot);
 
                        if (!fsck_err(c, "directory structure loop"))
                                return 0;
 
-                       ret = __bch2_trans_do(trans, NULL, NULL,
+                       ret = commit_do(trans, NULL, NULL,
                                              BTREE_INSERT_NOFAIL|
                                              BTREE_INSERT_LAZY_RW,
                                        remove_backpointer(trans, inode));
@@ -1942,7 +1978,7 @@ static int check_path(struct btree_trans *trans,
        }
 fsck_err:
        if (ret)
-               bch_err(c, "%s: err %i", __func__, ret);
+               bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
        return ret;
 }
 
@@ -1958,7 +1994,7 @@ static int check_directory_structure(struct bch_fs *c)
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
-       struct pathbuf path = { 0, 0, NULL };
+       pathbuf path = { 0, };
        int ret;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
@@ -1986,9 +2022,7 @@ static int check_directory_structure(struct bch_fs *c)
        }
        bch2_trans_iter_exit(&trans, &iter);
 
-       BUG_ON(ret == -EINTR);
-
-       kfree(path.entries);
+       darray_exit(&path);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -2010,7 +2044,8 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t,
 {
        if (t->nr == t->size) {
                size_t new_size = max_t(size_t, 128UL, t->size * 2);
-               void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
+               void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
+
                if (!d) {
                        bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
                                new_size);
@@ -2139,7 +2174,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               ret = snapshots_seen_update(c, &s, k.k->p);
+               ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
                if (ret)
                        break;
 
@@ -2151,7 +2186,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                            d.v->d_type != DT_SUBVOL)
                                inc_link(c, &s, links, range_start, range_end,
                                         le64_to_cpu(d.v->d_inum),
-                                        d.k->p.snapshot);
+                                        bch2_snapshot_equiv(c, d.k->p.snapshot));
                        break;
                }
        }
@@ -2165,6 +2200,47 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
        return ret;
 }
 
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+                                    struct bkey_s_c k,
+                                    struct nlink_table *links,
+                                    size_t *idx, u64 range_end)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_inode_unpacked u;
+       struct nlink *link = &links->d[*idx];
+       int ret = 0;
+
+       if (k.k->p.offset >= range_end)
+               return 1;
+
+       if (!bkey_is_inode(k.k))
+               return 0;
+
+       BUG_ON(bch2_inode_unpack(k, &u));
+
+       if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+               return 0;
+
+       if (!u.bi_nlink)
+               return 0;
+
+       while ((cmp_int(link->inum, k.k->p.offset) ?:
+               cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+               BUG_ON(*idx == links->nr);
+               link = &links->d[++*idx];
+       }
+
+       if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+                       "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+                       u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+                       bch2_inode_nlink_get(&u), link->count)) {
+               bch2_inode_nlink_set(&u, link->count);
+               ret = __write_inode(trans, &u, k.k->p.snapshot);
+       }
+fsck_err:
+       return ret;
+}
+
 noinline_for_stack
 static int check_nlinks_update_hardlinks(struct bch_fs *c,
                               struct nlink_table *links,
@@ -2173,56 +2249,25 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bch_inode_unpacked u;
-       struct nlink *link = links->d;
+       size_t idx = 0;
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes,
-                          POS(0, range_start),
-                          BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH|
-                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               if (k.k->p.offset >= range_end)
-                       break;
-
-               if (!bkey_is_inode(k.k))
-                       continue;
-
-               BUG_ON(bch2_inode_unpack(k, &u));
-
-               if (S_ISDIR(le16_to_cpu(u.bi_mode)))
-                       continue;
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+                       POS(0, range_start),
+                       BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
 
-               if (!u.bi_nlink)
-                       continue;
-
-               while ((cmp_int(link->inum, k.k->p.offset) ?:
-                       cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
-                       link++;
-                       BUG_ON(link >= links->d + links->nr);
-               }
-
-               if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
-                               "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
-                               u.bi_inum, mode_to_type(u.bi_mode),
-                               bch2_inode_nlink_get(&u), link->count)) {
-                       bch2_inode_nlink_set(&u, link->count);
-
-                       ret = write_inode(&trans, &u, k.k->p.snapshot);
-                       if (ret)
-                               bch_err(c, "error in fsck: error %i updating inode", ret);
-               }
-       }
-fsck_err:
-       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
-       if (ret)
+       if (ret < 0) {
                bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+               return ret;
+       }
 
-       return ret;
+       return 0;
 }
 
 noinline_for_stack
@@ -2262,21 +2307,13 @@ static int check_nlinks(struct bch_fs *c)
        return ret;
 }
 
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+                            struct bkey_s_c k)
 {
-       struct bkey_s_c k;
        struct bkey_s_c_reflink_p p;
        struct bkey_i_reflink_p *u;
        int ret;
 
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k)
-               return 0;
-
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
        if (k.k->type != KEY_TYPE_reflink_p)
                return 0;
 
@@ -2312,20 +2349,11 @@ static int fix_reflink_p(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
-                          BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH|
-                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               if (k.k->type == KEY_TYPE_reflink_p) {
-                       ret = __bch2_trans_do(&trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL|
-                                             BTREE_INSERT_LAZY_RW,
-                                             fix_reflink_p_key(&trans, &iter));
-                       if (ret)
-                               break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_extents, POS_MIN,
+                       BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+               fix_reflink_p_key(&trans, &iter, k));
 
        bch2_trans_exit(&trans);
        return ret;
@@ -2337,9 +2365,12 @@ static int fix_reflink_p(struct bch_fs *c)
  */
 int bch2_fsck_full(struct bch_fs *c)
 {
-       return  bch2_fs_snapshots_check(c) ?:
+       int ret;
+again:
+       ret =   bch2_fs_check_snapshots(c) ?:
+               bch2_fs_check_subvols(c) ?:
+               bch2_delete_dead_snapshots(c) ?:
                check_inodes(c, true) ?:
-               check_subvols(c) ?:
                check_extents(c) ?:
                check_dirents(c) ?:
                check_xattrs(c) ?:
@@ -2347,9 +2378,19 @@ int bch2_fsck_full(struct bch_fs *c)
                check_directory_structure(c) ?:
                check_nlinks(c) ?:
                fix_reflink_p(c);
+
+       if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) {
+               set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+               goto again;
+       }
+
+       return ret;
 }
 
 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
 {
-       return check_inodes(c, false);
+       return  bch2_fs_check_snapshots(c) ?:
+               bch2_fs_check_subvols(c) ?:
+               bch2_delete_dead_snapshots(c) ?:
+               check_inodes(c, false);
 }
index 78e2db6c938b8791aa1c3b52144a156c8973f616..1a0d2608c058662d1f8d1238f5093aa22fd7ce11 100644 (file)
@@ -60,11 +60,10 @@ static int inode_decode_field(const u8 *in, const u8 *end,
        return bytes;
 }
 
-void bch2_inode_pack(struct bch_fs *c,
-                    struct bkey_inode_buf *packed,
-                    const struct bch_inode_unpacked *inode)
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+                                          const struct bch_inode_unpacked *inode)
 {
-       struct bkey_i_inode_v2 *k = &packed->inode;
+       struct bkey_i_inode_v3 *k = &packed->inode;
        u8 *out = k->v.fields;
        u8 *end = (void *) &packed[1];
        u8 *last_nonzero_field = out;
@@ -72,13 +71,17 @@ void bch2_inode_pack(struct bch_fs *c,
        unsigned bytes;
        int ret;
 
-       bkey_inode_v2_init(&packed->inode.k_i);
+       bkey_inode_v3_init(&packed->inode.k_i);
        packed->inode.k.p.offset        = inode->bi_inum;
        packed->inode.v.bi_journal_seq  = cpu_to_le64(inode->bi_journal_seq);
        packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
        packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
-       packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
-       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
+       packed->inode.v.bi_sectors      = cpu_to_le64(inode->bi_sectors);
+       packed->inode.v.bi_size         = cpu_to_le64(inode->bi_size);
+       packed->inode.v.bi_version      = cpu_to_le64(inode->bi_version);
+       SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+       SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
 
 #define x(_name, _bits)                                                        \
        nr_fields++;                                                    \
@@ -99,7 +102,7 @@ void bch2_inode_pack(struct bch_fs *c,
                        *out++ = 0;                                     \
        }
 
-       BCH_INODE_FIELDS()
+       BCH_INODE_FIELDS_v3()
 #undef  x
        BUG_ON(out > end);
 
@@ -110,7 +113,7 @@ void bch2_inode_pack(struct bch_fs *c,
        set_bkey_val_bytes(&packed->inode.k, bytes);
        memset_u64s_tail(&packed->inode.v, 0, bytes);
 
-       SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
+       SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
@@ -120,16 +123,25 @@ void bch2_inode_pack(struct bch_fs *c,
                BUG_ON(ret);
                BUG_ON(unpacked.bi_inum         != inode->bi_inum);
                BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
+               BUG_ON(unpacked.bi_sectors      != inode->bi_sectors);
+               BUG_ON(unpacked.bi_size         != inode->bi_size);
+               BUG_ON(unpacked.bi_version      != inode->bi_version);
                BUG_ON(unpacked.bi_mode         != inode->bi_mode);
 
 #define x(_name, _bits)        if (unpacked._name != inode->_name)             \
                        panic("unpacked %llu should be %llu",           \
                              (u64) unpacked._name, (u64) inode->_name);
-               BCH_INODE_FIELDS()
+               BCH_INODE_FIELDS_v3()
 #undef  x
        }
 }
 
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+                    const struct bch_inode_unpacked *inode)
+{
+       bch2_inode_pack_inlined(packed, inode);
+}
+
 static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
                                struct bch_inode_unpacked *unpacked)
 {
@@ -141,9 +153,9 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
 
 #define x(_name, _bits)                                        \
        if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
-               memset(&unpacked->_name, 0,                             \
-                      sizeof(*unpacked) -                              \
-                      offsetof(struct bch_inode_unpacked, _name));     \
+               unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+               memset((void *) unpacked + offset, 0,                   \
+                      sizeof(*unpacked) - offset);                     \
                return 0;                                               \
        }                                                               \
                                                                        \
@@ -157,7 +169,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
        unpacked->_name = field[1];                                     \
        in += ret;
 
-       BCH_INODE_FIELDS()
+       BCH_INODE_FIELDS_v2()
 #undef  x
 
        /* XXX: signal if there were more fields than expected? */
@@ -196,15 +208,66 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
                return -1;                                              \
        fieldnr++;
 
-       BCH_INODE_FIELDS()
+       BCH_INODE_FIELDS_v2()
 #undef  x
 
        /* XXX: signal if there were more fields than expected? */
        return 0;
 }
 
-int bch2_inode_unpack(struct bkey_s_c k,
-                     struct bch_inode_unpacked *unpacked)
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+                               struct bch_inode_unpacked *unpacked)
+{
+       struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+       const u8 *in = inode.v->fields;
+       const u8 *end = bkey_val_end(inode);
+       unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v[2];
+
+       unpacked->bi_inum       = inode.k->p.offset;
+       unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+       unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
+       unpacked->bi_sectors    = le64_to_cpu(inode.v->bi_sectors);
+       unpacked->bi_size       = le64_to_cpu(inode.v->bi_size);
+       unpacked->bi_version    = le64_to_cpu(inode.v->bi_version);
+       unpacked->bi_mode       = INODEv3_MODE(inode.v);
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < nr_fields) {                                      \
+               ret = bch2_varint_decode_fast(in, end, &v[0]);          \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+                                                                       \
+               if (_bits > 64) {                                       \
+                       ret = bch2_varint_decode_fast(in, end, &v[1]);  \
+                       if (ret < 0)                                    \
+                               return ret;                             \
+                       in += ret;                                      \
+               } else {                                                \
+                       v[1] = 0;                                       \
+               }                                                       \
+       } else {                                                        \
+               v[0] = v[1] = 0;                                        \
+       }                                                               \
+                                                                       \
+       unpacked->_name = v[0];                                         \
+       if (v[1] || v[0] != unpacked->_name)                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_INODE_FIELDS_v3()
+#undef  x
+
+       /* XXX: signal if there were more fields than expected? */
+       return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+                                              struct bch_inode_unpacked *unpacked)
 {
        switch (k.k->type) {
        case KEY_TYPE_inode: {
@@ -243,6 +306,14 @@ int bch2_inode_unpack(struct bkey_s_c k,
        }
 }
 
+int bch2_inode_unpack(struct bkey_s_c k,
+                     struct bch_inode_unpacked *unpacked)
+{
+       if (likely(k.k->type == KEY_TYPE_inode_v3))
+               return bch2_inode_unpack_v3(k, unpacked);
+       return bch2_inode_unpack_slowpath(k, unpacked);
+}
+
 int bch2_inode_peek(struct btree_trans *trans,
                    struct btree_iter *iter,
                    struct bch_inode_unpacked *inode,
@@ -288,124 +359,192 @@ int bch2_inode_write(struct btree_trans *trans,
        if (IS_ERR(inode_p))
                return PTR_ERR(inode_p);
 
-       bch2_inode_pack(trans->c, inode_p, inode);
+       bch2_inode_pack_inlined(inode_p, inode);
        inode_p->inode.k.p.snapshot = iter->snapshot;
        return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
 }
 
-const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
+struct bkey_s_c bch2_inode_to_v3(struct btree_trans *trans, struct bkey_s_c k)
 {
-       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-       struct bch_inode_unpacked unpacked;
+       struct bch_inode_unpacked u;
+       struct bkey_inode_buf *inode_p;
+       int ret;
+
+       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+       if (IS_ERR(inode_p))
+               return bkey_s_c_err(PTR_ERR(inode_p));
+
+       ret = bch2_inode_unpack(k, &u);
+       if (ret)
+               return bkey_s_c_err(ret);
 
-       if (k.k->p.inode)
-               return "nonzero k.p.inode";
+       bch2_inode_pack(inode_p, &u);
+       return bkey_i_to_s_c(&inode_p->inode.k_i);
+}
 
-       if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
-               return "incorrect value size";
+static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+{
+       struct bch_inode_unpacked unpacked;
 
-       if (k.k->p.offset < BLOCKDEV_INODE_MAX)
-               return "fs inode in blockdev range";
+       if (k.k->p.inode) {
+               prt_printf(err, "nonzero k.p.inode");
+               return -EINVAL;
+       }
 
-       if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
-               return "invalid str hash type";
+       if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
+               prt_printf(err, "fs inode in blockdev range");
+               return -EINVAL;
+       }
 
-       if (bch2_inode_unpack(k, &unpacked))
-               return "invalid variable length fields";
+       if (bch2_inode_unpack(k, &unpacked)) {
+               prt_printf(err, "invalid variable length fields");
+               return -EINVAL;
+       }
 
-       if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
-               return "invalid data checksum type";
+       if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
+               prt_printf(err, "invalid data checksum type (%u >= %u",
+                       unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+               return -EINVAL;
+       }
 
-       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
-               return "invalid data checksum type";
+       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
+               prt_printf(err, "invalid data checksum type (%u >= %u)",
+                      unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
+               return -EINVAL;
+       }
 
        if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
-           unpacked.bi_nlink != 0)
-               return "flagged as unlinked but bi_nlink != 0";
+           unpacked.bi_nlink != 0) {
+               prt_printf(err, "flagged as unlinked but bi_nlink != 0");
+               return -EINVAL;
+       }
 
-       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
-               return "subvolume root but not a directory";
+       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
+               prt_printf(err, "subvolume root but not a directory");
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
-const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                      int rw, struct printbuf *err)
 {
-       struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-       struct bch_inode_unpacked unpacked;
+       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 
-       if (k.k->p.inode)
-               return "nonzero k.p.inode";
+       if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*inode.v));
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
-               return "incorrect value size";
+       if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+               prt_printf(err, "invalid str hash type (%llu >= %u)",
+                      INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+               return -EINVAL;
+       }
 
-       if (k.k->p.offset < BLOCKDEV_INODE_MAX)
-               return "fs inode in blockdev range";
+       return __bch2_inode_invalid(k, err);
+}
 
-       if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
-               return "invalid str hash type";
+int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
+{
+       struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
 
-       if (bch2_inode_unpack(k, &unpacked))
-               return "invalid variable length fields";
+       if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*inode.v));
+               return -EINVAL;
+       }
 
-       if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
-               return "invalid data checksum type";
+       if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+               prt_printf(err, "invalid str hash type (%llu >= %u)",
+                      INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+               return -EINVAL;
+       }
 
-       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
-               return "invalid data checksum type";
+       return __bch2_inode_invalid(k, err);
+}
 
-       if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
-           unpacked.bi_nlink != 0)
-               return "flagged as unlinked but bi_nlink != 0";
+int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
+{
+       struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+
+       if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*inode.v));
+               return -EINVAL;
+       }
+
+       if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+           INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
+               prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
+                      INODEv3_FIELDS_START(inode.v),
+                      INODEv3_FIELDS_START_INITIAL,
+                      bkey_val_u64s(inode.k));
+               return -EINVAL;
+       }
 
-       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
-               return "subvolume root but not a directory";
+       if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+               prt_printf(err, "invalid str hash type (%llu >= %u)",
+                      INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+               return -EINVAL;
+       }
 
-       return NULL;
+       return __bch2_inode_invalid(k, err);
 }
 
-static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+                                         struct bch_inode_unpacked *inode)
 {
-       pr_buf(out, "mode %o flags %x journal_seq %llu",
+       prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
               inode->bi_mode, inode->bi_flags,
-              inode->bi_journal_seq);
+              inode->bi_journal_seq,
+              inode->bi_size,
+              inode->bi_sectors,
+              inode->bi_version);
 
 #define x(_name, _bits)                                                \
-       pr_buf(out, " "#_name " %llu", (u64) inode->_name);
-       BCH_INODE_FIELDS()
+       prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+       BCH_INODE_FIELDS_v3()
 #undef  x
 }
 
 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
 {
-       pr_buf(out, "inum: %llu ", inode->bi_inum);
+       prt_printf(out, "inum: %llu ", inode->bi_inum);
        __bch2_inode_unpacked_to_text(out, inode);
 }
 
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
-                      struct bkey_s_c k)
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 {
        struct bch_inode_unpacked inode;
 
        if (bch2_inode_unpack(k, &inode)) {
-               pr_buf(out, "(unpack error)");
+               prt_printf(out, "(unpack error)");
                return;
        }
 
        __bch2_inode_unpacked_to_text(out, &inode);
 }
 
-const char *bch2_inode_generation_invalid(const struct bch_fs *c,
-                                         struct bkey_s_c k)
+int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                                 int rw, struct printbuf *err)
 {
-       if (k.k->p.inode)
-               return "nonzero k.p.inode";
+       if (k.k->p.inode) {
+               prt_printf(err, "nonzero k.p.inode");
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
-               return "incorrect value size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_inode_generation));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -413,7 +552,7 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
 
-       pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+       prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
 }
 
 void bch2_inode_init_early(struct bch_fs *c,
@@ -549,7 +688,7 @@ again:
        }
 
        if (!ret && start == min)
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_inode_create;
 
        if (ret) {
                bch2_trans_iter_exit(trans, iter);
@@ -606,12 +745,12 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
 
                bch2_btree_iter_set_snapshot(&iter, snapshot);
 
-               k = bch2_btree_iter_peek(&iter);
+               k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
                ret = bkey_err(k);
                if (ret)
                        goto err;
 
-               if (!k.k || iter.pos.inode != inum.inum)
+               if (!k.k)
                        break;
 
                bkey_init(&delete.k);
@@ -621,7 +760,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
                      bch2_trans_commit(trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
 err:
-               if (ret && ret != -EINTR)
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        break;
        }
 
@@ -692,7 +831,7 @@ retry:
                                BTREE_INSERT_NOFAIL);
 err:
        bch2_trans_iter_exit(&trans, &iter);
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
@@ -718,3 +857,36 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
        return bch2_trans_do(c, NULL, NULL, 0,
                bch2_inode_find_by_inum_trans(&trans, inum, inode));
 }
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+       if (bi->bi_flags & BCH_INODE_UNLINKED)
+               bi->bi_flags &= ~BCH_INODE_UNLINKED;
+       else {
+               if (bi->bi_nlink == U32_MAX)
+                       return -EINVAL;
+
+               bi->bi_nlink++;
+       }
+
+       return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+       if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+               bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+                                       bi->bi_inum);
+               return;
+       }
+
+       if (bi->bi_flags & BCH_INODE_UNLINKED) {
+               bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+               return;
+       }
+
+       if (bi->bi_nlink)
+               bi->bi_nlink--;
+       else
+               bi->bi_flags |= BCH_INODE_UNLINKED;
+}
index 77957cc7f9dda3eac49a9bd435969c72184c6545..2915f4f96f4bb3c2fc8f2190ffad7fe86a8d2408 100644 (file)
@@ -2,34 +2,47 @@
 #ifndef _BCACHEFS_INODE_H
 #define _BCACHEFS_INODE_H
 
+#include "bkey.h"
 #include "opts.h"
 
 extern const char * const bch2_inode_opts[];
 
-const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_inode (struct bkey_ops) {                \
        .key_invalid    = bch2_inode_invalid,           \
        .val_to_text    = bch2_inode_to_text,           \
+       .trans_trigger  = bch2_trans_mark_inode,        \
+       .atomic_trigger = bch2_mark_inode,              \
 }
 
 #define bch2_bkey_ops_inode_v2 (struct bkey_ops) {     \
        .key_invalid    = bch2_inode_v2_invalid,        \
        .val_to_text    = bch2_inode_to_text,           \
+       .trans_trigger  = bch2_trans_mark_inode,        \
+       .atomic_trigger = bch2_mark_inode,              \
+}
+
+#define bch2_bkey_ops_inode_v3 (struct bkey_ops) {     \
+       .key_invalid    = bch2_inode_v3_invalid,        \
+       .val_to_text    = bch2_inode_to_text,           \
+       .trans_trigger  = bch2_trans_mark_inode,        \
+       .atomic_trigger = bch2_mark_inode,              \
 }
 
 static inline bool bkey_is_inode(const struct bkey *k)
 {
        return  k->type == KEY_TYPE_inode ||
-               k->type == KEY_TYPE_inode_v2;
+               k->type == KEY_TYPE_inode_v2 ||
+               k->type == KEY_TYPE_inode_v3;
 }
 
-const char *bch2_inode_generation_invalid(const struct bch_fs *,
-                                         struct bkey_s_c);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
-                                  struct bkey_s_c);
+int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+                                 int, struct printbuf *);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_inode_generation (struct bkey_ops) {     \
        .key_invalid    = bch2_inode_generation_invalid,        \
@@ -48,25 +61,28 @@ struct bch_inode_unpacked {
        u64                     bi_inum;
        u64                     bi_journal_seq;
        __le64                  bi_hash_seed;
+       u64                     bi_size;
+       u64                     bi_sectors;
+       u64                     bi_version;
        u32                     bi_flags;
        u16                     bi_mode;
 
 #define x(_name, _bits)        u##_bits _name;
-       BCH_INODE_FIELDS()
+       BCH_INODE_FIELDS_v3()
 #undef  x
 };
 
 struct bkey_inode_buf {
-       struct bkey_i_inode_v2  inode;
+       struct bkey_i_inode_v3  inode;
 
 #define x(_name, _bits)                + 8 + _bits / 8
-       u8              _pad[0 + BCH_INODE_FIELDS()];
+       u8              _pad[0 + BCH_INODE_FIELDS_v3()];
 #undef  x
 } __attribute__((packed, aligned(8)));
 
-void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
-                    const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
 int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_s_c bch2_inode_to_v3(struct btree_trans *, struct bkey_s_c);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
@@ -161,23 +177,6 @@ static inline unsigned nlink_bias(umode_t mode)
        return S_ISDIR(mode) ? 2 : 1;
 }
 
-static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
-       if (bi->bi_flags & BCH_INODE_UNLINKED)
-               bi->bi_flags &= ~BCH_INODE_UNLINKED;
-       else
-               bi->bi_nlink++;
-}
-
-static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
-{
-       BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
-       if (bi->bi_nlink)
-               bi->bi_nlink--;
-       else
-               bi->bi_flags |= BCH_INODE_UNLINKED;
-}
-
 static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
 {
        return bi->bi_flags & BCH_INODE_UNLINKED
@@ -197,4 +196,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
        }
 }
 
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
 #endif /* _BCACHEFS_INODE_H */
index 10f8b3aedc3cf4f580de2a89cc314ade08047539..5971569e31336043ce4bd69dda9b8028b703b6df 100644 (file)
@@ -242,8 +242,7 @@ int bch2_extent_update(struct btree_trans *trans,
                       s64 *i_sectors_delta_total,
                       bool check_enospc)
 {
-       struct btree_iter inode_iter;
-       struct bch_inode_unpacked inode_u;
+       struct btree_iter inode_iter = { NULL };
        struct bpos next_pos;
        bool usage_increasing;
        s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@@ -283,36 +282,71 @@ int bch2_extent_update(struct btree_trans *trans,
                        return ret;
        }
 
-       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
-                             BTREE_ITER_INTENT);
-       if (ret)
-               return ret;
+       if (new_i_size || i_sectors_delta) {
+               struct bkey_s_c k;
+               struct bkey_s_c_inode_v3 inode;
+               struct bkey_i_inode_v3 *new_inode;
+               bool i_size_update;
+
+               bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
+                                    SPOS(0, inum.inum, iter->snapshot),
+                                    BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+               k = bch2_btree_iter_peek_slot(&inode_iter);
+               ret = bkey_err(k);
+               if (unlikely(ret))
+                       goto err;
+
+               ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
+               if (unlikely(ret))
+                       goto err;
+
+               if (unlikely(k.k->type != KEY_TYPE_inode_v3)) {
+                       k = bch2_inode_to_v3(trans, k);
+                       ret = bkey_err(k);
+                       if (unlikely(ret))
+                               goto err;
+               }
+
+               inode = bkey_s_c_to_inode_v3(k);
+               i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+                       new_i_size > le64_to_cpu(inode.v->bi_size);
 
-       if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-           new_i_size > inode_u.bi_size)
-               inode_u.bi_size = new_i_size;
+               if (!i_sectors_delta && !i_size_update)
+                       goto no_inode_update;
 
-       inode_u.bi_sectors += i_sectors_delta;
+               new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+               ret = PTR_ERR_OR_ZERO(new_inode);
+               if (unlikely(ret))
+                       goto err;
 
+               bkey_reassemble(&new_inode->k_i, k);
+
+               if (i_size_update)
+                       new_inode->v.bi_size = cpu_to_le64(new_i_size);
+
+               le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
+               ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0);
+               if (unlikely(ret))
+                       goto err;
+       }
+no_inode_update:
        ret =   bch2_trans_update(trans, iter, k, 0) ?:
-               bch2_inode_write(trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(trans, disk_res, journal_seq,
                                BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL);
-       bch2_trans_iter_exit(trans, &inode_iter);
-
-       if (ret)
-               return ret;
+       if (unlikely(ret))
+               goto err;
 
        if (i_sectors_delta_total)
                *i_sectors_delta_total += i_sectors_delta;
        bch2_btree_iter_set_pos(iter, next_pos);
-
-       return 0;
+err:
+       bch2_trans_iter_exit(trans, &inode_iter);
+       return ret;
 }
 
 /*
- * Returns -EINTR if we had to drop locks:
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
  */
 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
                   subvol_inum inum, u64 end,
@@ -325,7 +359,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
        int ret = 0, ret2 = 0;
        u32 snapshot;
 
-       while (!ret || ret == -EINTR) {
+       while (!ret ||
+              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
                struct bkey_i delete;
@@ -384,14 +419,16 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
        bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
-       return ret == -EINTR ? 0 : ret;
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               ret = 0;
+
+       return ret;
 }
 
 int bch2_write_index_default(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
        struct bkey_buf sk;
-       struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
@@ -415,7 +452,7 @@ int bch2_write_index_default(struct bch_write_op *op)
 
                ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
                                                  &sk.k->k.p.snapshot);
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;
@@ -430,14 +467,11 @@ int bch2_write_index_default(struct bch_write_op *op)
                                         op->flags & BCH_WRITE_CHECK_ENOSPC);
                bch2_trans_iter_exit(&trans, &iter);
 
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;
 
-               if (ec_ob)
-                       bch2_ob_add_backpointer(c, ec_ob, &sk.k->k);
-
                if (bkey_cmp(iter.pos, k->k.p) >= 0)
                        bch2_keylist_pop_front(&op->insert_keys);
                else
@@ -470,8 +504,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                ca = bch_dev_bkey_exists(c, ptr->dev);
 
                if (to_entry(ptr + 1) < ptrs.end) {
-                       n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
-                                                  &ca->replica_set));
+                       n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+                                               GFP_NOIO, &ca->replica_set));
 
                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
@@ -531,17 +565,11 @@ static void bch2_write_done(struct closure *cl)
        }
 }
 
-/**
- * bch_write_index - after a write, update index to point to new data
- */
-static void __bch2_write_index(struct bch_write_op *op)
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 {
-       struct bch_fs *c = op->c;
        struct keylist *keys = &op->insert_keys;
        struct bch_extent_ptr *ptr;
-       struct bkey_i *src, *dst = keys->keys, *n, *k;
-       unsigned dev;
-       int ret;
+       struct bkey_i *src, *dst = keys->keys, *n;
 
        for (src = keys->keys; src != keys->top; src = n) {
                n = bkey_next(src);
@@ -550,10 +578,8 @@ static void __bch2_write_index(struct bch_write_op *op)
                        bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
                                            test_bit(ptr->dev, op->failed.d));
 
-                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
-                               ret = -EIO;
-                               goto err;
-                       }
+                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+                               return -EIO;
                }
 
                if (dst != src)
@@ -562,6 +588,25 @@ static void __bch2_write_index(struct bch_write_op *op)
        }
 
        keys->top = dst;
+       return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct keylist *keys = &op->insert_keys;
+       struct bkey_i *k;
+       unsigned dev;
+       int ret;
+
+       if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+               ret = bch2_write_drop_io_error_ptrs(op);
+               if (ret)
+                       goto err;
+       }
 
        /*
         * probably not the ideal place to hook this in, but I don't
@@ -580,14 +625,14 @@ static void __bch2_write_index(struct bch_write_op *op)
                u64 sectors_start = keylist_sectors(keys);
                int ret = op->index_update_fn(op);
 
-               BUG_ON(ret == -EINTR);
+               BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
                BUG_ON(keylist_sectors(keys) && !ret);
 
                op->written += sectors_start - keylist_sectors(keys);
 
                if (ret) {
                        bch_err_inum_ratelimited(c, op->pos.inode,
-                               "write error %i from btree update", ret);
+                               "write error while doing btree update: %s", bch2_err_str(ret));
                        op->error = ret;
                }
        }
@@ -636,8 +681,10 @@ static void bch2_write_endio(struct bio *bio)
                                    op->pos.inode,
                                    op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
                                    "data write error: %s",
-                              bch2_blk_status_to_str(bio->bi_status)))
+                                   bch2_blk_status_to_str(bio->bi_status))) {
                set_bit(wbio->dev, op->failed.d);
+               op->flags |= BCH_WRITE_IO_ERROR;
+       }
 
        if (wbio->have_ioref) {
                bch2_latency_acct(ca, wbio->submit_time, WRITE);
@@ -701,7 +748,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
 
        pages = min(pages, BIO_MAX_VECS);
 
-       bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
+       bio = bio_alloc_bioset(NULL, pages, 0,
+                              GFP_NOIO, &c->bio_write);
        wbio                    = wbio_init(bio);
        wbio->put_bio           = true;
        /* copy WRITE_SYNC flag */
@@ -764,6 +812,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
        struct bch_fs *c = op->c;
        struct nonce nonce = extent_nonce(op->version, op->crc);
        struct bch_csum csum;
+       int ret;
 
        if (!bch2_csum_type_is_encryption(op->crc.csum_type))
                return 0;
@@ -778,10 +827,10 @@ static int bch2_write_decrypt(struct bch_write_op *op)
        if (bch2_crc_cmp(op->crc.csum, csum))
                return -EIO;
 
-       bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+       ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
        op->crc.csum_type = 0;
        op->crc.csum = (struct bch_csum) { 0, 0 };
-       return 0;
+       return ret;
 }
 
 static enum prep_encoded_ret {
@@ -911,8 +960,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
        saved_iter = dst->bi_iter;
 
        do {
-               struct bch_extent_crc_unpacked crc =
-                       (struct bch_extent_crc_unpacked) { 0 };
+               struct bch_extent_crc_unpacked crc = { 0 };
                struct bversion version = op->version;
                size_t dst_len, src_len;
 
@@ -964,6 +1012,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                    !crc_is_compressed(crc) &&
                    bch2_csum_type_is_encryption(op->crc.csum_type) ==
                    bch2_csum_type_is_encryption(op->csum_type)) {
+                       u8 compression_type = crc.compression_type;
+                       u16 nonce = crc.nonce;
                        /*
                         * Note: when we're using rechecksum(), we need to be
                         * checksumming @src because it has all the data our
@@ -982,6 +1032,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                                        bio_sectors(src) - (src_len >> 9),
                                        op->csum_type))
                                goto csum_err;
+                       /*
+                        * rchecksum_bio sets compression_type on crc from op->crc,
+                        * this isn't always correct as sometimes we're changing
+                        * an extent from uncompressed to incompressible.
+                        */
+                       crc.compression_type = compression_type;
+                       crc.nonce = nonce;
                } else {
                        if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
                            bch2_rechecksum_bio(c, src, version, op->crc,
@@ -996,8 +1053,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                        crc.live_size           = src_len >> 9;
 
                        swap(dst->bi_iter.bi_size, dst_len);
-                       bch2_encrypt_bio(c, op->csum_type,
-                                        extent_nonce(version, crc), dst);
+                       ret = bch2_encrypt_bio(c, op->csum_type,
+                                              extent_nonce(version, crc), dst);
+                       if (ret)
+                               goto err;
+
                        crc.csum = bch2_checksum_bio(c, op->csum_type,
                                         extent_nonce(version, crc), dst);
                        crc.csum_type = op->csum_type;
@@ -1038,8 +1098,7 @@ do_write:
        *_dst = dst;
        return more;
 csum_err:
-       bch_err(c, "error verifying existing checksum while "
-               "rewriting existing data (memory corruption?)");
+       bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
        ret = -EIO;
 err:
        if (to_wbio(dst)->bounce)
@@ -1055,7 +1114,7 @@ static void __bch2_write(struct closure *cl)
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
        struct write_point *wp;
-       struct bio *bio;
+       struct bio *bio = NULL;
        bool skip_put = true;
        unsigned nofs_flags;
        int ret;
@@ -1080,12 +1139,6 @@ again:
                                        BKEY_EXTENT_U64s_MAX))
                        goto flush_io;
 
-               if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
-                   percpu_ref_is_dying(&c->writes)) {
-                       ret = -EROFS;
-                       goto err;
-               }
-
                /*
                 * The copygc thread is now global, which means it's no longer
                 * freeing up space on specific disks, which means that
@@ -1104,8 +1157,8 @@ again:
                                      BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
                EBUG_ON(!wp);
 
-               if (unlikely(IS_ERR(wp))) {
-                       if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+               if (IS_ERR(wp)) {
+                       if (unlikely(wp != ERR_PTR(-EAGAIN))) {
                                ret = PTR_ERR(wp);
                                goto err;
                        }
@@ -1279,11 +1332,12 @@ void bch2_write(struct closure *cl)
        }
 
        if (c->opts.nochanges ||
-           !percpu_ref_tryget(&c->writes)) {
+           !percpu_ref_tryget_live(&c->writes)) {
                op->error = -EROFS;
                goto err;
        }
 
+       this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
        bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
        data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -1319,7 +1373,7 @@ struct promote_op {
        struct rhash_head       hash;
        struct bpos             pos;
 
-       struct migrate_write    write;
+       struct data_update      write;
        struct bio_vec          bi_inline_vecs[0]; /* must be last */
 };
 
@@ -1375,17 +1429,16 @@ static void promote_done(struct closure *cl)
        bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
                               op->start_time);
 
-       bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+       bch2_data_update_exit(&op->write);
        promote_free(c, op);
 }
 
 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 {
-       struct bch_fs *c = rbio->c;
        struct closure *cl = &op->cl;
        struct bio *bio = &op->write.op.wbio.bio;
 
-       trace_promote(&rbio->bio);
+       trace_and_count(op->write.op.c, read_promote, &rbio->bio);
 
        /* we now own pages: */
        BUG_ON(!rbio->bounce);
@@ -1395,10 +1448,8 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
               sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
        swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
 
-       bch2_migrate_read_done(&op->write, rbio);
-
        closure_init(cl, NULL);
-       closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
+       bch2_data_update_read_done(&op->write, rbio->pick.crc, cl);
        closure_return_with_destructor(cl, promote_done);
 }
 
@@ -1416,7 +1467,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
        unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        int ret;
 
-       if (!percpu_ref_tryget(&c->writes))
+       if (!percpu_ref_tryget_live(&c->writes))
                return NULL;
 
        op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
@@ -1437,7 +1488,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                goto err;
 
        rbio_init(&(*rbio)->bio, opts);
-       bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
+       bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
 
        if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
                                 GFP_NOIO))
@@ -1452,15 +1503,15 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                goto err;
 
        bio = &op->write.op.wbio.bio;
-       bio_init(bio, bio->bi_inline_vecs, pages);
+       bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
 
-       ret = bch2_migrate_write_init(c, &op->write,
+       ret = bch2_data_update_init(c, &op->write,
                        writepoint_hashed((unsigned long) current),
                        opts,
-                       DATA_PROMOTE,
-                       (struct data_opts) {
+                       (struct data_update_opts) {
                                .target         = opts.promote_target,
-                               .nr_replicas    = 1,
+                               .extra_replicas = 1,
+                               .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
                        },
                        btree_id, k);
        BUG_ON(ret);
@@ -1653,7 +1704,7 @@ static void bch2_rbio_retry(struct work_struct *work)
        };
        struct bch_io_failures failed = { .nr = 0 };
 
-       trace_read_retry(&rbio->bio);
+       trace_and_count(c, read_retry, &rbio->bio);
 
        if (rbio->retry == READ_RETRY_AVOID)
                bch2_mark_io_failure(&failed, &rbio->pick);
@@ -1772,6 +1823,7 @@ static void __bch2_read_endio(struct work_struct *work)
        struct nonce nonce = extent_nonce(rbio->version, crc);
        unsigned nofs_flags;
        struct bch_csum csum;
+       int ret;
 
        nofs_flags = memalloc_nofs_save();
 
@@ -1806,7 +1858,10 @@ static void __bch2_read_endio(struct work_struct *work)
        crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
 
        if (crc_is_compressed(crc)) {
-               bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
                if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
                        goto decompression_err;
        } else {
@@ -1817,7 +1872,9 @@ static void __bch2_read_endio(struct work_struct *work)
                BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
                src->bi_iter.bi_size = dst_iter.bi_size;
 
-               bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
 
                if (rbio->bounce) {
                        struct bvec_iter src_iter = src->bi_iter;
@@ -1830,7 +1887,10 @@ static void __bch2_read_endio(struct work_struct *work)
                 * Re-encrypt data we decrypted, so it's consistent with
                 * rbio->crc:
                 */
-               bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
                promote_start(rbio->promote, rbio);
                rbio->promote = NULL;
        }
@@ -1855,9 +1915,9 @@ csum_err:
        }
 
        bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
-               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
+               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
                rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
-               csum.hi, csum.lo, crc.csum_type);
+               csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
        bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
        goto out;
 decompression_err:
@@ -1865,6 +1925,11 @@ decompression_err:
                                 "decompression error");
        bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
        goto out;
+decrypt_err:
+       bch_err_inum_ratelimited(c, rbio->read_pos.inode,
+                                "decrypt error");
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+       goto out;
 }
 
 static void bch2_read_endio(struct bio *bio)
@@ -1895,7 +1960,7 @@ static void bch2_read_endio(struct bio *bio)
 
        if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
            ptr_stale(ca, &rbio->pick.ptr)) {
-               atomic_long_inc(&c->read_realloc_races);
+               trace_and_count(c, read_reuse_race, &rbio->bio);
 
                if (rbio->flags & BCH_READ_RETRY_IF_STALE)
                        bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
@@ -1905,6 +1970,7 @@ static void bch2_read_endio(struct bio *bio)
        }
 
        if (rbio->narrow_crcs ||
+           rbio->promote ||
            crc_is_compressed(rbio->pick.crc) ||
            bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
                context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
@@ -1960,24 +2026,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
        struct btree_iter iter;
-       char buf[200];
+       struct printbuf buf = PRINTBUF;
        int ret;
 
-       bch2_bkey_val_to_text(&PBUF(buf), c, k);
-       bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf);
-
        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-                            POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+                            PTR_BUCKET_POS(c, &ptr),
                             BTREE_ITER_CACHED);
 
+       prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+       printbuf_indent_add(&buf, 2);
+       prt_newline(&buf);
+
+       bch2_bkey_val_to_text(&buf, c, k);
+       prt_newline(&buf);
+
+       prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
        ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-       if (ret)
-               return;
+       if (!ret) {
+               prt_newline(&buf);
+               bch2_bkey_val_to_text(&buf, c, k);
+       }
+
+       bch2_fs_inconsistent(c, "%s", buf.buf);
 
-       bch2_bkey_val_to_text(&PBUF(buf), c, k);
-       bch_err(c, "%s", buf);
-       bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
        bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
 }
 
 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
@@ -2021,7 +2095,14 @@ retry_pick:
 
        ca = bch_dev_bkey_exists(c, pick.ptr.dev);
 
-       if (!pick.ptr.cached &&
+       /*
+        * Stale dirty pointers are treated as IO errors, but @failed isn't
+        * allocated unless we're in the retry path - so if we're not in the
+        * retry path, don't check here, it'll be caught in bch2_read_endio()
+        * and we'll end up in the retry path:
+        */
+       if ((flags & BCH_READ_IN_RETRY) &&
+           !pick.ptr.cached &&
            unlikely(ptr_stale(ca, &pick.ptr))) {
                read_from_stale_dirty_pointer(trans, k, pick.ptr);
                bch2_mark_io_failure(failed, &pick);
@@ -2105,8 +2186,10 @@ get_bio:
        } else if (bounce) {
                unsigned sectors = pick.crc.compressed_size;
 
-               rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
+               rbio = rbio_init(bio_alloc_bioset(NULL,
                                                  DIV_ROUND_UP(sectors, PAGE_SECTORS),
+                                                 0,
+                                                 GFP_NOIO,
                                                  &c->bio_read_split),
                                 orig->opts);
 
@@ -2122,8 +2205,8 @@ get_bio:
                 * from the whole bio, in which case we don't want to retry and
                 * lose the error)
                 */
-               rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
-                                               &c->bio_read_split),
+               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+                                                &c->bio_read_split),
                                 orig->opts);
                rbio->bio.bi_iter = iter;
                rbio->split     = true;
@@ -2165,8 +2248,9 @@ get_bio:
        rbio->bio.bi_end_io     = bch2_read_endio;
 
        if (rbio->bounce)
-               trace_read_bounce(&rbio->bio);
+               trace_and_count(c, read_bounce, &rbio->bio);
 
+       this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
        bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
        /*
@@ -2179,7 +2263,7 @@ get_bio:
 
        if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
                bio_inc_remaining(&orig->bio);
-               trace_read_split(&orig->bio);
+               trace_and_count(c, read_split, &orig->bio);
        }
 
        if (!rbio->pick.idx) {
@@ -2288,10 +2372,9 @@ retry:
                 * read_extent -> io_time_reset may cause a transaction restart
                 * without returning an error, we need to check for that here:
                 */
-               if (!bch2_trans_relock(&trans)) {
-                       ret = -EINTR;
+               ret = bch2_trans_relock(&trans);
+               if (ret)
                        break;
-               }
 
                bch2_btree_iter_set_pos(&iter,
                                POS(inum.inum, bvec_iter.bi_sector));
@@ -2345,7 +2428,9 @@ retry:
 err:
        bch2_trans_iter_exit(&trans, &iter);
 
-       if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+           ret == READ_RETRY ||
+           ret == READ_RETRY_AVOID)
                goto retry;
 
        bch2_trans_exit(&trans);
index 1aa422dccef7de794d3b65155d8d7cc28fac8bd7..3ae31758a01ee8b86bc441ea493a96d8187efc4a 100644 (file)
@@ -40,6 +40,7 @@ enum bch_write_flags {
        BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 10),
        BCH_WRITE_SKIP_CLOSURE_PUT      = (1 << 11),
        BCH_WRITE_DONE                  = (1 << 12),
+       BCH_WRITE_IO_ERROR              = (1 << 13),
 };
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -50,7 +51,7 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
 
 static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 {
-       return op->alloc_reserve == RESERVE_MOVINGGC
+       return op->alloc_reserve == RESERVE_movinggc
                ? op->c->copygc_wq
                : op->c->btree_update_wq;
 }
@@ -79,7 +80,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
        op->compression_type    = bch2_compression_opt_to_type[opts.compression];
        op->nr_replicas         = 0;
        op->nr_replicas_required = c->opts.data_replicas_required;
-       op->alloc_reserve       = RESERVE_NONE;
+       op->alloc_reserve       = RESERVE_none;
        op->incompressible      = 0;
        op->open_buckets.nr     = 0;
        op->devs_have.nr        = 0;
index 158df42e5e10487caca016cf52478ab5377e5152..95c29229d3fe658c6ff9e58361bdf0b2c125e50a 100644 (file)
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_sb.h"
 #include "journal_seq_blacklist.h"
-#include "super-io.h"
 
 #include <trace/events/bcachefs.h>
 
-static u64 last_unwritten_seq(struct journal *j)
-{
-       union journal_res_state s = READ_ONCE(j->reservations);
+#define x(n)   #n,
+static const char * const bch2_journal_watermarks[] = {
+       JOURNAL_WATERMARKS()
+       NULL
+};
 
-       lockdep_assert_held(&j->lock);
-
-       return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
-}
+static const char * const bch2_journal_errors[] = {
+       JOURNAL_ERRORS()
+       NULL
+};
+#undef x
 
 static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
 {
-       return seq >= last_unwritten_seq(j);
+       return seq > j->seq_ondisk;
 }
 
 static bool __journal_entry_is_open(union journal_res_state state)
@@ -39,6 +42,11 @@ static bool __journal_entry_is_open(union journal_res_state state)
        return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
 }
 
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+       return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
 static bool journal_entry_is_open(struct journal *j)
 {
        return __journal_entry_is_open(j->reservations);
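The journal hunks above replace ring-buffer index bookkeeping (idx/unwritten_idx) with plain sequence-number arithmetic against the new j->seq_ondisk field. A self-contained sketch of that accounting, with field names following the hunk but otherwise simplified:

/* An entry is "unwritten" iff its seq is newer than the newest seq
 * known to be fully on disk; the number of in-flight entries is just
 * the difference between the two counters. */
#include <stdio.h>
#include <stdint.h>

struct journal {
	uint64_t seq;		/* current (newest) journal sequence number */
	uint64_t seq_ondisk;	/* newest sequence number fully written */
};

static int journal_seq_unwritten(struct journal *j, uint64_t seq)
{
	return seq > j->seq_ondisk;
}

static unsigned nr_unwritten_journal_entries(struct journal *j)
{
	return (unsigned) (j->seq - j->seq_ondisk);
}

int main(void)
{
	struct journal j = { .seq = 12, .seq_ondisk = 9 };

	printf("unwritten entries: %u\n", nr_unwritten_journal_entries(&j)); /* 3 */
	printf("seq 9 unwritten?   %d\n", journal_seq_unwritten(&j, 9));     /* 0 */
	printf("seq 10 unwritten?  %d\n", journal_seq_unwritten(&j, 10));    /* 1 */
	return 0;
}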
@@ -50,8 +58,6 @@ journal_seq_to_buf(struct journal *j, u64 seq)
        struct journal_buf *buf = NULL;
 
        EBUG_ON(seq > journal_cur_seq(j));
-       EBUG_ON(seq == journal_cur_seq(j) &&
-               j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
 
        if (journal_seq_unwritten(j, seq)) {
                buf = j->buf + (seq & JOURNAL_BUF_MASK);
@@ -69,54 +75,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
        p->devs.nr = 0;
 }
 
-static void journal_pin_new_entry(struct journal *j)
-{
-       /*
-        * The fifo_push() needs to happen at the same time as j->seq is
-        * incremented for journal_last_seq() to be calculated correctly
-        */
-       atomic64_inc(&j->seq);
-       journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-}
-
-static void bch2_journal_buf_init(struct journal *j)
-{
-       struct journal_buf *buf = journal_cur_buf(j);
-
-       bkey_extent_init(&buf->key);
-       buf->noflush    = false;
-       buf->must_flush = false;
-       buf->separate_flush = false;
-
-       memset(buf->data, 0, sizeof(*buf->data));
-       buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
-       buf->data->u64s = 0;
-}
-
-void bch2_journal_halt(struct journal *j)
-{
-       union journal_res_state old, new;
-       u64 v = atomic64_read(&j->reservations.counter);
-
-       do {
-               old.v = new.v = v;
-               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-                       return;
-
-               new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
-       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
-                                      old.v, new.v)) != old.v);
-
-       /*
-        * XXX: we're not using j->lock here because this can be called from
-        * interrupt context, this can race with journal_write_done()
-        */
-       if (!j->err_seq)
-               j->err_seq = journal_cur_seq(j);
-       journal_wake(j);
-       closure_wake_up(&journal_cur_buf(j)->wait);
-}
-
 /* journal entry close/open: */
 
 void __bch2_journal_buf_put(struct journal *j)
@@ -132,7 +90,7 @@ void __bch2_journal_buf_put(struct journal *j)
  * We don't close a journal_buf until the next journal_buf is finished writing,
  * and can be opened again - this also initializes the next journal_buf:
  */
-static bool __journal_entry_close(struct journal *j)
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf = journal_cur_buf(j);
@@ -140,34 +98,24 @@ static bool __journal_entry_close(struct journal *j)
        u64 v = atomic64_read(&j->reservations.counter);
        unsigned sectors;
 
+       BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+              closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
        lockdep_assert_held(&j->lock);
 
        do {
                old.v = new.v = v;
-               if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
-                       return true;
+               new.cur_entry_offset = closed_val;
 
-               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
-                       /* this entry will never be written: */
-                       closure_wake_up(&buf->wait);
-                       return true;
-               }
-
-               if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
-                       set_bit(JOURNAL_NEED_WRITE, &j->flags);
-                       j->need_write_time = local_clock();
-               }
-
-               new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
-               new.idx++;
-
-               if (new.idx == new.unwritten_idx)
-                       return false;
-
-               BUG_ON(journal_state_count(new, new.idx));
+               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+                   old.cur_entry_offset == new.cur_entry_offset)
+                       return;
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
+       if (!__journal_entry_is_open(old))
+               return;
+
        /* Close out old buffer: */
        buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
 
@@ -197,36 +145,42 @@ static bool __journal_entry_close(struct journal *j)
         */
        buf->last_seq           = journal_last_seq(j);
        buf->data->last_seq     = cpu_to_le64(buf->last_seq);
+       BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
 
        __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
 
-       /* Initialize new buffer: */
-       journal_pin_new_entry(j);
-
-       bch2_journal_buf_init(j);
-
        cancel_delayed_work(&j->write_work);
-       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
 
        bch2_journal_space_available(j);
 
        bch2_journal_buf_put(j, old.idx);
-       return true;
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+       spin_lock(&j->lock);
+       __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+       if (!j->err_seq)
+               j->err_seq = journal_cur_seq(j);
+       spin_unlock(&j->lock);
 }
 
 static bool journal_entry_want_write(struct journal *j)
 {
-       union journal_res_state s = READ_ONCE(j->reservations);
-       bool ret = false;
+       bool ret = !journal_entry_is_open(j) ||
+               journal_cur_seq(j) == journal_last_unwritten_seq(j);
 
-       /*
-        * Don't close it yet if we already have a write in flight, but do set
-        * NEED_WRITE:
-        */
-       if (s.idx != s.unwritten_idx)
-               set_bit(JOURNAL_NEED_WRITE, &j->flags);
-       else
-               ret = __journal_entry_close(j);
+       /* Don't close it yet if we already have a write in flight: */
+       if (ret)
+               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+       else if (nr_unwritten_journal_entries(j)) {
+               struct journal_buf *buf = journal_cur_buf(j);
+
+               if (!buf->flush_time) {
+                       buf->flush_time = local_clock() ?: 1;
+                       buf->expires = jiffies;
+               }
+       }
 
        return ret;
 }
@@ -255,34 +209,71 @@ static bool journal_entry_close(struct journal *j)
 static int journal_entry_open(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *buf = journal_cur_buf(j);
+       struct journal_buf *buf = j->buf +
+               ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
        union journal_res_state old, new;
        int u64s;
        u64 v;
 
-       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
        lockdep_assert_held(&j->lock);
        BUG_ON(journal_entry_is_open(j));
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
 
        if (j->blocked)
-               return cur_entry_blocked;
+               return JOURNAL_ERR_blocked;
 
        if (j->cur_entry_error)
                return j->cur_entry_error;
 
+       if (bch2_journal_error(j))
+               return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+       if (!fifo_free(&j->pin))
+               return JOURNAL_ERR_journal_pin_full;
+
+       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
+               return JOURNAL_ERR_max_in_flight;
+
        BUG_ON(!j->cur_entry_sectors);
 
+       buf->expires            =
+               (journal_cur_seq(j) == j->flushed_seq_ondisk
+                ? jiffies
+                : j->last_flush_write) +
+               msecs_to_jiffies(c->opts.journal_flush_delay);
+
        buf->u64s_reserved      = j->entry_u64s_reserved;
        buf->disk_sectors       = j->cur_entry_sectors;
        buf->sectors            = min(buf->disk_sectors, buf->buf_size >> 9);
 
        u64s = (int) (buf->sectors << 9) / sizeof(u64) -
                journal_entry_overhead(j);
-       u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+       u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
 
-       if (u64s <= le32_to_cpu(buf->data->u64s))
-               return cur_entry_journal_full;
+       if (u64s <= 0)
+               return JOURNAL_ERR_journal_full;
+
+       if (fifo_empty(&j->pin) && j->reclaim_thread)
+               wake_up_process(j->reclaim_thread);
+
+       /*
+        * The fifo_push() needs to happen at the same time as j->seq is
+        * incremented for journal_last_seq() to be calculated correctly
+        */
+       atomic64_inc(&j->seq);
+       journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+       BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+       bkey_extent_init(&buf->key);
+       buf->noflush    = false;
+       buf->must_flush = false;
+       buf->separate_flush = false;
+       buf->flush_time = 0;
+
+       memset(buf->data, 0, sizeof(*buf->data));
+       buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
+       buf->data->u64s = 0;
 
        /*
         * Must be set before marking the journal entry as open:
@@ -293,14 +284,14 @@ static int journal_entry_open(struct journal *j)
        do {
                old.v = new.v = v;
 
-               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-                       return cur_entry_insufficient_devices;
+               BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
 
-               /* Handle any already added entries */
-               new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+               new.idx++;
+               BUG_ON(journal_state_count(new, new.idx));
+               BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
 
-               EBUG_ON(journal_state_count(new, new.idx));
                journal_state_inc(&new);
+               new.cur_entry_offset = 0;
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
@@ -318,8 +309,7 @@ static int journal_entry_open(struct journal *j)
 
 static bool journal_quiesced(struct journal *j)
 {
-       union journal_res_state s = READ_ONCE(j->reservations);
-       bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
+       bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
 
        if (!ret)
                journal_entry_close(j);
@@ -334,8 +324,21 @@ static void journal_quiesce(struct journal *j)
 static void journal_write_work(struct work_struct *work)
 {
        struct journal *j = container_of(work, struct journal, write_work.work);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       long delta;
+
+       spin_lock(&j->lock);
+       if (!__journal_entry_is_open(j->reservations))
+               goto unlock;
+
+       delta = journal_cur_buf(j)->expires - jiffies;
 
-       journal_entry_close(j);
+       if (delta > 0)
+               mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
+       else
+               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+       spin_unlock(&j->lock);
 }
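journal_write_work() now drives flushing from a per-buffer expires deadline set when the entry is opened: if the deadline has not passed it re-arms the delayed work for the remaining time, otherwise it closes the entry. A userspace approximation of that decision (HZ, the option value and the msecs_to_jiffies() stand-in are assumptions):

/* Sketch of the expires/delta logic: expires is based either on "now"
 * (if everything is already flushed) or on the time of the last flush
 * write, plus the configured journal_flush_delay. */
#include <stdio.h>

#define HZ 1000			/* assume 1000 jiffies per second */

static unsigned long msecs_to_jiffies(unsigned m) { return m * HZ / 1000; }

int main(void)
{
	unsigned long jiffies = 50000;		/* current time, in jiffies */
	int already_flushed = 0;		/* journal_cur_seq == flushed_seq_ondisk? */
	unsigned long last_flush_write = 49900;
	unsigned journal_flush_delay_ms = 1000;

	unsigned long expires = (already_flushed ? jiffies : last_flush_write)
		+ msecs_to_jiffies(journal_flush_delay_ms);
	long delta = (long) (expires - jiffies);

	if (delta > 0)
		printf("re-arm write_work in %ld jiffies\n", delta);
	else
		printf("deadline passed: close the journal entry now\n");
	return 0;
}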
 
 static int __journal_res_get(struct journal *j, struct journal_res *res,
@@ -364,13 +367,12 @@ retry:
                return 0;
        }
 
-       if (!(flags & JOURNAL_RES_GET_RESERVED) &&
-           !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+       if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) {
                /*
                 * Don't want to close current journal entry, just need to
                 * invoke reclaim:
                 */
-               ret = cur_entry_journal_full;
+               ret = JOURNAL_ERR_journal_full;
                goto unlock;
        }
 
@@ -385,23 +387,16 @@ retry:
            buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
                j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
 
-       if (journal_entry_is_open(j) &&
-           !__journal_entry_close(j)) {
-               /*
-                * We failed to get a reservation on the current open journal
-                * entry because it's full, and we can't close it because
-                * there's still a previous one in flight:
-                */
-               trace_journal_entry_full(c);
-               ret = cur_entry_blocked;
-       } else {
-               ret = journal_entry_open(j);
-       }
+       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+       ret = journal_entry_open(j);
+
+       if (ret == JOURNAL_ERR_max_in_flight)
+               trace_and_count(c, journal_entry_full, c);
 unlock:
-       if ((ret && ret != cur_entry_insufficient_devices) &&
+       if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
            !j->res_get_blocked_start) {
                j->res_get_blocked_start = local_clock() ?: 1;
-               trace_journal_full(c);
+               trace_and_count(c, journal_full, c);
        }
 
        can_discard = j->can_discard;
@@ -410,23 +405,24 @@ unlock:
        if (!ret)
                goto retry;
 
-       if ((ret == cur_entry_journal_full ||
-            ret == cur_entry_journal_pin_full) &&
+       if ((ret == JOURNAL_ERR_journal_full ||
+            ret == JOURNAL_ERR_journal_pin_full) &&
            !can_discard &&
-           j->reservations.idx == j->reservations.unwritten_idx &&
-           (flags & JOURNAL_RES_GET_RESERVED)) {
-               char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
-
-               bch_err(c, "Journal stuck!");
-               if (journal_debug_buf) {
-                       bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
-                       bch_err(c, "%s", journal_debug_buf);
-
-                       bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
-                       bch_err(c, "Journal pins:\n%s", journal_debug_buf);
-                       kfree(journal_debug_buf);
-               }
+           !nr_unwritten_journal_entries(j) &&
+           (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
+               struct printbuf buf = PRINTBUF;
+
+               bch_err(c, "Journal stuck! Have a pre-reservation but journal full (ret %s)",
+                       bch2_journal_errors[ret]);
+
+               bch2_journal_debug_to_text(&buf, j);
+               bch_err(c, "%s", buf.buf);
 
+               printbuf_reset(&buf);
+               bch2_journal_pins_to_text(&buf, j);
+               bch_err(c, "Journal pins:\n%s", buf.buf);
+
+               printbuf_exit(&buf);
                bch2_fatal_error(c);
                dump_stack();
        }
@@ -435,8 +431,8 @@ unlock:
         * Journal is full - can't rely on reclaim from work item due to
         * freezing:
         */
-       if ((ret == cur_entry_journal_full ||
-            ret == cur_entry_journal_pin_full) &&
+       if ((ret == JOURNAL_ERR_journal_full ||
+            ret == JOURNAL_ERR_journal_pin_full) &&
            !(flags & JOURNAL_RES_GET_NONBLOCK)) {
                if (can_discard) {
                        bch2_journal_do_discards(j);
@@ -449,7 +445,7 @@ unlock:
                }
        }
 
-       return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
+       return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN;
 }
 
 /*
@@ -528,7 +524,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
                /*
                 * Not enough room in current journal entry, have to flush it:
                 */
-               __journal_entry_close(j);
+               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
        } else {
                journal_cur_buf(j)->u64s_reserved += d;
        }
@@ -573,12 +569,15 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
        }
 
        /* if seq was written, but not flushed - flush a newer one instead */
-       seq = max(seq, last_unwritten_seq(j));
+       seq = max(seq, journal_last_unwritten_seq(j));
 
 recheck_need_open:
-       if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+       if (seq > journal_cur_seq(j)) {
                struct journal_res res = { 0 };
 
+               if (journal_entry_is_open(j))
+                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
                spin_unlock(&j->lock);
 
                ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
@@ -588,7 +587,11 @@ recheck_need_open:
                seq = res.seq;
                buf = j->buf + (seq & JOURNAL_BUF_MASK);
                buf->must_flush = true;
-               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+               if (!buf->flush_time) {
+                       buf->flush_time = local_clock() ?: 1;
+                       buf->expires = jiffies;
+               }
 
                if (parent && !closure_wait(&buf->wait, parent))
                        BUG();
@@ -640,69 +643,18 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
        return ret ?: ret2 < 0 ? ret2 : 0;
 }
 
-int bch2_journal_meta(struct journal *j)
-{
-       struct journal_buf *buf;
-       struct journal_res res;
-       int ret;
-
-       memset(&res, 0, sizeof(res));
-
-       ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
-       if (ret)
-               return ret;
-
-       buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
-       buf->must_flush = true;
-       set_bit(JOURNAL_NEED_WRITE, &j->flags);
-
-       bch2_journal_res_put(j, &res);
-
-       return bch2_journal_flush_seq(j, res.seq);
-}
-
 /*
  * bch2_journal_flush_async - if there is an open journal entry, or a journal
  * still being written, write it and wait for the write to complete
  */
 void bch2_journal_flush_async(struct journal *j, struct closure *parent)
 {
-       u64 seq, journal_seq;
-
-       spin_lock(&j->lock);
-       journal_seq = journal_cur_seq(j);
-
-       if (journal_entry_is_open(j)) {
-               seq = journal_seq;
-       } else if (journal_seq) {
-               seq = journal_seq - 1;
-       } else {
-               spin_unlock(&j->lock);
-               return;
-       }
-       spin_unlock(&j->lock);
-
-       bch2_journal_flush_seq_async(j, seq, parent);
+       bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
 }
 
 int bch2_journal_flush(struct journal *j)
 {
-       u64 seq, journal_seq;
-
-       spin_lock(&j->lock);
-       journal_seq = journal_cur_seq(j);
-
-       if (journal_entry_is_open(j)) {
-               seq = journal_seq;
-       } else if (journal_seq) {
-               seq = journal_seq - 1;
-       } else {
-               spin_unlock(&j->lock);
-               return 0;
-       }
-       spin_unlock(&j->lock);
-
-       return bch2_journal_flush_seq(j, seq);
+       return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
 }
 
 /*
@@ -725,13 +677,13 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
        if (seq <= c->journal.flushed_seq_ondisk)
                goto out;
 
-       for (unwritten_seq = last_unwritten_seq(j);
+       for (unwritten_seq = journal_last_unwritten_seq(j);
             unwritten_seq < seq;
             unwritten_seq++) {
                struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
 
                /* journal write is already in flight, and was a flush write: */
-               if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+               if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
                        goto out;
 
                buf->noflush = true;
@@ -743,6 +695,64 @@ out:
        return ret;
 }
 
+int bch2_journal_meta(struct journal *j)
+{
+       struct journal_buf *buf;
+       struct journal_res res;
+       int ret;
+
+       memset(&res, 0, sizeof(res));
+
+       ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+       if (ret)
+               return ret;
+
+       buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+       buf->must_flush = true;
+
+       if (!buf->flush_time) {
+               buf->flush_time = local_clock() ?: 1;
+               buf->expires = jiffies;
+       }
+
+       bch2_journal_res_put(j, &res);
+
+       return bch2_journal_flush_seq(j, res.seq);
+}
+
+int bch2_journal_log_msg(struct journal *j, const char *fmt, ...)
+{
+       struct jset_entry_log *entry;
+       struct journal_res res = { 0 };
+       unsigned msglen, u64s;
+       va_list args;
+       int ret;
+
+       va_start(args, fmt);
+       msglen = vsnprintf(NULL, 0, fmt, args) + 1;
+       va_end(args);
+
+       u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64)));
+
+       ret = bch2_journal_res_get(j, &res, u64s, 0);
+       if (ret)
+               return ret;
+
+       entry = container_of(journal_res_entry(j, &res),
+                            struct jset_entry_log, entry);
+       memset(entry, 0, u64s * sizeof(u64));
+       entry->entry.type = BCH_JSET_ENTRY_log;
+       entry->entry.u64s = u64s - 1;
+
+       va_start(args, fmt);
+       vsnprintf(entry->d, INT_MAX, fmt, args);
+       va_end(args);
+
+       bch2_journal_res_put(j, &res);
+
+       return bch2_journal_flush_seq(j, res.seq);
+}
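bch2_journal_log_msg(), added above, sizes its journal reservation by formatting the message once with vsnprintf(NULL, 0, ...) and rounding the byte count up to whole u64 words plus the entry header. A standalone sketch of that calculation (jset_u64s() here is a simplified stand-in for the real helper, which is assumed to add one u64 for the struct jset_entry header):

#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* assumed: one u64 of jset_entry header per entry */
static unsigned jset_u64s(unsigned u64s) { return u64s + 1; }

static unsigned log_entry_u64s(const char *fmt, ...)
{
	va_list args;
	unsigned msglen;

	va_start(args, fmt);
	msglen = vsnprintf(NULL, 0, fmt, args) + 1;	/* +1 for the NUL */
	va_end(args);

	return jset_u64s(DIV_ROUND_UP(msglen, sizeof(uint64_t)));
}

int main(void)
{
	/* 31 bytes of message -> 4 u64s of payload + 1 u64 of header = 5 */
	printf("u64s needed: %u\n",
	       log_entry_u64s("recovery pass %s done", "check_alloc"));
	return 0;
}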
+
 /* block/unlock the journal: */
 
 void bch2_journal_unblock(struct journal *j)
@@ -770,28 +780,55 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 {
        struct bch_fs *c = ca->fs;
        struct journal_device *ja = &ca->journal;
-       struct bch_sb_field_journal *journal_buckets;
        u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+       struct open_bucket **ob = NULL;
+       long *bu = NULL;
+       unsigned i, nr_got = 0, nr_want = nr - ja->nr;
+       unsigned old_nr                 = ja->nr;
+       unsigned old_discard_idx        = ja->discard_idx;
+       unsigned old_dirty_idx_ondisk   = ja->dirty_idx_ondisk;
+       unsigned old_dirty_idx          = ja->dirty_idx;
+       unsigned old_cur_idx            = ja->cur_idx;
        int ret = 0;
 
-       /* don't handle reducing nr of buckets yet: */
-       if (nr <= ja->nr)
-               return 0;
+       if (c) {
+               bch2_journal_flush_all_pins(&c->journal);
+               bch2_journal_block(&c->journal);
+       }
 
-       new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
-       new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
-       if (!new_buckets || !new_bucket_seq) {
+       bu              = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+       ob              = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+       new_buckets     = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+       new_bucket_seq  = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+       if (!bu || !ob || !new_buckets || !new_bucket_seq) {
                ret = -ENOMEM;
-               goto err;
+               goto err_unblock;
        }
 
-       journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
-                                       nr + sizeof(*journal_buckets) / sizeof(u64));
-       if (!journal_buckets) {
-               ret = -ENOSPC;
-               goto err;
+       for (nr_got = 0; nr_got < nr_want; nr_got++) {
+               if (new_fs) {
+                       bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+                       if (bu[nr_got] < 0) {
+                               ret = -BCH_ERR_ENOSPC_bucket_alloc;
+                               break;
+                       }
+               } else {
+                       ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
+                                              false, cl);
+                       if (IS_ERR(ob[nr_got])) {
+                               ret = cl
+                                       ? -EAGAIN
+                                       : -BCH_ERR_ENOSPC_bucket_alloc;
+                               break;
+                       }
+
+                       bu[nr_got] = ob[nr_got]->bucket;
+               }
        }
 
+       if (!nr_got)
+               goto err_unblock;
+
        /*
         * We may be called from the device add path, before the new device has
         * actually been added to the running filesystem:
@@ -804,51 +841,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        swap(new_buckets,       ja->buckets);
        swap(new_bucket_seq,    ja->bucket_seq);
 
-       if (!new_fs)
-               spin_unlock(&c->journal.lock);
-
-       while (ja->nr < nr) {
-               struct open_bucket *ob = NULL;
-               unsigned pos;
-               long b;
-
-               if (new_fs) {
-                       b = bch2_bucket_alloc_new_fs(ca);
-                       if (b < 0) {
-                               ret = -ENOSPC;
-                               goto err;
-                       }
-               } else {
-                       rcu_read_lock();
-                       ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
-                                              false, cl);
-                       rcu_read_unlock();
-                       if (IS_ERR(ob)) {
-                               ret = cl ? -EAGAIN : -ENOSPC;
-                               goto err;
-                       }
-
-                       b = ob->bucket;
-               }
-
-               if (c)
-                       spin_lock(&c->journal.lock);
-
-               /*
-                * XXX
-                * For resize at runtime, we should be writing the new
-                * superblock before inserting into the journal array
-                */
+       for (i = 0; i < nr_got; i++) {
+               unsigned pos = ja->discard_idx ?: ja->nr;
+               long b = bu[i];
 
-               pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
                __array_insert_item(ja->buckets,                ja->nr, pos);
                __array_insert_item(ja->bucket_seq,             ja->nr, pos);
-               __array_insert_item(journal_buckets->buckets,   ja->nr, pos);
                ja->nr++;
 
                ja->buckets[pos] = b;
                ja->bucket_seq[pos] = 0;
-               journal_buckets->buckets[pos] = cpu_to_le64(b);
 
                if (pos <= ja->discard_idx)
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@@ -858,29 +860,56 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
                if (pos <= ja->cur_idx)
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+       }
 
-               if (c)
-                       spin_unlock(&c->journal.lock);
+       ret = bch2_journal_buckets_to_sb(c, ca);
+       if (ret) {
+               /* Revert: */
+               swap(new_buckets,       ja->buckets);
+               swap(new_bucket_seq,    ja->bucket_seq);
+               ja->nr                  = old_nr;
+               ja->discard_idx         = old_discard_idx;
+               ja->dirty_idx_ondisk    = old_dirty_idx_ondisk;
+               ja->dirty_idx           = old_dirty_idx;
+               ja->cur_idx             = old_cur_idx;
+       }
 
-               if (!new_fs) {
-                       ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
-                                               b, BCH_DATA_journal,
-                                               ca->mi.bucket_size));
+       if (!new_fs)
+               spin_unlock(&c->journal.lock);
 
-                       bch2_open_bucket_put(c, ob);
+       if (c)
+               bch2_journal_unblock(&c->journal);
+
+       if (ret)
+               goto err;
 
-                       if (ret)
+       if (!new_fs) {
+               for (i = 0; i < nr_got; i++) {
+                       ret = bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               bu[i], BCH_DATA_journal,
+                                               ca->mi.bucket_size));
+                       if (ret) {
+                               bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
                                goto err;
+                       }
                }
        }
 err:
-       bch2_sb_resize_journal(&ca->disk_sb,
-               ja->nr + sizeof(*journal_buckets) / sizeof(u64));
+       if (ob && !new_fs)
+               for (i = 0; i < nr_got; i++)
+                       bch2_open_bucket_put(c, ob[i]);
+
        kfree(new_bucket_seq);
        kfree(new_buckets);
+       kfree(ob);
+       kfree(bu);
 
        return ret;
+err_unblock:
+       if (c)
+               bch2_journal_unblock(&c->journal);
+       goto err;
 }
 
 /*
@@ -893,11 +922,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
        struct journal_device *ja = &ca->journal;
        struct closure cl;
        unsigned current_nr;
-       int ret;
+       int ret = 0;
+
+       /* don't handle reducing nr of buckets yet: */
+       if (nr < ja->nr)
+               return 0;
 
        closure_init_stack(&cl);
 
-       do {
+       while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) {
                struct disk_reservation disk_res = { 0, 0 };
 
                closure_sync(&cl);
@@ -912,10 +945,11 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                 * reservation to ensure we'll actually be able to allocate:
                 */
 
-               if (bch2_disk_reservation_get(c, &disk_res,
-                                             bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+               ret = bch2_disk_reservation_get(c, &disk_res,
+                                               bucket_to_sector(ca, nr - ja->nr), 1, 0);
+               if (ret) {
                        mutex_unlock(&c->sb_lock);
-                       return -ENOSPC;
+                       return ret;
                }
 
                ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
@@ -925,7 +959,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                if (ja->nr != current_nr)
                        bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
-       } while (ret == -EAGAIN);
+       }
 
        return ret;
 }
@@ -933,6 +967,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 int bch2_dev_journal_alloc(struct bch_dev *ca)
 {
        unsigned nr;
+       int ret;
 
        if (dynamic_fault("bcachefs:add:journal_alloc"))
                return -ENOMEM;
@@ -949,24 +984,31 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
                     min(1 << 13,
                         (1 << 24) / ca->mi.bucket_size));
 
-       return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+       if (ca->fs)
+               mutex_lock(&ca->fs->sb_lock);
+
+       ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+
+       if (ca->fs)
+               mutex_unlock(&ca->fs->sb_lock);
+
+       return ret;
 }
 
 /* startup/shutdown: */
 
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
 {
-       union journal_res_state state;
        bool ret = false;
-       unsigned i;
+       u64 seq;
 
        spin_lock(&j->lock);
-       state = READ_ONCE(j->reservations);
-       i = state.idx;
+       for (seq = journal_last_unwritten_seq(j);
+            seq <= journal_cur_seq(j) && !ret;
+            seq++) {
+               struct journal_buf *buf = journal_seq_to_buf(j, seq);
 
-       while (i != state.unwritten_idx) {
-               i = (i - 1) & JOURNAL_BUF_MASK;
-               if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+               if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx))
                        ret = true;
        }
        spin_unlock(&j->lock);
@@ -981,6 +1023,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
 
 void bch2_fs_journal_stop(struct journal *j)
 {
+       bch2_journal_reclaim_stop(j);
        bch2_journal_flush_all_pins(j);
 
        wait_event(j->wait, journal_entry_close(j));
@@ -995,24 +1038,30 @@ void bch2_fs_journal_stop(struct journal *j)
 
        BUG_ON(!bch2_journal_error(j) &&
               test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
-              (journal_entry_is_open(j) ||
-               j->last_empty_seq + 1 != journal_cur_seq(j)));
+              j->last_empty_seq != journal_cur_seq(j));
 
        cancel_delayed_work_sync(&j->write_work);
-       bch2_journal_reclaim_stop(j);
 }
 
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
-                         struct list_head *journal_entries)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
-       struct journal_replay *i;
+       struct journal_replay *i, **_i;
+       struct genradix_iter iter;
+       bool had_entries = false;
+       unsigned ptr;
        u64 last_seq = cur_seq, nr, seq;
 
-       if (!list_empty(journal_entries))
-               last_seq = le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
+       genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
+
+               last_seq = le64_to_cpu(i->j.last_seq);
+               break;
+       }
 
        nr = cur_seq - last_seq;
 
@@ -1029,18 +1078,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        j->replay_journal_seq_end = cur_seq;
        j->last_seq_ondisk      = last_seq;
        j->flushed_seq_ondisk   = cur_seq - 1;
+       j->seq_ondisk           = cur_seq - 1;
        j->pin.front            = last_seq;
        j->pin.back             = cur_seq;
        atomic64_set(&j->seq, cur_seq - 1);
 
-       if (list_empty(journal_entries))
-               j->last_empty_seq = cur_seq - 1;
-
        fifo_for_each_entry_ptr(p, &j->pin, seq)
                journal_pin_list_init(p, 1);
 
-       list_for_each_entry(i, journal_entries, list) {
-               unsigned ptr;
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
 
                seq = le64_to_cpu(i->j.seq);
                BUG_ON(seq >= cur_seq);
@@ -1056,9 +1106,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
                p->devs.nr = 0;
                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+               had_entries = true;
        }
 
-       if (list_empty(journal_entries))
+       if (!had_entries)
                j->last_empty_seq = cur_seq;
 
        spin_lock(&j->lock);
@@ -1066,11 +1118,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        set_bit(JOURNAL_STARTED, &j->flags);
        j->last_flush_write = jiffies;
 
-       journal_pin_new_entry(j);
-
        j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
-
-       bch2_journal_buf_init(j);
+       j->reservations.unwritten_idx++;
 
        c->last_bucket_seq_cleanup = journal_cur_seq(j);
 
@@ -1098,25 +1147,49 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
        struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(sb);
-       unsigned i;
+       struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+               bch2_sb_get_journal_v2(sb);
+       unsigned i, nr_bvecs;
+
+       ja->nr = 0;
 
-       ja->nr = bch2_nr_journal_buckets(journal_buckets);
+       if (journal_buckets_v2) {
+               unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+               for (i = 0; i < nr; i++)
+                       ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+       } else if (journal_buckets) {
+               ja->nr = bch2_nr_journal_buckets(journal_buckets);
+       }
 
        ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
        if (!ja->bucket_seq)
                return -ENOMEM;
 
-       ca->journal.bio = bio_kmalloc(GFP_KERNEL,
-                       DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
+       nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+       ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
        if (!ca->journal.bio)
                return -ENOMEM;
 
+       bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+
        ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
        if (!ja->buckets)
                return -ENOMEM;
 
-       for (i = 0; i < ja->nr; i++)
-               ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+       if (journal_buckets_v2) {
+               unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+               unsigned j, dst = 0;
+
+               for (i = 0; i < nr; i++)
+                       for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+                               ja->buckets[dst++] =
+                                       le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+       } else if (journal_buckets) {
+               for (i = 0; i < ja->nr; i++)
+                       ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+       }
 
        return 0;
 }
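bch2_dev_journal_init() above now also understands the v2 superblock journal field, which stores (start, nr) bucket ranges rather than one entry per bucket; the ranges are flattened into the per-device buckets array. A simplified standalone sketch of that expansion (types and field names are illustrative, not the on-disk structures):

#include <stdio.h>
#include <stdint.h>

struct journal_range { uint64_t start, nr; };

int main(void)
{
	struct journal_range d[] = { { 100, 3 }, { 200, 2 } };	/* two ranges */
	uint64_t buckets[8];
	unsigned i, j, dst = 0;

	/* expand each (start, nr) range into individual bucket numbers */
	for (i = 0; i < 2; i++)
		for (j = 0; j < d[i].nr; j++)
			buckets[dst++] = d[i].start + j;

	for (i = 0; i < dst; i++)
		printf("bucket[%u] = %llu\n", i, (unsigned long long) buckets[i]);
	return 0;
}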
@@ -1182,68 +1255,94 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        union journal_res_state s;
        struct bch_dev *ca;
        unsigned long now = jiffies;
+       u64 seq;
        unsigned i;
 
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 24);
+       out->atomic++;
+
        rcu_read_lock();
        s = READ_ONCE(j->reservations);
 
-       pr_buf(out, "active journal entries:\t%llu\n",  fifo_used(&j->pin));
-       pr_buf(out, "seq:\t\t\t%llu\n",                 journal_cur_seq(j));
-       pr_buf(out, "last_seq:\t\t%llu\n",              journal_last_seq(j));
-       pr_buf(out, "last_seq_ondisk:\t%llu\n",         j->last_seq_ondisk);
-       pr_buf(out, "flushed_seq_ondisk:\t%llu\n",      j->flushed_seq_ondisk);
-       pr_buf(out, "prereserved:\t\t%u/%u\n",          j->prereserved.reserved, j->prereserved.remaining);
-       pr_buf(out, "each entry reserved:\t%u\n",       j->entry_u64s_reserved);
-       pr_buf(out, "nr flush writes:\t%llu\n",         j->nr_flush_writes);
-       pr_buf(out, "nr noflush writes:\t%llu\n",       j->nr_noflush_writes);
-       pr_buf(out, "nr direct reclaim:\t%llu\n",       j->nr_direct_reclaim);
-       pr_buf(out, "nr background reclaim:\t%llu\n",   j->nr_background_reclaim);
-       pr_buf(out, "reclaim kicked:\t\t%u\n",          j->reclaim_kicked);
-       pr_buf(out, "reclaim runs in:\t%u ms\n",        time_after(j->next_reclaim, now)
+       prt_printf(out, "dirty journal entries:\t%llu/%llu\n",  fifo_used(&j->pin), j->pin.size);
+       prt_printf(out, "seq:\t\t\t%llu\n",                     journal_cur_seq(j));
+       prt_printf(out, "seq_ondisk:\t\t%llu\n",                j->seq_ondisk);
+       prt_printf(out, "last_seq:\t\t%llu\n",          journal_last_seq(j));
+       prt_printf(out, "last_seq_ondisk:\t%llu\n",             j->last_seq_ondisk);
+       prt_printf(out, "flushed_seq_ondisk:\t%llu\n",  j->flushed_seq_ondisk);
+       prt_printf(out, "prereserved:\t\t%u/%u\n",              j->prereserved.reserved, j->prereserved.remaining);
+       prt_printf(out, "watermark:\t\t%s\n",           bch2_journal_watermarks[j->watermark]);
+       prt_printf(out, "each entry reserved:\t%u\n",   j->entry_u64s_reserved);
+       prt_printf(out, "nr flush writes:\t%llu\n",             j->nr_flush_writes);
+       prt_printf(out, "nr noflush writes:\t%llu\n",   j->nr_noflush_writes);
+       prt_printf(out, "nr direct reclaim:\t%llu\n",   j->nr_direct_reclaim);
+       prt_printf(out, "nr background reclaim:\t%llu\n",       j->nr_background_reclaim);
+       prt_printf(out, "reclaim kicked:\t\t%u\n",              j->reclaim_kicked);
+       prt_printf(out, "reclaim runs in:\t%u ms\n",    time_after(j->next_reclaim, now)
               ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
-       pr_buf(out, "current entry sectors:\t%u\n",     j->cur_entry_sectors);
-       pr_buf(out, "current entry error:\t%u\n",       j->cur_entry_error);
-       pr_buf(out, "current entry:\t\t");
+       prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+       prt_printf(out, "current entry error:\t%s\n",   bch2_journal_errors[j->cur_entry_error]);
+       prt_printf(out, "current entry:\t\t");
 
        switch (s.cur_entry_offset) {
        case JOURNAL_ENTRY_ERROR_VAL:
-               pr_buf(out, "error\n");
+               prt_printf(out, "error");
                break;
        case JOURNAL_ENTRY_CLOSED_VAL:
-               pr_buf(out, "closed\n");
+               prt_printf(out, "closed");
                break;
        default:
-               pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
+               prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
                break;
        }
 
-       pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
+       prt_newline(out);
+
+       for (seq = journal_cur_seq(j);
+            seq >= journal_last_unwritten_seq(j);
+            --seq) {
+               i = seq & JOURNAL_BUF_MASK;
+
+               prt_printf(out, "unwritten entry:");
+               prt_tab(out);
+               prt_printf(out, "%llu", seq);
+               prt_newline(out);
+               printbuf_indent_add(out, 2);
 
-       i = s.idx;
-       while (i != s.unwritten_idx) {
-               i = (i - 1) & JOURNAL_BUF_MASK;
+               prt_printf(out, "refcount:");
+               prt_tab(out);
+               prt_printf(out, "%u", journal_state_count(s, i));
+               prt_newline(out);
 
-               pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
-                      i, journal_state_count(s, i), j->buf[i].sectors);
+               prt_printf(out, "sectors:");
+               prt_tab(out);
+               prt_printf(out, "%u", j->buf[i].sectors);
+               prt_newline(out);
+
+               prt_printf(out, "expires");
+               prt_tab(out);
+               prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
+               prt_newline(out);
+
+               printbuf_indent_sub(out, 2);
        }
 
-       pr_buf(out,
-              "need write:\t\t%i\n"
+       prt_printf(out,
               "replay done:\t\t%i\n",
-              test_bit(JOURNAL_NEED_WRITE,     &j->flags),
               test_bit(JOURNAL_REPLAY_DONE,    &j->flags));
 
-       pr_buf(out, "space:\n");
-       pr_buf(out, "\tdiscarded\t%u:%u\n",
+       prt_printf(out, "space:\n");
+       prt_printf(out, "\tdiscarded\t%u:%u\n",
               j->space[journal_space_discarded].next_entry,
               j->space[journal_space_discarded].total);
-       pr_buf(out, "\tclean ondisk\t%u:%u\n",
+       prt_printf(out, "\tclean ondisk\t%u:%u\n",
               j->space[journal_space_clean_ondisk].next_entry,
               j->space[journal_space_clean_ondisk].total);
-       pr_buf(out, "\tclean\t\t%u:%u\n",
+       prt_printf(out, "\tclean\t\t%u:%u\n",
               j->space[journal_space_clean].next_entry,
               j->space[journal_space_clean].total);
-       pr_buf(out, "\ttotal\t\t%u:%u\n",
+       prt_printf(out, "\ttotal\t\t%u:%u\n",
               j->space[journal_space_total].next_entry,
               j->space[journal_space_total].total);
 
@@ -1257,17 +1356,19 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                if (!ja->nr)
                        continue;
 
-               pr_buf(out, "dev %u:\n",                i);
-               pr_buf(out, "\tnr\t\t%u\n",             ja->nr);
-               pr_buf(out, "\tbucket size\t%u\n",      ca->mi.bucket_size);
-               pr_buf(out, "\tavailable\t%u:%u\n",     bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
-               pr_buf(out, "\tdiscard_idx\t%u\n",      ja->discard_idx);
-               pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk,    ja->bucket_seq[ja->dirty_idx_ondisk]);
-               pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx,              ja->bucket_seq[ja->dirty_idx]);
-               pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx,                ja->bucket_seq[ja->cur_idx]);
+               prt_printf(out, "dev %u:\n",            i);
+               prt_printf(out, "\tnr\t\t%u\n",         ja->nr);
+               prt_printf(out, "\tbucket size\t%u\n",  ca->mi.bucket_size);
+               prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+               prt_printf(out, "\tdiscard_idx\t%u\n",  ja->discard_idx);
+               prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk,        ja->bucket_seq[ja->dirty_idx_ondisk]);
+               prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx,          ja->bucket_seq[ja->dirty_idx]);
+               prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx,            ja->bucket_seq[ja->cur_idx]);
        }
 
        rcu_read_unlock();
+
+       --out->atomic;
 }
 
 void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
@@ -1277,27 +1378,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        spin_unlock(&j->lock);
 }
 
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *pin;
-       u64 i;
 
        spin_lock(&j->lock);
-       fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
-               pr_buf(out, "%llu: count %u\n",
-                      i, atomic_read(&pin_list->count));
+       *seq = max(*seq, j->pin.front);
 
-               list_for_each_entry(pin, &pin_list->list, list)
-                       pr_buf(out, "\t%px %ps\n",
-                              pin, pin->flush);
+       if (*seq >= j->pin.back) {
+               spin_unlock(&j->lock);
+               return true;
+       }
+
+       out->atomic++;
+
+       pin_list = journal_seq_pin(j, *seq);
+
+       prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
 
-               if (!list_empty(&pin_list->flushed))
-                       pr_buf(out, "flushed:\n");
+       list_for_each_entry(pin, &pin_list->list, list) {
+               prt_printf(out, "\t%px %ps", pin, pin->flush);
+               prt_newline(out);
+       }
+
+       list_for_each_entry(pin, &pin_list->key_cache_list, list) {
+               prt_printf(out, "\t%px %ps", pin, pin->flush);
+               prt_newline(out);
+       }
+
+       if (!list_empty(&pin_list->flushed)) {
+               prt_printf(out, "flushed:");
+               prt_newline(out);
+       }
 
-               list_for_each_entry(pin, &pin_list->flushed, list)
-                       pr_buf(out, "\t%px %ps\n",
-                              pin, pin->flush);
+       list_for_each_entry(pin, &pin_list->flushed, list) {
+               prt_printf(out, "\t%px %ps", pin, pin->flush);
+               prt_newline(out);
        }
+
+       printbuf_indent_sub(out, 2);
+
+       --out->atomic;
        spin_unlock(&j->lock);
+
+       return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+       u64 seq = 0;
+
+       while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+               seq++;
 }
index b298873212d2e598dff056b4328f4ce5a3a8e0f4..9428f4233997b0072f7fd473cf45b8e6575b80d0 100644 (file)
  */
 
 #include <linux/hash.h>
+#include <linux/prefetch.h>
 
 #include "journal_types.h"
 
@@ -141,7 +142,10 @@ static inline u64 journal_cur_seq(struct journal *j)
        return j->pin.back - 1;
 }
 
-void bch2_journal_set_has_inum(struct journal *, u64, u64);
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+       return j->seq_ondisk + 1;
+}
 
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
@@ -196,9 +200,9 @@ journal_res_entry(struct journal *j, struct journal_res *res)
        return vstruct_idx(j->buf[res->idx].data, res->offset);
 }
 
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
                                          enum btree_id id, unsigned level,
-                                         const void *data, unsigned u64s)
+                                         unsigned u64s)
 {
        entry->u64s     = cpu_to_le16(u64s);
        entry->btree_id = id;
@@ -207,32 +211,33 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type
        entry->pad[0]   = 0;
        entry->pad[1]   = 0;
        entry->pad[2]   = 0;
-       memcpy_u64s_small(entry->_data, data, u64s);
-
        return jset_u64s(u64s);
 }
 
-static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
-                                         unsigned type, enum btree_id id,
-                                         unsigned level,
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+                                         enum btree_id id, unsigned level,
                                          const void *data, unsigned u64s)
 {
-       unsigned actual = journal_entry_set(journal_res_entry(j, res),
-                              type, id, level, data, u64s);
+       unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+       memcpy_u64s_small(entry->_data, data, u64s);
+       return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+                        unsigned type, enum btree_id id,
+                        unsigned level, unsigned u64s)
+{
+       struct jset_entry *entry = journal_res_entry(j, res);
+       unsigned actual = journal_entry_init(entry, type, id, level, u64s);
 
        EBUG_ON(!res->ref);
        EBUG_ON(actual > res->u64s);
 
        res->offset     += actual;
        res->u64s       -= actual;
-}
-
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
-                                       enum btree_id id, unsigned level,
-                                       const struct bkey_i *k)
-{
-       bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
-                              id, level, k, k->k.u64s);
+       return entry;
 }
 
 static inline bool journal_entry_empty(struct jset *j)
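
A hedged sketch of what a caller does under the reworked bch2_journal_add_entry() above, which now hands back the reserved jset_entry instead of copying the payload (the helper name below is illustrative, not from the patch, and it assumes the usual bcachefs headers are in scope):

    static void add_btree_key(struct journal *j, struct journal_res *res,
                              enum btree_id id, unsigned level,
                              const struct bkey_i *k)
    {
            /* reserve space for the entry header plus the key itself */
            struct jset_entry *entry =
                    bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
                                           id, level, k->k.u64s);

            /* copying the payload is now the caller's job */
            memcpy_u64s_small(entry->_data, k, k->k.u64s);
    }
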
@@ -261,9 +266,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
                                    .buf3_count = idx == 3,
                                    }).v, &j->reservations.counter);
 
-       EBUG_ON(((s.idx - idx) & 3) >
-               ((s.idx - s.unwritten_idx) & 3));
-
        if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
                __bch2_journal_buf_put(j);
 }
@@ -283,7 +285,7 @@ static inline void bch2_journal_res_put(struct journal *j,
        while (res->u64s)
                bch2_journal_add_entry(j, res,
                                       BCH_JSET_ENTRY_btree_keys,
-                                      0, 0, NULL, 0);
+                                      0, 0, 0);
 
        bch2_journal_buf_put(j, res->idx);
 
@@ -293,9 +295,9 @@ static inline void bch2_journal_res_put(struct journal *j,
 int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
                                  unsigned);
 
-#define JOURNAL_RES_GET_NONBLOCK       (1 << 0)
-#define JOURNAL_RES_GET_CHECK          (1 << 1)
-#define JOURNAL_RES_GET_RESERVED       (1 << 2)
+/* First two bits for JOURNAL_WATERMARK: */
+#define JOURNAL_RES_GET_NONBLOCK       (1 << 2)
+#define JOURNAL_RES_GET_CHECK          (1 << 3)
 
 static inline int journal_res_get_fast(struct journal *j,
                                       struct journal_res *res,
@@ -303,24 +305,34 @@ static inline int journal_res_get_fast(struct journal *j,
 {
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
+       unsigned u64s, offset;
 
        do {
                old.v = new.v = v;
 
+               /*
+                * Round up the end of the journal reservation to the next
+                * cacheline boundary:
+                */
+               u64s = res->u64s;
+               offset = sizeof(struct jset) / sizeof(u64) +
+                         new.cur_entry_offset + u64s;
+               u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1;
+
+
                /*
                 * Check if there is still room in the current journal
                 * entry:
                 */
-               if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
+               if (new.cur_entry_offset + u64s > j->cur_entry_u64s)
                        return 0;
 
                EBUG_ON(!journal_state_count(new, new.idx));
 
-               if (!(flags & JOURNAL_RES_GET_RESERVED) &&
-                   !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+               if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark)
                        return 0;
 
-               new.cur_entry_offset += res->u64s;
+               new.cur_entry_offset += u64s;
                journal_state_inc(&new);
 
                /*
@@ -337,8 +349,15 @@ static inline int journal_res_get_fast(struct journal *j,
 
        res->ref        = true;
        res->idx        = old.idx;
+       res->u64s       = u64s;
        res->offset     = old.cur_entry_offset;
        res->seq        = le64_to_cpu(j->buf[old.idx].data->seq);
+
+       offset = res->offset;
+       while (offset < res->offset + res->u64s) {
+               prefetchw(vstruct_idx(j->buf[res->idx].data, offset));
+               offset += SMP_CACHE_BYTES / sizeof(u64);
+       }
        return 1;
 }
 
@@ -370,23 +389,27 @@ out:
 
 /* journal_preres: */
 
-static inline bool journal_check_may_get_unreserved(struct journal *j)
+static inline void journal_set_watermark(struct journal *j)
 {
        union journal_preres_state s = READ_ONCE(j->prereserved);
-       bool ret = s.reserved < s.remaining &&
-               fifo_free(&j->pin) > 8;
-
-       lockdep_assert_held(&j->lock);
-
-       if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
-               if (ret) {
-                       set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
-                       journal_wake(j);
-               } else {
-                       clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
-               }
-       }
-       return ret;
+       unsigned watermark = JOURNAL_WATERMARK_any;
+
+       if (fifo_free(&j->pin) < j->pin.size / 4)
+               watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+       if (fifo_free(&j->pin) < j->pin.size / 8)
+               watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+       if (s.reserved > s.remaining)
+               watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+       if (!s.remaining)
+               watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+       if (watermark == j->watermark)
+               return;
+
+       swap(watermark, j->watermark);
+       if (watermark > j->watermark)
+               journal_wake(j);
 }
 
 static inline void bch2_journal_preres_put(struct journal *j,
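
Roughly, the watermark scheme above replaces the old JOURNAL_MAY_GET_UNRESERVED bit: journal_set_watermark() raises j->watermark as the pin FIFO or the prereservations run low, and journal_res_get_fast() refuses reservations whose flags do not carry at least that watermark. A small illustration, assuming the enum ordering any < copygc < reserved that journal_set_watermark() relies on:

    /* Mirrors the check in journal_res_get_fast(); illustrative only. */
    static bool res_allowed(unsigned flags, unsigned journal_watermark)
    {
            return (flags & JOURNAL_WATERMARK_MASK) >= journal_watermark;
    }

    /*
     * With the pin FIFO more than 3/4 full, j->watermark becomes
     * JOURNAL_WATERMARK_copygc, so:
     *   res_allowed(JOURNAL_WATERMARK_any,      copygc) == false  (stalls)
     *   res_allowed(JOURNAL_WATERMARK_copygc,   copygc) == true
     *   res_allowed(JOURNAL_WATERMARK_reserved, copygc) == true
     */
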
@@ -406,12 +429,8 @@ static inline void bch2_journal_preres_put(struct journal *j,
                closure_wake_up(&j->preres_wait);
        }
 
-       if (s.reserved <= s.remaining &&
-           !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
-               spin_lock(&j->lock);
-               journal_check_may_get_unreserved(j);
-               spin_unlock(&j->lock);
-       }
+       if (s.reserved <= s.remaining && j->watermark)
+               journal_set_watermark(j);
 }
 
 int __bch2_journal_preres_get(struct journal *,
@@ -432,8 +451,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
                old.v = new.v = v;
                ret = 0;
 
-               if ((flags & JOURNAL_RES_GET_RESERVED) ||
-                   test_bit(JOURNAL_NOCHANGES, &j->flags) ||
+               if ((flags & JOURNAL_WATERMARK_reserved) ||
                    new.reserved + d < new.remaining) {
                        new.reserved += d;
                        ret = 1;
@@ -479,6 +497,7 @@ int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
 bool bch2_journal_noflush_seq(struct journal *, u64);
 int bch2_journal_meta(struct journal *);
+int bch2_journal_log_msg(struct journal *, const char *, ...);
 
 void bch2_journal_halt(struct journal *);
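
A hedged usage sketch for the new bch2_journal_log_msg() declared above: it takes a printf-style message, and per the log entry handling in journal_io.c the text ends up in a BCH_JSET_ENTRY_log journal entry. The message below is made up, and c is assumed to be a struct bch_fs pointer:

    bch2_journal_log_msg(&c->journal, "recovery: starting journal replay");
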
 
@@ -502,6 +521,7 @@ void bch2_journal_block(struct journal *);
 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 
 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
                                unsigned nr);
@@ -510,7 +530,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
 void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
 
 void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+int bch2_fs_journal_start(struct journal *, u64);
 
 void bch2_dev_journal_exit(struct bch_dev *);
 int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
index b5c204e7c5690616a69171522f6b71e25cad3210..c4922c64065323ebfe4703c7d782a8d2c1acc4c1 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "btree_io.h"
 #include "btree_update_interior.h"
 
 #include <trace/events/bcachefs.h>
 
-static void __journal_replay_free(struct journal_replay *i)
+static struct nonce journal_nonce(const struct jset *jset)
+{
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = ((__le32 *) &jset->seq)[0],
+               [2] = ((__le32 *) &jset->seq)[1],
+               [3] = BCH_NONCE_JOURNAL,
+       }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+{
+       return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+               !bch2_crc_cmp(j->csum,
+                             csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
 {
-       list_del(&i->list);
+       return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+                                 struct journal_replay *i)
+{
+       struct journal_replay **p =
+               genradix_ptr(&c->journal_entries,
+                            journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+       BUG_ON(*p != i);
+       *p = NULL;
        kvpfree(i, offsetof(struct journal_replay, j) +
                vstruct_bytes(&i->j));
-
 }
 
 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -29,13 +57,13 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
        i->ignore = true;
 
        if (!c->opts.read_entire_journal)
-               __journal_replay_free(i);
+               __journal_replay_free(c, i);
 }
 
 struct journal_list {
        struct closure          cl;
+       u64                     last_seq;
        struct mutex            lock;
-       struct list_head        *head;
        int                     ret;
 };
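
The new journal_entry_radix_idx() above maps 64-bit journal sequence numbers onto genradix slots, which are indexed by an unsigned long. A minimal standalone sketch of the same arithmetic (the base sequence and numbers are made up):

    #include <stdint.h>
    #include <stdio.h>

    /* Offset from the base sequence, masked to 31 bits, so indices stay
     * small and positive as long as all entries lie within ~2 billion of
     * the base - the assumption spelled out in the comment further down. */
    static uint32_t radix_idx(uint64_t base_seq, uint64_t seq)
    {
            return (seq - base_seq) & (~0U >> 1);
    }

    int main(void)
    {
            uint64_t base = 9000000000ULL;  /* made-up base sequence */

            printf("%u\n", (unsigned) radix_idx(base, 9000000000ULL)); /* 0 */
            printf("%u\n", (unsigned) radix_idx(base, 9000000123ULL)); /* 123 */
            return 0;
    }
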
 
@@ -47,94 +75,105 @@ struct journal_list {
  * be replayed:
  */
 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
-                            struct bch_extent_ptr entry_ptr,
-                            struct journal_list *jlist, struct jset *j,
-                            bool bad)
+                            struct journal_ptr entry_ptr,
+                            struct journal_list *jlist, struct jset *j)
 {
-       struct journal_replay *i, *pos, *dup = NULL;
-       struct bch_extent_ptr *ptr;
-       struct list_head *where;
+       struct genradix_iter iter;
+       struct journal_replay **_i, *i, *dup;
+       struct journal_ptr *ptr;
        size_t bytes = vstruct_bytes(j);
-       u64 last_seq = 0;
+       u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
        int ret = JOURNAL_ENTRY_ADD_OK;
 
-       list_for_each_entry_reverse(i, jlist->head, list) {
-               if (!JSET_NO_FLUSH(&i->j)) {
-                       last_seq = le64_to_cpu(i->j.last_seq);
-                       break;
-               }
-       }
-
        /* Is this entry older than the range we need? */
        if (!c->opts.read_entire_journal &&
-           le64_to_cpu(j->seq) < last_seq) {
-               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-               goto out;
-       }
+           le64_to_cpu(j->seq) < jlist->last_seq)
+               return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+       /*
+        * genradixes are indexed by a ulong, not a u64, so we can't index them
+        * by sequence number directly: Assume instead that they will all fall
+        * within the range of +-2 billion of the first one we find.
+        */
+       if (!c->journal_entries_base_seq)
+               c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
 
        /* Drop entries we don't need anymore */
-       if (!JSET_NO_FLUSH(j)) {
-               list_for_each_entry_safe(i, pos, jlist->head, list) {
-                       if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+       if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+               genradix_for_each_from(&c->journal_entries, iter, _i,
+                                      journal_entry_radix_idx(c, jlist->last_seq)) {
+                       i = *_i;
+
+                       if (!i || i->ignore)
+                               continue;
+
+                       if (le64_to_cpu(i->j.seq) >= last_seq)
                                break;
                        journal_replay_free(c, i);
                }
        }
 
-       list_for_each_entry_reverse(i, jlist->head, list) {
-               if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
-                       where = &i->list;
-                       goto add;
-               }
-       }
-
-       where = jlist->head;
-add:
-       dup = where->next != jlist->head
-               ? container_of(where->next, struct journal_replay, list)
-               : NULL;
+       jlist->last_seq = max(jlist->last_seq, last_seq);
 
-       if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
-               dup = NULL;
+       _i = genradix_ptr_alloc(&c->journal_entries,
+                               journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+                               GFP_KERNEL);
+       if (!_i)
+               return -ENOMEM;
 
        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
+       dup = *_i;
        if (dup) {
-               if (dup->bad) {
-                       /* we'll replace @dup: */
-               } else if (bad) {
+               if (bytes == vstruct_bytes(&dup->j) &&
+                   !memcmp(j, &dup->j, bytes)) {
                        i = dup;
                        goto found;
-               } else {
-                       fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
-                                   memcmp(j, &dup->j, bytes), c,
-                                   "found duplicate but non identical journal entries (seq %llu)",
-                                   le64_to_cpu(j->seq));
+               }
+
+               if (!entry_ptr.csum_good) {
                        i = dup;
                        goto found;
                }
-       }
 
-       i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
-       if (!i) {
-               ret = -ENOMEM;
-               goto out;
+               if (!dup->csum_good)
+                       goto replace;
+
+               fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+                        le64_to_cpu(j->seq));
+               i = dup;
+               goto found;
        }
+replace:
+       i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+       if (!i)
+               return -ENOMEM;
 
-       i->nr_ptrs       = 0;
-       i->bad          = bad;
+       i->nr_ptrs      = 0;
+       i->csum_good    = entry_ptr.csum_good;
        i->ignore       = false;
        memcpy(&i->j, j, bytes);
+       i->ptrs[i->nr_ptrs++] = entry_ptr;
 
        if (dup) {
-               i->nr_ptrs = dup->nr_ptrs;
-               memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
-               __journal_replay_free(dup);
+               if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+                       bch_err(c, "found too many copies of journal entry %llu",
+                               le64_to_cpu(i->j.seq));
+                       dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+               }
+
+               /* The first ptr should represent the jset we kept: */
+               memcpy(i->ptrs + i->nr_ptrs,
+                      dup->ptrs,
+                      sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+               i->nr_ptrs += dup->nr_ptrs;
+               __journal_replay_free(c, dup);
        }
 
-       list_add(&i->list, where);
+       *_i = i;
+       return 0;
 found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
@@ -156,16 +195,6 @@ fsck_err:
        return ret;
 }
 
-static struct nonce journal_nonce(const struct jset *jset)
-{
-       return (struct nonce) {{
-               [0] = 0,
-               [1] = ((__le32 *) &jset->seq)[0],
-               [2] = ((__le32 *) &jset->seq)[1],
-               [3] = BCH_NONCE_JOURNAL,
-       }};
-}
-
 /* this fills in a range with empty jset_entries: */
 static void journal_entry_null_range(void *start, void *end)
 {
@@ -179,66 +208,84 @@ static void journal_entry_null_range(void *start, void *end)
 #define JOURNAL_ENTRY_NONE     6
 #define JOURNAL_ENTRY_BAD      7
 
-#define journal_entry_err(c, msg, ...)                                 \
+static void journal_entry_err_msg(struct printbuf *out,
+                                 struct jset *jset,
+                                 struct jset_entry *entry)
+{
+       prt_str(out, "invalid journal entry ");
+       if (entry)
+               prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
+
+       if (!jset)
+               prt_printf(out, "in superblock");
+       else if (!entry)
+               prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
+       else
+               prt_printf(out, "at offset %zi/%u seq %llu",
+                          (u64 *) entry - jset->_data,
+                          le32_to_cpu(jset->u64s),
+                          le64_to_cpu(jset->seq));
+       prt_str(out, ": ");
+}
+
+#define journal_entry_err(c, jset, entry, msg, ...)                    \
 ({                                                                     \
+       struct printbuf buf = PRINTBUF;                                 \
+                                                                       \
+       journal_entry_err_msg(&buf, jset, entry);                       \
+       prt_printf(&buf, msg, ##__VA_ARGS__);                           \
+                                                                       \
        switch (write) {                                                \
        case READ:                                                      \
-               mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
+               mustfix_fsck_err(c, "%s", buf.buf);                     \
                break;                                                  \
        case WRITE:                                                     \
-               bch_err(c, "corrupt metadata before write:\n"           \
-                       msg, ##__VA_ARGS__);                            \
+               bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
                if (bch2_fs_inconsistent(c)) {                          \
-                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
                }                                                       \
                break;                                                  \
        }                                                               \
+                                                                       \
+       printbuf_exit(&buf);                                            \
        true;                                                           \
 })
 
-#define journal_entry_err_on(cond, c, msg, ...)                                \
-       ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, c, jset, entry, msg, ...)           \
+       ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
 
 #define FSCK_DELETED_KEY       5
 
-static int journal_validate_key(struct bch_fs *c, const char *where,
+static int journal_validate_key(struct bch_fs *c,
+                               struct jset *jset,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
-                               struct bkey_i *k, const char *type,
+                               struct bkey_i *k,
                                unsigned version, int big_endian, int write)
 {
        void *next = vstruct_next(entry);
-       const char *invalid;
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
-       if (journal_entry_err_on(!k->k.u64s, c,
-                       "invalid %s in %s entry offset %zi/%u: k->u64s 0",
-                       type, where,
-                       (u64 *) k - entry->_data,
-                       le16_to_cpu(entry->u64s))) {
+       if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }
 
        if (journal_entry_err_on((void *) bkey_next(k) >
-                               (void *) vstruct_next(entry), c,
-                       "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
-                       type, where,
-                       (u64 *) k - entry->_data,
-                       le16_to_cpu(entry->u64s))) {
+                                (void *) vstruct_next(entry),
+                                c, jset, entry,
+                                "extends past end of journal entry")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }
 
-       if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
-                       "invalid %s in %s entry offset %zi/%u: bad format %u",
-                       type, where,
-                       (u64 *) k - entry->_data,
-                       le16_to_cpu(entry->u64s),
-                       k->k.format)) {
+       if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+                                c, jset, entry,
+                                "bad format %u", k->k.format)) {
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
@@ -249,21 +296,29 @@ static int journal_validate_key(struct bch_fs *c, const char *where,
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));
 
-       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
-                                   __btree_node_type(level, btree_id));
-       if (invalid) {
-               char buf[160];
+       if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+                             __btree_node_type(level, btree_id), write, &buf)) {
+               printbuf_reset(&buf);
+               prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
+                          bch2_jset_entry_types[entry->type],
+                          (u64 *) entry - jset->_data,
+                          le32_to_cpu(jset->u64s),
+                          le64_to_cpu(jset->seq));
+               prt_newline(&buf);
+               printbuf_indent_add(&buf, 2);
+
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+               prt_newline(&buf);
+               bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+                                 __btree_node_type(level, btree_id), write, &buf);
 
-               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
-               mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
-                                type, where,
-                                (u64 *) k - entry->_data,
-                                le16_to_cpu(entry->u64s),
-                                invalid, buf);
+               mustfix_fsck_err(c, "%s", buf.buf);
 
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
+
+               printbuf_exit(&buf);
                return FSCK_DELETED_KEY;
        }
 
@@ -271,21 +326,22 @@ static int journal_validate_key(struct bch_fs *c, const char *where,
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
 
 static int journal_entry_btree_keys_validate(struct bch_fs *c,
-                                            const char *where,
+                                            struct jset *jset,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
 {
        struct bkey_i *k = entry->start;
 
        while (k != vstruct_last(entry)) {
-               int ret = journal_validate_key(c, where, entry,
+               int ret = journal_validate_key(c, jset, entry,
                                               entry->level,
                                               entry->btree_id,
-                                              k, "key", version, big_endian, write);
+                                              k, version, big_endian, write);
                if (ret == FSCK_DELETED_KEY)
                        continue;
 
@@ -303,17 +359,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
 
        vstruct_for_each(entry, k) {
                if (!first) {
-                       printbuf_newline(out);
-                       pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+                       prt_newline(out);
+                       prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                }
-               pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+               prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
                bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
                first = false;
        }
 }
 
 static int journal_entry_btree_root_validate(struct bch_fs *c,
-                                            const char *where,
+                                            struct jset *jset,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
 {
@@ -321,7 +377,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(!entry->u64s ||
-                                le16_to_cpu(entry->u64s) != k->k.u64s, c,
+                                le16_to_cpu(entry->u64s) != k->k.u64s,
+                                c, jset, entry,
                                 "invalid btree root journal entry: wrong number of keys")) {
                void *next = vstruct_next(entry);
                /*
@@ -334,8 +391,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
                return 0;
        }
 
-       return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
-                                   "btree root", version, big_endian, write);
+       return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
+                                   version, big_endian, write);
 fsck_err:
        return ret;
 }
@@ -347,7 +404,7 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
-                                           const char *where,
+                                           struct jset *jset,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
 {
@@ -361,13 +418,14 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_blacklist_validate(struct bch_fs *c,
-                                           const char *where,
+                                           struct jset *jset,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
 {
        int ret = 0;
 
-       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+                                c, jset, entry,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
@@ -381,18 +439,19 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs
        struct jset_entry_blacklist *bl =
                container_of(entry, struct jset_entry_blacklist, entry);
 
-       pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+       prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
 }
 
 static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
-                                              const char *where,
+                                              struct jset *jset,
                                               struct jset_entry *entry,
                                               unsigned version, int big_endian, int write)
 {
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;
 
-       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+                                c, jset, entry,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                goto out;
@@ -401,7 +460,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
        bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
 
        if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
-                                le64_to_cpu(bl_entry->end), c,
+                                le64_to_cpu(bl_entry->end),
+                                c, jset, entry,
                "invalid journal seq blacklist entry: start > end")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
@@ -416,13 +476,13 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_
        struct jset_entry_blacklist_v2 *bl =
                container_of(entry, struct jset_entry_blacklist_v2, entry);
 
-       pr_buf(out, "start=%llu end=%llu",
+       prt_printf(out, "start=%llu end=%llu",
               le64_to_cpu(bl->start),
               le64_to_cpu(bl->end));
 }
 
 static int journal_entry_usage_validate(struct bch_fs *c,
-                                       const char *where,
+                                       struct jset *jset,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
 {
@@ -432,7 +492,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes < sizeof(*u),
-                                c,
+                                c, jset, entry,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -448,13 +508,13 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
 
-       pr_buf(out, "type=%s v=%llu",
+       prt_printf(out, "type=%s v=%llu",
               bch2_fs_usage_types[u->entry.btree_id],
               le64_to_cpu(u->v));
 }
 
 static int journal_entry_data_usage_validate(struct bch_fs *c,
-                                       const char *where,
+                                       struct jset *jset,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
 {
@@ -465,7 +525,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
 
        if (journal_entry_err_on(bytes < sizeof(*u) ||
                                 bytes < sizeof(*u) + u->r.nr_devs,
-                                c,
+                                c, jset, entry,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -482,11 +542,11 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs
                container_of(entry, struct jset_entry_data_usage, entry);
 
        bch2_replicas_entry_to_text(out, &u->r);
-       pr_buf(out, "=%llu", le64_to_cpu(u->v));
+       prt_printf(out, "=%llu", le64_to_cpu(u->v));
 }
 
 static int journal_entry_clock_validate(struct bch_fs *c,
-                                       const char *where,
+                                       struct jset *jset,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
 {
@@ -496,13 +556,13 @@ static int journal_entry_clock_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes != sizeof(*clock),
-                                c, "invalid journal entry clock: bad size")) {
+                                c, jset, entry, "bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
 
        if (journal_entry_err_on(clock->rw > 1,
-                                c, "invalid journal entry clock: bad rw")) {
+                                c, jset, entry, "bad rw")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
@@ -517,11 +577,11 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);
 
-       pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+       prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
 }
 
 static int journal_entry_dev_usage_validate(struct bch_fs *c,
-                                           const char *where,
+                                           struct jset *jset,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
 {
@@ -533,7 +593,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes < expected,
-                                c, "invalid journal entry dev usage: bad size (%u < %u)",
+                                c, jset, entry, "bad size (%u < %u)",
                                 bytes, expected)) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -542,13 +602,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
        dev = le32_to_cpu(u->dev);
 
        if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
-                                c, "invalid journal entry dev usage: bad dev")) {
+                                c, jset, entry, "bad dev")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
 
        if (journal_entry_err_on(u->pad,
-                                c, "invalid journal entry dev usage: bad pad")) {
+                                c, jset, entry, "bad pad")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
@@ -564,26 +624,24 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
 
-       pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+       prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
 
        for (i = 0; i < nr_types; i++) {
                if (i < BCH_DATA_NR)
-                       pr_buf(out, " %s", bch2_data_types[i]);
+                       prt_printf(out, " %s", bch2_data_types[i]);
                else
-                       pr_buf(out, " (unknown data type %u)", i);
-               pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+                       prt_printf(out, " (unknown data type %u)", i);
+               prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
                       le64_to_cpu(u->d[i].buckets),
                       le64_to_cpu(u->d[i].sectors),
                       le64_to_cpu(u->d[i].fragmented));
        }
 
-       pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
-              le64_to_cpu(u->buckets_ec),
-              le64_to_cpu(u->buckets_unavailable));
+       prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
 }
 
 static int journal_entry_log_validate(struct bch_fs *c,
-                                     const char *where,
+                                     struct jset *jset,
                                      struct jset_entry *entry,
                                      unsigned version, int big_endian, int write)
 {
@@ -596,11 +654,25 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
        struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
        unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
 
-       bch_scnmemcpy(out, l->d, strnlen(l->d, bytes));
+       prt_printf(out, "%.*s", bytes, l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c,
+                                     struct jset *jset,
+                                     struct jset_entry *entry,
+                                     unsigned version, int big_endian, int write)
+{
+       return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+                                           struct jset_entry *entry)
+{
+       journal_entry_btree_keys_to_text(out, c, entry);
 }
 
 struct jset_entry_ops {
-       int (*validate)(struct bch_fs *, const char *,
+       int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, unsigned, int, int);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
 };
@@ -615,12 +687,13 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 #undef x
 };
 
-int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+int bch2_journal_entry_validate(struct bch_fs *c,
+                               struct jset *jset,
                                struct jset_entry *entry,
                                unsigned version, int big_endian, int write)
 {
        return entry->type < BCH_JSET_ENTRY_NR
-               ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+               ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
                                version, big_endian, write)
                : 0;
 }
@@ -629,34 +702,28 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
                                struct jset_entry *entry)
 {
        if (entry->type < BCH_JSET_ENTRY_NR) {
-               pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+               prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
        } else {
-               pr_buf(out, "(unknown type %u)", entry->type);
+               prt_printf(out, "(unknown type %u)", entry->type);
        }
 }
 
 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
 {
-       char buf[100];
        struct jset_entry *entry;
        int ret = 0;
 
        vstruct_for_each(jset, entry) {
-               scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
-                         le64_to_cpu(jset->seq),
-                         (u64 *) entry - jset->_data,
-                         le32_to_cpu(jset->u64s));
-
                if (journal_entry_err_on(vstruct_next(entry) >
-                                        vstruct_last(jset), c,
+                                        vstruct_last(jset), c, jset, entry,
                                "journal entry extends past end of jset")) {
                        jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
                        break;
                }
 
-               ret = bch2_journal_entry_validate(c, buf, entry,
+               ret = bch2_journal_entry_validate(c, jset, entry,
                                        le32_to_cpu(jset->version),
                                        JSET_BIG_ENDIAN(jset), write);
                if (ret)
@@ -669,12 +736,8 @@ fsck_err:
 static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
-                        unsigned bucket_sectors_left,
-                        unsigned sectors_read,
                         int write)
 {
-       size_t bytes = vstruct_bytes(jset);
-       struct bch_csum csum;
        unsigned version;
        int ret = 0;
 
@@ -684,70 +747,80 @@ static int jset_validate(struct bch_fs *c,
        version = le32_to_cpu(jset->version);
        if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
                                  version < bcachefs_metadata_version_min) ||
-                                version >= bcachefs_metadata_version_max, c,
+                                version >= bcachefs_metadata_version_max,
+                                c, jset, NULL,
                        "%s sector %llu seq %llu: unknown journal entry version %u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        version)) {
                /* don't try to continue: */
-               return EINVAL;
+               return -EINVAL;
        }
 
-       if (bytes > (sectors_read << 9) &&
-           sectors_read < bucket_sectors_left)
-               return JOURNAL_ENTRY_REREAD;
-
-       if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
-                       "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
-                       ca ? ca->name : c->name,
-                       sector, le64_to_cpu(jset->seq), bytes)) {
-               ret = JOURNAL_ENTRY_BAD;
-               le32_add_cpu(&jset->u64s,
-                            -((bytes - (bucket_sectors_left << 9)) / 8));
-       }
-
-       if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+       if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+                                c, jset, NULL,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
-                       JSET_CSUM_TYPE(jset))) {
+                       JSET_CSUM_TYPE(jset)))
                ret = JOURNAL_ENTRY_BAD;
-               goto csum_done;
-       }
 
-       if (write)
-               goto csum_done;
-
-       csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
-       if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
-                                "%s sector %llu seq %llu: journal checksum bad",
-                                ca ? ca->name : c->name,
-                                sector, le64_to_cpu(jset->seq)))
-               ret = JOURNAL_ENTRY_BAD;
-
-       bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
-                    jset->encrypted_start,
-                    vstruct_end(jset) - (void *) jset->encrypted_start);
-csum_done:
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
-                                le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+                                le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
+                                c, jset, NULL,
                                 "invalid journal entry: last_seq > seq (%llu > %llu)",
                                 le64_to_cpu(jset->last_seq),
                                 le64_to_cpu(jset->seq))) {
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
+
+       ret = jset_validate_entries(c, jset, write);
 fsck_err:
        return ret;
 }
 
-static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+static int jset_validate_early(struct bch_fs *c,
+                        struct bch_dev *ca,
+                        struct jset *jset, u64 sector,
+                        unsigned bucket_sectors_left,
+                        unsigned sectors_read)
 {
-       unsigned sectors = vstruct_sectors(jset, c->block_bits);
+       size_t bytes = vstruct_bytes(jset);
+       unsigned version;
+       int write = READ;
+       int ret = 0;
+
+       if (le64_to_cpu(jset->magic) != jset_magic(c))
+               return JOURNAL_ENTRY_NONE;
 
-       return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
-               jset_validate_entries(c, jset, WRITE);
+       version = le32_to_cpu(jset->version);
+       if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+                                 version < bcachefs_metadata_version_min) ||
+                                version >= bcachefs_metadata_version_max,
+                                c, jset, NULL,
+                       "%s sector %llu seq %llu: unknown journal entry version %u",
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq),
+                       version)) {
+               /* don't try to continue: */
+               return -EINVAL;
+       }
+
+       if (bytes > (sectors_read << 9) &&
+           sectors_read < bucket_sectors_left)
+               return JOURNAL_ENTRY_REREAD;
+
+       if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+                                c, jset, NULL,
+                       "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq), bytes))
+               le32_add_cpu(&jset->u64s,
+                            -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+       return ret;
 }
 
 struct journal_read_buf {
@@ -786,7 +859,7 @@ static int journal_read_bucket(struct bch_dev *ca,
        unsigned sectors, sectors_read = 0;
        u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
            end = offset + ca->mi.bucket_size;
-       bool saw_bad = false;
+       bool saw_bad = false, csum_good;
        int ret = 0;
 
        pr_debug("reading %u", bucket);
@@ -794,20 +867,20 @@ static int journal_read_bucket(struct bch_dev *ca,
        while (offset < end) {
                if (!sectors_read) {
                        struct bio *bio;
+                       unsigned nr_bvecs;
 reread:
                        sectors_read = min_t(unsigned,
                                end - offset, buf->size >> 9);
+                       nr_bvecs = buf_pages(buf->data, sectors_read << 9);
 
-                       bio = bio_kmalloc(GFP_KERNEL,
-                                         buf_pages(buf->data,
-                                                   sectors_read << 9));
-                       bio_set_dev(bio, ca->disk_sb.bdev);
-                       bio->bi_iter.bi_sector  = offset;
-                       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+                       bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+                       bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+                       bio->bi_iter.bi_sector = offset;
                        bch2_bio_map(bio, buf->data, sectors_read << 9);
 
                        ret = submit_bio_wait(bio);
-                       bio_put(bio);
+                       kfree(bio);
 
                        if (bch2_dev_io_err_on(ret, ca,
                                               "journal read error: sector %llu",
@@ -825,11 +898,10 @@ reread:
                        j = buf->data;
                }
 
-               ret = jset_validate(c, ca, j, offset,
-                                   end - offset, sectors_read,
-                                   READ);
+               ret = jset_validate_early(c, ca, j, offset,
+                                   end - offset, sectors_read);
                switch (ret) {
-               case BCH_FSCK_OK:
+               case 0:
                        sectors = vstruct_sectors(j, c->block_bits);
                        break;
                case JOURNAL_ENTRY_REREAD:
@@ -843,17 +915,13 @@ reread:
                case JOURNAL_ENTRY_NONE:
                        if (!saw_bad)
                                return 0;
-                       sectors = block_sectors(c);
-                       goto next_block;
-               case JOURNAL_ENTRY_BAD:
-                       saw_bad = true;
                        /*
                         * On checksum error we don't really trust the size
                         * field of the journal entry we read, so try reading
                         * again at next block boundary:
                         */
                        sectors = block_sectors(c);
-                       break;
+                       goto next_block;
                default:
                        return ret;
                }
@@ -869,11 +937,25 @@ reread:
 
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
+               csum_good = jset_csum_good(c, j);
+               if (!csum_good)
+                       saw_bad = true;
+
+               ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+                            j->encrypted_start,
+                            vstruct_end(j) - (void *) j->encrypted_start);
+               bch2_fs_fatal_err_on(ret, c,
+                               "error decrypting journal entry: %i", ret);
+
                mutex_lock(&jlist->lock);
-               ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
-                                       .dev = ca->dev_idx,
-                                       .offset = offset,
-                                       }, jlist, j, ret != 0);
+               ret = journal_entry_add(c, ca, (struct journal_ptr) {
+                                       .csum_good      = csum_good,
+                                       .dev            = ca->dev_idx,
+                                       .bucket         = bucket,
+                                       .bucket_offset  = offset -
+                                               bucket_to_sector(ca, ja->buckets[bucket]),
+                                       .sector         = offset,
+                                       }, jlist, j);
                mutex_unlock(&jlist->lock);
 
                switch (ret) {
@@ -902,6 +984,8 @@ static void bch2_journal_read_device(struct closure *cl)
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
+       struct journal_replay *r, **_r;
+       struct genradix_iter iter;
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
@@ -937,11 +1021,42 @@ static void bch2_journal_read_device(struct closure *cl)
         * allocate
         */
        while (ja->bucket_seq[ja->cur_idx] > min_seq &&
-              ja->bucket_seq[ja->cur_idx] >
+              ja->bucket_seq[ja->cur_idx] ==
               ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
                ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 
-       ja->sectors_free = 0;
+       ja->sectors_free = ca->mi.bucket_size;
+
+       mutex_lock(&jlist->lock);
+       genradix_for_each(&c->journal_entries, iter, _r) {
+               r = *_r;
+
+               if (!r)
+                       continue;
+
+               for (i = 0; i < r->nr_ptrs; i++) {
+                       if (r->ptrs[i].dev == ca->dev_idx &&
+                           sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+                               unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+                                       vstruct_sectors(&r->j, c->block_bits);
+
+                               ja->sectors_free = min(ja->sectors_free,
+                                                      ca->mi.bucket_size - wrote);
+                       }
+               }
+       }
+       mutex_unlock(&jlist->lock);
+
+       if (ja->bucket_seq[ja->cur_idx] &&
+           ja->sectors_free == ca->mi.bucket_size) {
+               bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+               bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+               for (i = 0; i < 3; i++) {
+                       unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+                       bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+               }
+               ja->sectors_free = 0;
+       }
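The scan above derives how much of the current journal bucket is already in use from the entries just read: for every replay pointer that lands in ja->buckets[ja->cur_idx], the end of that entry is its offset within the bucket plus its size in sectors. A worked sketch of the per-pointer arithmetic, with illustrative numbers:

	/* e.g. an entry found 96 sectors into a 512-sector bucket, 32 sectors long: */
	unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector)     /* 96   */
		       + vstruct_sectors(&r->j, c->block_bits);      /* + 32 */

	ja->sectors_free = min(ja->sectors_free,
			       ca->mi.bucket_size - wrote);          /* 512 - 128 = 384 */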
 
        /*
         * Set dirty_idx to indicate the entire journal is full and needs to be
@@ -963,8 +1078,8 @@ err:
        goto out;
 }
 
-static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-                                     struct journal_replay *j)
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                              struct journal_replay *j)
 {
        unsigned i;
 
@@ -972,23 +1087,26 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
                u64 offset;
 
-               div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+               div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
 
                if (i)
-                       pr_buf(out, " ");
-               pr_buf(out, "%u:%llu (offset %llu)",
+                       prt_printf(out, " ");
+               prt_printf(out, "%u:%u:%u (sector %llu)",
                       j->ptrs[i].dev,
-                      (u64) j->ptrs[i].offset, offset);
+                      j->ptrs[i].bucket,
+                      j->ptrs[i].bucket_offset,
+                      j->ptrs[i].sector);
        }
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
-                     u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i, *t;
+       struct journal_replay *i, **_i, *prev = NULL;
+       struct genradix_iter radix_iter;
        struct bch_dev *ca;
        unsigned iter;
+       struct printbuf buf = PRINTBUF;
        size_t keys = 0, entries = 0;
        bool degraded = false;
        u64 seq, last_seq = 0;
@@ -996,11 +1114,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
-       jlist.head = list;
+       jlist.last_seq = 0;
        jlist.ret = 0;
 
        for_each_member_device(ca, c, iter) {
-               if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+               if (!c->opts.fsck &&
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;
 
@@ -1020,23 +1138,30 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        if (jlist.ret)
                return jlist.ret;
 
-       if (list_empty(list)) {
-               bch_info(c, "journal read done, but no entries found");
-               return 0;
-       }
-
-       i = list_last_entry(list, struct journal_replay, list);
-       *start_seq = le64_to_cpu(i->j.seq) + 1;
+       *start_seq = 0;
 
        /*
         * Find most recent flush entry, and ignore newer non flush entries -
         * those entries will be blacklisted:
         */
-       list_for_each_entry_safe_reverse(i, t, list, list) {
-               if (i->ignore)
+       genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
+               if (!*start_seq)
+                       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
                if (!JSET_NO_FLUSH(&i->j)) {
+                       int write = READ;
+                       if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+                                                c, &i->j, NULL,
+                                                "invalid journal entry: last_seq > seq (%llu > %llu)",
+                                                le64_to_cpu(i->j.last_seq),
+                                                le64_to_cpu(i->j.seq)))
+                               i->j.last_seq = i->j.seq;
+
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                        break;
@@ -1045,14 +1170,22 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                journal_replay_free(c, i);
        }
 
+       if (!*start_seq) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
        if (!last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
-               return -1;
+               ret = -1;
+               goto err;
        }
 
        /* Drop blacklisted entries and entries older than last_seq: */
-       list_for_each_entry_safe(i, t, list, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                seq = le64_to_cpu(i->j.seq);
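Throughout this patch the journal replay list is replaced by a genradix keyed by sequence number, so iteration yields a pointer-to-pointer, and slots that were never filled (or were freed) read back as NULL. The idiom, shown once in isolation as a sketch (the body stands in for the per-entry work):

	struct genradix_iter iter;
	struct journal_replay *i, **_i;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;
		if (!i || i->ignore)
			continue;
		/* ... operate on i->j ... */
	}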
@@ -1071,15 +1204,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
        /* Check for missing entries: */
        seq = last_seq;
-       list_for_each_entry(i, list, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                BUG_ON(seq > le64_to_cpu(i->j.seq));
 
                while (seq < le64_to_cpu(i->j.seq)) {
                        u64 missing_start, missing_end;
-                       char buf1[200], buf2[200];
+                       struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
 
                        while (seq < le64_to_cpu(i->j.seq) &&
                               bch2_journal_seq_is_blacklisted(c, seq, false))
@@ -1094,15 +1229,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;
 
-                       if (i->list.prev != list) {
-                               struct printbuf out = PBUF(buf1);
-                               struct journal_replay *p = list_prev_entry(i, list);
-
-                               bch2_journal_ptrs_to_text(&out, c, p);
-                               pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+                       if (prev) {
+                               bch2_journal_ptrs_to_text(&buf1, c, prev);
+                               prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
                        } else
-                               sprintf(buf1, "(none)");
-                       bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+                               prt_printf(&buf1, "(none)");
+                       bch2_journal_ptrs_to_text(&buf2, c, i);
 
                        missing_end = seq - 1;
                        fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
@@ -1110,13 +1242,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                                 "  next at %s",
                                 missing_start, missing_end,
                                 last_seq, *blacklist_seq - 1,
-                                buf1, buf2);
+                                buf1.buf, buf2.buf);
+
+                       printbuf_exit(&buf1);
+                       printbuf_exit(&buf2);
                }
 
+               prev = i;
                seq++;
        }
 
-       list_for_each_entry(i, list, list) {
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
@@ -1124,14 +1260,28 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                        .e.nr_required = 1,
                };
                unsigned ptr;
-               char buf[80];
 
-               if (i->ignore)
+               i = *_i;
+               if (!i || i->ignore)
                        continue;
 
-               ret = jset_validate_entries(c, &i->j, READ);
+               for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+                       if (!i->ptrs[ptr].csum_good)
+                               printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n",
+                                      ca->name, i->ptrs[ptr].sector,
+                                      le64_to_cpu(i->j.seq),
+                                      i->csum_good ? " (had good copy on another device)" : "");
+               }
+
+               ret = jset_validate(c,
+                                   bch_dev_bkey_exists(c, i->ptrs[0].dev),
+                                   &i->j,
+                                   i->ptrs[0].sector,
+                                   READ);
                if (ret)
-                       goto fsck_err;
+                       goto err;
 
                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
@@ -1143,15 +1293,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                 * the devices - this is wrong:
                 */
 
+               printbuf_reset(&buf);
+               bch2_replicas_entry_to_text(&buf, &replicas.e);
+
                if (!degraded &&
-                   (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
-                                "superblock not marked as containing replicas %s",
-                                (bch2_replicas_entry_to_text(&PBUF(buf),
-                                                             &replicas.e), buf)))) {
+                   fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
+                               "superblock not marked as containing replicas %s",
+                               buf.buf)) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
-                               return ret;
+                               goto err;
                }
 
                for_each_jset_key(k, _n, entry, &i->j)
@@ -1165,7 +1316,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        if (*start_seq != *blacklist_seq)
                bch_info(c, "dropped unflushed entries %llu-%llu",
                         *blacklist_seq, *start_seq - 1);
+err:
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -1292,49 +1445,6 @@ done:
        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
 }
 
-static void journal_write_compact(struct jset *jset)
-{
-       struct jset_entry *i, *next, *prev = NULL;
-
-       /*
-        * Simple compaction, dropping empty jset_entries (from journal
-        * reservations that weren't fully used) and merging jset_entries that
-        * can be.
-        *
-        * If we wanted to be really fancy here, we could sort all the keys in
-        * the jset and drop keys that were overwritten - probably not worth it:
-        */
-       vstruct_for_each_safe(jset, i, next) {
-               unsigned u64s = le16_to_cpu(i->u64s);
-
-               /* Empty entry: */
-               if (!u64s)
-                       continue;
-
-               /* Can we merge with previous entry? */
-               if (prev &&
-                   i->btree_id == prev->btree_id &&
-                   i->level    == prev->level &&
-                   i->type     == prev->type &&
-                   i->type     == BCH_JSET_ENTRY_btree_keys &&
-                   le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
-                       memmove_u64s_down(vstruct_next(prev),
-                                         i->_data,
-                                         u64s);
-                       le16_add_cpu(&prev->u64s, u64s);
-                       continue;
-               }
-
-               /* Couldn't merge, move i into new position (after prev): */
-               prev = prev ? vstruct_next(prev) : jset->start;
-               if (i != prev)
-                       memmove_u64s_down(prev, i, jset_u64s(u64s));
-       }
-
-       prev = prev ? vstruct_next(prev) : jset->start;
-       jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 {
        /* we aren't holding j->lock: */
@@ -1360,7 +1470,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 
 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
 {
-       return j->buf + j->reservations.unwritten_idx;
+       return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
 }
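journal_last_unwritten_buf() now indexes the buffer ring by sequence number rather than by reservations.unwritten_idx: the low bits of a journal sequence select its slot in j->buf[]. A small sketch of the mapping, using a hypothetical helper and assuming JOURNAL_BUF_MASK is the usual power-of-two mask:

	/* e.g. with four buffers, seqs 8, 9, 10, 11 map to slots 0, 1, 2, 3 */
	static inline struct journal_buf *buf_for_seq(struct journal *j, u64 seq)
	{
		return j->buf + (seq & JOURNAL_BUF_MASK);
	}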
 
 static void journal_write_done(struct closure *cl)
@@ -1397,15 +1507,18 @@ static void journal_write_done(struct closure *cl)
                journal_seq_pin(j, seq)->devs = w->devs_written;
 
        if (!err) {
-               j->seq_ondisk           = seq;
-
                if (!JSET_NO_FLUSH(w->data)) {
                        j->flushed_seq_ondisk = seq;
                        j->last_seq_ondisk = w->last_seq;
+
+                       bch2_do_discards(c);
+                       closure_wake_up(&c->freelist_wait);
                }
        } else if (!j->err_seq || seq < j->err_seq)
                j->err_seq      = seq;
 
+       j->seq_ondisk           = seq;
+
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
         * more buckets:
@@ -1413,7 +1526,8 @@ static void journal_write_done(struct closure *cl)
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
-       journal_reclaim_kick(&c->journal);
+       if (j->watermark)
+               journal_reclaim_kick(&c->journal);
 
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
@@ -1421,7 +1535,7 @@ static void journal_write_done(struct closure *cl)
        v = atomic64_read(&j->reservations.counter);
        do {
                old.v = new.v = v;
-               BUG_ON(new.idx == new.unwritten_idx);
+               BUG_ON(journal_state_count(new, new.unwritten_idx));
 
                new.unwritten_idx++;
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -1432,13 +1546,24 @@ static void journal_write_done(struct closure *cl)
        closure_wake_up(&w->wait);
        journal_wake(j);
 
-       if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
-               mod_delayed_work(c->io_complete_wq, &j->write_work, 0);
-       spin_unlock(&j->lock);
-
-       if (new.unwritten_idx != new.idx &&
-           !journal_state_count(new, new.unwritten_idx))
+       if (!journal_state_count(new, new.unwritten_idx) &&
+           journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
                closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+       } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+                  new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+               struct journal_buf *buf = journal_cur_buf(j);
+               long delta = buf->expires - jiffies;
+
+               /*
+                * We don't close a journal entry to write it while there are
+                * previous entries still in flight - the current journal entry
+                * might want to be written now:
+                */
+
+               mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+       }
+
+       spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1483,12 +1608,10 @@ static void do_journal_write(struct closure *cl)
                             sectors);
 
                bio = ca->journal.bio;
-               bio_reset(bio);
-               bio_set_dev(bio, ca->disk_sb.bdev);
+               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
-               bio->bi_opf             = REQ_OP_WRITE|REQ_SYNC|REQ_META;
 
                BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
                ca->prev_journal_sector = bio->bi_iter.bi_sector;
@@ -1500,7 +1623,7 @@ static void do_journal_write(struct closure *cl)
 
                bch2_bio_map(bio, w->data, sectors << 9);
 
-               trace_journal_write(bio);
+               trace_and_count(c, journal_write, bio);
                closure_bio_submit(bio, cl);
 
                ca->journal.bucket_seq[ca->journal.cur_idx] =
@@ -1520,7 +1643,7 @@ void bch2_journal_write(struct closure *cl)
        struct jset_entry *start, *end;
        struct jset *jset;
        struct bio *bio;
-       char *journal_debug_buf = NULL;
+       struct printbuf journal_debug_buf = PRINTBUF;
        bool validate_before_checksum = false;
        unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
        int ret;
@@ -1533,11 +1656,11 @@ void bch2_journal_write(struct closure *cl)
        j->write_start_time = local_clock();
 
        spin_lock(&j->lock);
-       if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
-           (w->noflush ||
-            (!w->must_flush &&
-             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
-             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
+       if (bch2_journal_error(j) ||
+           w->noflush ||
+           (!w->must_flush &&
+            (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
                w->noflush = true;
                SET_JSET_NO_FLUSH(jset, true);
                jset->last_seq  = 0;
@@ -1574,10 +1697,8 @@ void bch2_journal_write(struct closure *cl)
        le32_add_cpu(&jset->u64s, u64s);
        BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
 
-       journal_write_compact(jset);
-
        jset->magic             = cpu_to_le64(jset_magic(c));
-       jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
+       jset->version           = c->sb.version < bcachefs_metadata_version_bkey_renumber
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);
 
@@ -1594,18 +1715,21 @@ void bch2_journal_write(struct closure *cl)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
-           jset_validate_for_write(c, jset))
+           jset_validate(c, NULL, jset, 0, WRITE))
                goto err;
 
-       bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+       ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                    jset->encrypted_start,
                    vstruct_end(jset) - (void *) jset->encrypted_start);
+       if (bch2_fs_fatal_err_on(ret, c,
+                       "error decrypting journal entry: %i", ret))
+               goto err;
 
        jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
                                  journal_nonce(jset), jset);
 
        if (!validate_before_checksum &&
-           jset_validate_for_write(c, jset))
+           jset_validate(c, NULL, jset, 0, WRITE))
                goto err;
 
        sectors = vstruct_sectors(jset, c->block_bits);
@@ -1624,11 +1748,8 @@ retry_alloc:
                goto retry_alloc;
        }
 
-       if (ret) {
-               journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
-               if (journal_debug_buf)
-                       __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
-       }
+       if (ret)
+               __bch2_journal_debug_to_text(&journal_debug_buf, j);
 
        /*
         * write is allocated, no longer need to account for it in
@@ -1645,8 +1766,8 @@ retry_alloc:
 
        if (ret) {
                bch_err(c, "Unable to allocate journal write:\n%s",
-                       journal_debug_buf);
-               kfree(journal_debug_buf);
+                       journal_debug_buf.buf);
+               printbuf_exit(&journal_debug_buf);
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, c->io_complete_wq);
                return;
@@ -1654,7 +1775,7 @@ retry_alloc:
 
        w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 
-       if (test_bit(JOURNAL_NOCHANGES, &j->flags))
+       if (c->opts.nochanges)
                goto no_io;
 
        for_each_rw_member(ca, c, i)
@@ -1668,9 +1789,7 @@ retry_alloc:
                        percpu_ref_get(&ca->io_ref);
 
                        bio = ca->journal.bio;
-                       bio_reset(bio);
-                       bio_set_dev(bio, ca->disk_sb.bdev);
-                       bio->bi_opf             = REQ_OP_FLUSH;
+                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
                        bio->bi_end_io          = journal_write_endio;
                        bio->bi_private         = ca;
                        closure_bio_submit(bio, cl);
index d8425fe0d67b6826c2de50196d3af23d95f16d55..2f8bbf06b28951301d8c0469eed1e0f934adf18b 100644 (file)
@@ -7,12 +7,16 @@
  * during cache_registration
  */
 struct journal_replay {
-       struct list_head        list;
-       struct bch_extent_ptr   ptrs[BCH_REPLICAS_MAX];
+       struct journal_ptr {
+               bool            csum_good;
+               u8              dev;
+               u32             bucket;
+               u32             bucket_offset;
+               u64             sector;
+       }                       ptrs[BCH_REPLICAS_MAX];
        unsigned                nr_ptrs;
 
-       /* checksum error, but we may want to try using it anyways: */
-       bool                    bad;
+       bool                    csum_good;
        bool                    ignore;
        /* must be last: */
        struct jset             j;
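For cross-reference, journal_read_bucket() in the hunks above fills the new per-pointer fields roughly as sketched below; bucket here is the index into ja->buckets, not the raw device bucket number:

	(struct journal_ptr) {
		.csum_good	= jset_csum_good(c, j),
		.dev		= ca->dev_idx,
		.bucket		= bucket,
		.bucket_offset	= offset - bucket_to_sector(ca, ja->buckets[bucket]),
		.sector		= offset,
	};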
@@ -40,12 +44,15 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_entry_validate(struct bch_fs *, const char *,
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
                                struct jset_entry *, unsigned, int, int);
 void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
                                struct jset_entry *);
 
-int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+                              struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
index 52a3935cff530748ce5d5adb55ed55411b1d80cb..e873ce2a3f03a5e9c2ba4d4cfc2ff87c30065ad2 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "btree_key_cache.h"
+#include "errcode.h"
 #include "error.h"
 #include "journal.h"
 #include "journal_io.h"
@@ -34,10 +35,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
 {
-       unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
-               ? ((journal_space_from(ja, from) -
-                   ja->cur_idx - 1 + ja->nr) % ja->nr)
-               : ja->nr;
+       unsigned available = (journal_space_from(ja, from) -
+                             ja->cur_idx - 1 + ja->nr) % ja->nr;
 
        /*
         * Don't use the last bucket unless writing the new last_seq
@@ -61,25 +60,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
                                       old.v, new.v)) != old.v);
 }
 
-static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
-{
-       unsigned sectors = 0;
-
-       while (!sectors && *idx != j->reservations.idx) {
-               sectors = j->buf[*idx].sectors;
-
-               *idx = (*idx + 1) & JOURNAL_BUF_MASK;
-       }
-
-       return sectors;
-}
-
 static struct journal_space
 journal_dev_space_available(struct journal *j, struct bch_dev *ca,
                            enum journal_space_from from)
 {
        struct journal_device *ja = &ca->journal;
-       unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
+       unsigned sectors, buckets, unwritten;
+       u64 seq;
 
        if (from == journal_space_total)
                return (struct journal_space) {
@@ -94,7 +81,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
         * Note that we don't allocate the space for a journal entry
         * until we write it out - thus, account for it here:
         */
-       while ((unwritten = get_unwritten_sectors(j, &idx))) {
+       for (seq = journal_last_unwritten_seq(j);
+            seq <= journal_cur_seq(j);
+            seq++) {
+               unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+               if (!unwritten)
+                       continue;
+
                /* entry won't fit on this device, skip: */
                if (unwritten > ca->mi.bucket_size)
                        continue;
@@ -202,7 +196,7 @@ void bch2_journal_space_available(struct journal *j)
        j->can_discard = can_discard;
 
        if (nr_online < c->opts.metadata_replicas_required) {
-               ret = cur_entry_insufficient_devices;
+               ret = JOURNAL_ERR_insufficient_devices;
                goto out;
        }
 
@@ -216,28 +210,29 @@ void bch2_journal_space_available(struct journal *j)
        total           = j->space[journal_space_total].total;
 
        if (!clean_ondisk &&
-           j->reservations.idx ==
-           j->reservations.unwritten_idx) {
-               char *buf = kmalloc(4096, GFP_ATOMIC);
-
-               bch_err(c, "journal stuck");
-               if (buf) {
-                       __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
-                       pr_err("\n%s", buf);
-                       kfree(buf);
-               }
+           journal_cur_seq(j) == j->seq_ondisk) {
+               struct printbuf buf = PRINTBUF;
 
+               __bch2_journal_debug_to_text(&buf, j);
+               bch_err(c, "journal stuck\n%s", buf.buf);
+               printbuf_exit(&buf);
+
+               /*
+                * Hack: bch2_fatal_error() calls bch2_journal_halt() which
+                * takes journal lock:
+                */
+               spin_unlock(&j->lock);
                bch2_fatal_error(c);
-               ret = cur_entry_journal_stuck;
+               spin_lock(&j->lock);
+
+               ret = JOURNAL_ERR_journal_stuck;
        } else if (!j->space[journal_space_discarded].next_entry)
-               ret = cur_entry_journal_full;
-       else if (!fifo_free(&j->pin))
-               ret = cur_entry_journal_pin_full;
+               ret = JOURNAL_ERR_journal_full;
 
        if ((j->space[journal_space_clean_ondisk].next_entry <
             j->space[journal_space_clean_ondisk].total) &&
            (clean - clean_ondisk <= total / 8) &&
-           (clean_ondisk * 2 > clean ))
+           (clean_ondisk * 2 > clean))
                set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
        else
                clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
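The condition above decides whether the next journal write may skip the flush. A worked example with hypothetical numbers, assuming the clean_ondisk next_entry check also passes:

	/* total = 800, clean = 100, clean_ondisk = 60:
	 *   clean - clean_ondisk = 40  <= total / 8 = 100
	 *   clean_ondisk * 2     = 120 >  clean     = 100
	 * -> JOURNAL_MAY_SKIP_FLUSH set; non-flush journal writes allowed.
	 * With clean_ondisk = 40 instead, 40 * 2 = 80 <= 100 -> bit cleared. */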
@@ -251,7 +246,7 @@ out:
        j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
-       journal_check_may_get_unreserved(j);
+       journal_set_watermark(j);
 
        if (!ret)
                journal_wake(j);
@@ -286,12 +281,13 @@ void bch2_journal_do_discards(struct journal *j)
                struct journal_device *ja = &ca->journal;
 
                while (should_discard_bucket(j, ja)) {
-                       if (ca->mi.discard &&
-                           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+                       if (!c->opts.nochanges &&
+                           ca->mi.discard &&
+                           bdev_max_discard_sectors(ca->disk_sb.bdev))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
-                                       ca->mi.bucket_size, GFP_NOIO, 0);
+                                       ca->mi.bucket_size, GFP_NOIO);
 
                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@@ -367,15 +363,12 @@ static inline void __journal_pin_drop(struct journal *j,
        list_del_init(&pin->list);
 
        /*
-        * Unpinning a journal entry make make journal_next_bucket() succeed, if
+        * Unpinning a journal entry may make journal_next_bucket() succeed if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
-       else if (fifo_used(&j->pin) == 1 &&
-                atomic_read(&pin_list->count) == 1)
-               journal_wake(j);
 }
 
 void bch2_journal_pin_drop(struct journal *j,
@@ -597,7 +590,7 @@ static u64 journal_seq_to_flush(struct journal *j)
  * 512 journal entries or 25% of all journal buckets, then
  * journal_next_bucket() should not stall.
  */
-static int __bch2_journal_reclaim(struct journal *j, bool direct)
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        bool kthread = (current->flags & PF_KTHREAD) != 0;
@@ -646,8 +639,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
                        min_nr = 1;
 
-               trace_journal_reclaim_start(c,
-                               min_nr,
+               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+               trace_and_count(c, journal_reclaim_start, c,
+                               direct, kicked,
+                               min_nr, min_key_cache,
                                j->prereserved.reserved,
                                j->prereserved.remaining,
                                atomic_read(&c->btree_cache.dirty),
@@ -655,8 +651,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
                                atomic_long_read(&c->btree_key_cache.nr_keys));
 
-               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
                nr_flushed = journal_flush_pins(j, seq_to_flush,
                                                min_nr, min_key_cache);
 
@@ -664,11 +658,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                        j->nr_direct_reclaim += nr_flushed;
                else
                        j->nr_background_reclaim += nr_flushed;
-               trace_journal_reclaim_finish(c, nr_flushed);
+               trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
 
                if (nr_flushed)
                        wake_up(&j->reclaim_wait);
-       } while ((min_nr || min_key_cache) && !direct);
+       } while ((min_nr || min_key_cache) && nr_flushed && !direct);
 
        memalloc_noreclaim_restore(flags);
 
@@ -677,7 +671,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
 
 int bch2_journal_reclaim(struct journal *j)
 {
-       return __bch2_journal_reclaim(j, true);
+       return __bch2_journal_reclaim(j, true, true);
 }
 
 static int bch2_journal_reclaim_thread(void *arg)
@@ -685,6 +679,7 @@ static int bch2_journal_reclaim_thread(void *arg)
        struct journal *j = arg;
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned long delay, now;
+       bool journal_empty;
        int ret = 0;
 
        set_freezable();
@@ -692,10 +687,12 @@ static int bch2_journal_reclaim_thread(void *arg)
        j->last_flushed = jiffies;
 
        while (!ret && !kthread_should_stop()) {
+               bool kicked = j->reclaim_kicked;
+
                j->reclaim_kicked = false;
 
                mutex_lock(&j->reclaim_lock);
-               ret = __bch2_journal_reclaim(j, false);
+               ret = __bch2_journal_reclaim(j, false, kicked);
                mutex_unlock(&j->reclaim_lock);
 
                now = jiffies;
@@ -711,10 +708,17 @@ static int bch2_journal_reclaim_thread(void *arg)
                                break;
                        if (j->reclaim_kicked)
                                break;
-                       if (time_after_eq(jiffies, j->next_reclaim))
-                               break;
-                       freezable_schedule_timeout(j->next_reclaim - jiffies);
 
+                       spin_lock(&j->lock);
+                       journal_empty = fifo_empty(&j->pin);
+                       spin_unlock(&j->lock);
+
+                       if (journal_empty)
+                               freezable_schedule();
+                       else if (time_after(j->next_reclaim, jiffies))
+                               freezable_schedule_timeout(j->next_reclaim - jiffies);
+                       else
+                               break;
                }
                __set_current_state(TASK_RUNNING);
        }
@@ -738,15 +742,17 @@ int bch2_journal_reclaim_start(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct task_struct *p;
+       int ret;
 
        if (j->reclaim_thread)
                return 0;
 
        p = kthread_create(bch2_journal_reclaim_thread, j,
                           "bch-reclaim/%s", c->name);
-       if (IS_ERR(p)) {
-               bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
-               return PTR_ERR(p);
+       ret = PTR_ERR_OR_ZERO(p);
+       if (ret) {
+               bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+               return ret;
        }
 
        get_task_struct(p);
@@ -766,7 +772,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
        mutex_lock(&j->reclaim_lock);
 
-       *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
+       if (journal_flush_pins(j, seq_to_flush, 0, 0))
+               *did_work = true;
 
        spin_lock(&j->lock);
        /*
@@ -775,8 +782,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
-               (fifo_used(&j->pin) == 1 &&
-                atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+               !fifo_used(&j->pin);
 
        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);
@@ -824,10 +830,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
        seq = 0;
 
        spin_lock(&j->lock);
-       while (!ret && seq < j->pin.back) {
+       while (!ret) {
                struct bch_replicas_padded replicas;
 
                seq = max(seq, journal_last_seq(j));
+               if (seq >= j->pin.back)
+                       break;
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
                                         journal_seq_pin(j, seq)->devs);
                seq++;
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
new file mode 100644 (file)
index 0000000..c19db04
--- /dev/null
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+       const u64 *l = _l;
+       const u64 *r = _r;
+
+       return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
+{
+       struct bch_sb_field_journal *journal = field_to_type(f, journal);
+       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       int ret = -EINVAL;
+       unsigned nr;
+       unsigned i;
+       u64 *b;
+
+       nr = bch2_nr_journal_buckets(journal);
+       if (!nr)
+               return 0;
+
+       b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+       if (!b)
+               return -ENOMEM;
+
+       for (i = 0; i < nr; i++)
+               b[i] = le64_to_cpu(journal->buckets[i]);
+
+       sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+       if (!b[0]) {
+               prt_printf(err, "journal bucket at sector 0");
+               goto err;
+       }
+
+       if (b[0] < le16_to_cpu(m->first_bucket)) {
+               prt_printf(err, "journal bucket %llu before first bucket %u",
+                      b[0], le16_to_cpu(m->first_bucket));
+               goto err;
+       }
+
+       if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+               prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+                      b[nr - 1], le64_to_cpu(m->nbuckets));
+               goto err;
+       }
+
+       for (i = 0; i + 1 < nr; i++)
+               if (b[i] == b[i + 1]) {
+                       prt_printf(err, "duplicate journal buckets %llu", b[i]);
+                       goto err;
+               }
+
+       ret = 0;
+err:
+       kfree(b);
+       return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+                                   struct bch_sb_field *f)
+{
+       struct bch_sb_field_journal *journal = field_to_type(f, journal);
+       unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+       prt_printf(out, "Buckets: ");
+       for (i = 0; i < nr; i++)
+               prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+       prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+       .validate       = bch2_sb_journal_validate,
+       .to_text        = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+       u64     start;
+       u64     end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+       const struct u64_range *l = _l;
+       const struct u64_range *r = _r;
+
+       return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
+{
+       struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       int ret = -EINVAL;
+       unsigned nr;
+       unsigned i;
+       struct u64_range *b;
+
+       nr = bch2_sb_field_journal_v2_nr_entries(journal);
+       if (!nr)
+               return 0;
+
+       b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+       if (!b)
+               return -ENOMEM;
+
+       for (i = 0; i < nr; i++) {
+               b[i].start = le64_to_cpu(journal->d[i].start);
+               b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+       }
+
+       sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+       if (!b[0].start) {
+               prt_printf(err, "journal bucket at sector 0");
+               goto err;
+       }
+
+       if (b[0].start < le16_to_cpu(m->first_bucket)) {
+               prt_printf(err, "journal bucket %llu before first bucket %u",
+                      b[0].start, le16_to_cpu(m->first_bucket));
+               goto err;
+       }
+
+       if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+               prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+                      b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+               goto err;
+       }
+
+       for (i = 0; i + 1 < nr; i++) {
+               if (b[i].end > b[i + 1].start) {
+                       prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+                              b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+                       goto err;
+               }
+       }
+
+       ret = 0;
+err:
+       kfree(b);
+       return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+                                   struct bch_sb_field *f)
+{
+       struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+       unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+       prt_printf(out, "Buckets: ");
+       for (i = 0; i < nr; i++)
+               prt_printf(out, " %llu-%llu",
+                      le64_to_cpu(journal->d[i].start),
+                      le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+       prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+       .validate       = bch2_sb_journal_v2_validate,
+       .to_text        = bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal_v2 *j;
+       unsigned i, dst = 0, nr = 1;
+
+       if (c)
+               lockdep_assert_held(&c->sb_lock);
+
+       if (!ja->nr) {
+               bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+               bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+               return 0;
+       }
+
+       for (i = 0; i + 1 < ja->nr; i++)
+               if (ja->buckets[i] + 1 != ja->buckets[i + 1])
+                       nr++;
+
+       j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+                                (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+       if (!j)
+               return -BCH_ERR_ENOSPC_sb_journal;
+
+       bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+       j->d[dst].start = le64_to_cpu(ja->buckets[0]);
+       j->d[dst].nr    = le64_to_cpu(1);
+
+       for (i = 1; i < ja->nr; i++) {
+               if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+                       le64_add_cpu(&j->d[dst].nr, 1);
+               } else {
+                       dst++;
+                       j->d[dst].start = le64_to_cpu(ja->buckets[i]);
+                       j->d[dst].nr    = le64_to_cpu(1);
+               }
+       }
+
+       BUG_ON(dst + 1 != nr);
+
+       return 0;
+}
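bch2_journal_buckets_to_sb() run-length encodes the device's journal buckets into the new v2 field: consecutive buckets collapse into a single (start, nr) range, and the legacy flat list is deleted. A worked example with a hypothetical bucket layout:

	/* ja->buckets = { 10, 11, 12, 40, 41 }  ->  nr = 2 ranges:
	 *   d[0] = { .start = 10, .nr = 3 }
	 *   d[1] = { .start = 40, .nr = 2 }
	 * The old BCH_SB_FIELD_journal would have listed all five buckets. */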
diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h
new file mode 100644 (file)
index 0000000..a39192e
--- /dev/null
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+       return j
+               ? (__le64 *) vstruct_end(&j->field) - j->buckets
+               : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+       if (!j)
+               return 0;
+
+       return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
index 3cc63fc202ab4cbc83017cb6cad4412720e03797..5c555b3703c0947006176f11ce9736717e9275d9 100644 (file)
@@ -201,7 +201,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
 
                if (le64_to_cpu(e->start) >=
                    le64_to_cpu(e->end)) {
-                       pr_buf(err, "entry %u start >= end (%llu >= %llu)",
+                       prt_printf(err, "entry %u start >= end (%llu >= %llu)",
                               i, le64_to_cpu(e->start), le64_to_cpu(e->end));
                        return -EINVAL;
                }
@@ -209,7 +209,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
                if (i + 1 < nr &&
                    le64_to_cpu(e[0].end) >
                    le64_to_cpu(e[1].start)) {
-                       pr_buf(err, "entry %u out of order with next entry (%llu > %llu)",
+                       prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
                               i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
                        return -EINVAL;
                }
@@ -229,12 +229,13 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
 
        for (i = bl->start; i < bl->start + nr; i++) {
                if (i != bl->start)
-                       pr_buf(out, " ");
+                       prt_printf(out, " ");
 
-               pr_buf(out, "%llu-%llu",
+               prt_printf(out, "%llu-%llu",
                       le64_to_cpu(i->start),
                       le64_to_cpu(i->end));
        }
+       prt_newline(out);
 }
 
 const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
@@ -271,7 +272,7 @@ retry:
                       !test_bit(BCH_FS_STOPPING, &c->flags))
                        b = bch2_btree_iter_next_node(&iter);
 
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
                bch2_trans_iter_exit(&trans, &iter);
index d6d7512141167a8f338b19ac64ec9703e9a085f2..a6cdb885ad41077db05e74a837225a0e2158967b 100644 (file)
@@ -25,6 +25,8 @@ struct journal_buf {
 
        struct closure_waitlist wait;
        u64                     last_seq;       /* copy of data->last_seq */
+       long                    expires;
+       u64                     flush_time;
 
        unsigned                buf_size;       /* size in bytes of @data */
        unsigned                sectors;        /* maximum size for current entry */
@@ -139,19 +141,39 @@ enum journal_space_from {
        journal_space_nr,
 };
 
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
-
 enum {
        JOURNAL_REPLAY_DONE,
        JOURNAL_STARTED,
-       JOURNAL_NEED_WRITE,
-       JOURNAL_MAY_GET_UNRESERVED,
        JOURNAL_MAY_SKIP_FLUSH,
-       JOURNAL_NOCHANGES,
+};
+
+#define JOURNAL_WATERMARKS()           \
+       x(any)                          \
+       x(copygc)                       \
+       x(reserved)
+
+enum journal_watermark {
+#define x(n)   JOURNAL_WATERMARK_##n,
+       JOURNAL_WATERMARKS()
+#undef x
+};
+
+#define JOURNAL_WATERMARK_MASK 3
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS()               \
+       x(ok)                           \
+       x(blocked)                      \
+       x(max_in_flight)                \
+       x(journal_full)                 \
+       x(journal_pin_full)             \
+       x(journal_stuck)                \
+       x(insufficient_devices)
+
+enum journal_errors {
+#define x(n)   JOURNAL_ERR_##n,
+       JOURNAL_ERRORS()
+#undef x
 };
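Both lists above use the x-macro pattern, so the same table can expand to an enum here and to matching name strings elsewhere. A sketch of the string-table side (the actual table, if any, lives in journal.c and is an assumption here):

	static const char * const journal_errors[] = {
	#define x(n)	#n,
		JOURNAL_ERRORS()
	#undef x
		NULL
	};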
 
 /* Embedded in struct bch_fs */
@@ -161,6 +183,7 @@ struct journal {
        unsigned long           flags;
 
        union journal_res_state reservations;
+       enum journal_watermark  watermark;
 
        /* Max size of current journal entry */
        unsigned                cur_entry_u64s;
@@ -170,14 +193,7 @@ struct journal {
         * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
         * insufficient devices:
         */
-       enum {
-               cur_entry_ok,
-               cur_entry_blocked,
-               cur_entry_journal_full,
-               cur_entry_journal_pin_full,
-               cur_entry_journal_stuck,
-               cur_entry_insufficient_devices,
-       }                       cur_entry_error;
+       enum journal_errors     cur_entry_error;
 
        union journal_preres_state prereserved;
 
@@ -245,6 +261,10 @@ struct journal {
        spinlock_t              err_lock;
 
        struct mutex            reclaim_lock;
+       /*
+        * Used for waiting until journal reclaim has freed up space in the
+        * journal:
+        */
        wait_queue_head_t       reclaim_wait;
        struct task_struct      *reclaim_thread;
        bool                    reclaim_kicked;
@@ -264,7 +284,6 @@ struct journal {
        unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
-       u64                     need_write_time;
        u64                     write_start_time;
 
        u64                     nr_flush_writes;
index cda77835b9ea62381f3962a1d0029d463fe3b2b1..5e85055b0f9382df6ef9ababd31975c16f50cd98 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey.h"
 #include "keylist.h"
 
 int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
new file mode 100644 (file)
index 0000000..53e607d
--- /dev/null
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                    int rw, struct printbuf *err)
+{
+       const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+       if (bkey_val_bytes(k.k) < sizeof(*lru)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*lru));
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+                     struct bkey_s_c k)
+{
+       const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+       prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time,
+                   struct bkey_s_c orig_k)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 existing_idx;
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       if (!time)
+               return 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+                            POS(id, time),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_lru) {
+               bch2_bkey_val_to_text(&buf, trans->c, orig_k);
+               bch2_trans_inconsistent(trans,
+                       "pointer to nonexistent lru %llu:%llu\n%s",
+                       id, time, buf.buf);
+               ret = -EIO;
+               goto err;
+       }
+
+       existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
+       if (existing_idx != idx) {
+               bch2_bkey_val_to_text(&buf, trans->c, orig_k);
+               bch2_trans_inconsistent(trans,
+                       "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s",
+                       id, time, existing_idx, idx, buf.buf);
+               ret = -EIO;
+               goto err;
+       }
+
+       ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
+       return ret;
+}
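bch2_lru_delete() enforces the back-pointer invariant: an lru key at POS(id, time) must name the bucket whose alloc key carries that read time, otherwise the wrong bucket could be evicted. A hypothetical usage sketch of the combined helper defined below, with assumed caller-side names (ca, bucket_idx, old/new read times, alloc_k):

	ret = bch2_lru_change(trans, ca->dev_idx, bucket_idx,
			      old_read_time, &new_read_time, alloc_k);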
+
+int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_lru *lru;
+       int ret = 0;
+
+       if (!*time)
+               return 0;
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_lru,
+                       POS(lru_id, *time),
+                       BTREE_ITER_SLOTS|
+                       BTREE_ITER_INTENT|
+                       BTREE_ITER_WITH_UPDATES, k, ret)
+               if (bkey_deleted(k.k))
+                       break;
+
+       if (ret)
+               goto err;
+
+       BUG_ON(iter.pos.inode != lru_id);
+       *time = iter.pos.offset;
+
+       lru = bch2_trans_kmalloc(trans, sizeof(*lru));
+       ret = PTR_ERR_OR_ZERO(lru);
+       if (ret)
+               goto err;
+
+       bkey_lru_init(&lru->k_i);
+       lru->k.p        = iter.pos;
+       lru->v.idx      = cpu_to_le64(idx);
+
+       ret = bch2_trans_update(trans, &iter, &lru->k_i, 0);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx,
+                   u64 old_time, u64 *new_time,
+                   struct bkey_s_c k)
+{
+       if (old_time == *new_time)
+               return 0;
+
+       return  bch2_lru_delete(trans, id, idx, old_time, k) ?:
+               bch2_lru_set(trans, id, idx, new_time);
+}
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+                             struct btree_iter *lru_iter,
+                             struct bkey_s_c lru_k)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_alloc_v4 a;
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
+       struct bpos alloc_pos;
+       int ret;
+
+       alloc_pos = POS(lru_k.k->p.inode,
+                       le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx));
+
+       if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+                       "lru key points to nonexistent device:bucket %llu:%llu",
+                       alloc_pos.inode, alloc_pos.offset))
+               return bch2_btree_delete_at(trans, lru_iter, 0);
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       bch2_alloc_to_v4(k, &a);
+
+       if (fsck_err_on(a.data_type != BCH_DATA_cached ||
+                       a.io_time[READ] != lru_k.k->p.offset, c,
+                       "incorrect lru entry %s\n"
+                       "  for %s",
+                       (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+                       (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+               struct bkey_i *update =
+                       bch2_trans_kmalloc(trans, sizeof(*update));
+
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                       goto err;
+
+               bkey_init(&update->k);
+               update->k.p = lru_iter->pos;
+
+               ret = bch2_trans_update(trans, lru_iter, update, 0);
+               if (ret)
+                       goto err;
+       }
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
+       return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+               bch2_check_lru_key(&trans, &iter, k));
+
+       bch2_trans_exit(&trans);
+       return ret;
+
+}
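The new lru.c above keys the LRU btree by (lru_id, time) and stores a bucket index in the value: bch2_lru_set() resolves a collision on a time slot by walking forward to the next empty slot and reporting the slot actually used back through *time, and bch2_lru_change() is simply delete-then-set. Below is a minimal, self-contained userspace sketch of that behaviour, using a plain array in place of a btree transaction; it is not part of the commit and every name in it is illustrative.

/*
 * Simplified model of the LRU index: slot == "time", value == bucket index,
 * 0 == empty.  Mirrors the delete/set/change split above, without any of the
 * transaction or error-path handling of the real code.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MODEL_SLOTS 16

struct model_lru {
	uint64_t idx[MODEL_SLOTS];		/* bucket index per time slot, 0 = empty */
};

static int model_lru_set(struct model_lru *l, uint64_t idx, uint64_t *time)
{
	if (!*time)				/* time 0: nothing to do, as in bch2_lru_set() */
		return 0;

	for (uint64_t t = *time; t < MODEL_SLOTS; t++)
		if (!l->idx[t]) {
			l->idx[t] = idx;
			*time = t;		/* caller learns which slot was actually used */
			return 0;
		}
	return -1;				/* no free slot */
}

static void model_lru_delete(struct model_lru *l, uint64_t time)
{
	if (time && time < MODEL_SLOTS)
		l->idx[time] = 0;
}

/* delete-then-set, like bch2_lru_change() */
static int model_lru_change(struct model_lru *l, uint64_t idx,
			    uint64_t old_time, uint64_t *new_time)
{
	if (old_time == *new_time)
		return 0;
	model_lru_delete(l, old_time);
	return model_lru_set(l, idx, new_time);
}

int main(void)
{
	struct model_lru l;
	uint64_t t = 3, new_t = 5;

	memset(&l, 0, sizeof(l));
	model_lru_set(&l, 42, &t);		/* put bucket 42 on the LRU at time 3 */
	printf("bucket 42 landed in slot %llu\n", (unsigned long long) t);

	model_lru_change(&l, 42, t, &new_t);	/* read time bumped: move the entry */
	printf("bucket 42 moved to slot %llu\n", (unsigned long long) new_t);
	return 0;
}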
diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h
new file mode 100644 (file)
index 0000000..3decb7b
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_lru (struct bkey_ops) {  \
+       .key_invalid    = bch2_lru_invalid,     \
+       .val_to_text    = bch2_lru_to_text,     \
+}
+
+int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c);
+int bch2_lru_set(struct btree_trans *, u64, u64, u64 *);
+int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
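bch2_check_lrus(), declared above, is the fsck pass for this btree: every LRU key must point at an existing bucket whose alloc info records data_type == cached and io_time[READ] equal to the LRU key's time; anything else is a stale entry and is deleted, as bch2_check_lru_key() does in the file before this header. A small standalone model of that invariant follows, with simplified stand-in types rather than the real bkey/alloc structures; it is illustrative only and not part of the commit.

/*
 * Model of the cross-check in bch2_check_lru_key(): an LRU entry at
 * (device, time) is valid only if the referenced bucket exists, is cached,
 * and has a matching read time.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum model_data_type { MODEL_DATA_free, MODEL_DATA_cached, MODEL_DATA_user };

struct model_alloc {			/* per-bucket alloc info */
	enum model_data_type data_type;
	uint64_t read_time;
};

struct model_lru_entry {		/* one key in the LRU btree */
	uint64_t time;			/* key offset */
	uint64_t bucket;		/* value: bucket index */
};

static bool lru_entry_ok(const struct model_alloc *alloc, size_t nr_buckets,
			 const struct model_lru_entry *e)
{
	if (e->bucket >= nr_buckets)
		return false;		/* points at a nonexistent bucket */

	return alloc[e->bucket].data_type == MODEL_DATA_cached &&
	       alloc[e->bucket].read_time == e->time;
}

int main(void)
{
	struct model_alloc alloc[4] = {
		[1] = { MODEL_DATA_cached, 7 },
		[2] = { MODEL_DATA_user,   9 },
	};
	struct model_lru_entry good  = { .time = 7, .bucket = 1 };
	struct model_lru_entry stale = { .time = 9, .bucket = 2 };

	printf("good entry ok: %d\n",  lru_entry_ok(alloc, 4, &good));
	printf("stale entry ok: %d\n", lru_entry_ok(alloc, 4, &stale));
	return 0;
}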
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 6defc33322b3b24bd5f9a076165accce6f204aa0..8b258d966d042f73e9448a3086373d8d8a1d6b5f 100644 (file)
@@ -8,6 +8,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "buckets.h"
+#include "errcode.h"
 #include "extents.h"
 #include "io.h"
 #include "journal.h"
@@ -35,85 +36,76 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
        return 0;
 }
 
-static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
-                                  enum btree_id btree_id)
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+                                    struct btree_iter *iter,
+                                    struct bkey_s_c k,
+                                    unsigned dev_idx,
+                                    int flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i *n;
+       int ret;
+
+       if (!bch2_bkey_has_device(k, dev_idx))
+               return 0;
+
+       n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+       ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               return ret;
+
+       bkey_reassemble(n, k);
+
+       ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+       if (ret)
+               return ret;
+
+       /*
+        * If the new extent no longer has any pointers, bch2_extent_normalize()
+        * will do the appropriate thing with it (turning it into a
+        * KEY_TYPE_error key, or just a discard if it was a cached extent)
+        */
+       bch2_extent_normalize(c, bkey_i_to_s(n));
+
+       /*
+        * Since we're not inserting through an extent iterator
+        * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+        * we aren't using the extent overwrite path to delete, we're
+        * just using the normal key deletion path:
+        */
+       if (bkey_deleted(&n->k))
+               n->k.size = 0;
+
+       return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_buf sk;
+       enum btree_id id;
        int ret = 0;
 
-       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       while ((bch2_trans_begin(&trans),
-               (k = bch2_btree_iter_peek(&iter)).k) &&
-              !(ret = bkey_err(k))) {
-               if (!bch2_bkey_has_device(k, dev_idx)) {
-                       bch2_btree_iter_advance(&iter);
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               if (!btree_type_has_ptrs(id))
                        continue;
-               }
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
 
-               ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
-                                   dev_idx, flags, false);
-               if (ret)
-                       break;
-
-               /*
-                * If the new extent no longer has any pointers, bch2_extent_normalize()
-                * will do the appropriate thing with it (turning it into a
-                * KEY_TYPE_error key, or just a discard if it was a cached extent)
-                */
-               bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
-               /*
-                * Since we're not inserting through an extent iterator
-                * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
-                * we aren't using the extent overwrite path to delete, we're
-                * just using the normal key deletion path:
-                */
-               if (bkey_deleted(&sk.k->k))
-                       sk.k->k.size = 0;
-
-               ret   = bch2_btree_iter_traverse(&iter) ?:
-                       bch2_trans_update(&trans, &iter, sk.k,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-                       bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL);
-
-               /*
-                * don't want to leave ret == -EINTR, since if we raced and
-                * something else overwrote the key we could spuriously return
-                * -EINTR below:
-                */
-               if (ret == -EINTR)
-                       ret = 0;
+               ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+                               BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               NULL, NULL, BTREE_INSERT_NOFAIL,
+                       bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
-
-       BUG_ON(ret == -EINTR);
 
        return ret;
 }
 
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
-{
-       return  __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?:
-               __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink);
-}
-
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
        struct btree_trans trans;
@@ -154,19 +146,20 @@ retry:
                        }
 
                        ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
-                       if (ret == -EINTR) {
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                                ret = 0;
                                continue;
                        }
 
                        if (ret) {
-                               bch_err(c, "Error updating btree node key: %i", ret);
+                               bch_err(c, "Error updating btree node key: %s",
+                                       bch2_err_str(ret));
                                break;
                        }
 next:
                        bch2_btree_iter_next_node(&iter);
                }
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
                bch2_trans_iter_exit(&trans, &iter);
@@ -175,16 +168,13 @@ next:
                        goto err;
        }
 
-       /* flush relevant btree updates */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
-
+       bch2_btree_interior_updates_flush(c);
        ret = 0;
 err:
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&k, c);
 
-       BUG_ON(ret == -EINTR);
+       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
        return ret;
 }
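The migrate.c rework above replaces the open-coded iterate/modify/commit loop with for_each_btree_key_commit() plus a small per-key helper: bch2_dev_usrdata_drop_key() only builds a replacement key with the evacuated device's pointers dropped, and a key left with no pointers is normalized away. A rough standalone sketch of that per-key shape follows, with a plain array standing in for an extent and none of the transaction machinery; it is not part of the commit and the names are invented for illustration.

/*
 * Model of the per-key callback: remove every pointer on dev_idx and report
 * whether the key changed at all (if not, nothing needs to be committed).
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_MAX_PTRS 4

struct model_extent {
	unsigned nr_ptrs;
	unsigned ptr_dev[MODEL_MAX_PTRS];	/* device each pointer lives on */
};

static bool model_drop_dev_ptrs(struct model_extent *e, unsigned dev_idx)
{
	unsigned i, j = 0;
	bool had_dev = false;

	for (i = 0; i < e->nr_ptrs; i++) {
		if (e->ptr_dev[i] == dev_idx) {
			had_dev = true;
			continue;		/* drop this pointer */
		}
		e->ptr_dev[j++] = e->ptr_dev[i];
	}
	e->nr_ptrs = j;
	return had_dev;				/* false: key untouched */
}

int main(void)
{
	struct model_extent e = { .nr_ptrs = 3, .ptr_dev = { 0, 2, 1 } };

	/* "evacuate" device 2; in the real code this runs once per key,
	 * with a transaction commit after each changed key */
	if (model_drop_dev_ptrs(&e, 2))
		printf("key rewritten, %u pointer(s) left\n", e.nr_ptrs);

	if (!e.nr_ptrs)
		printf("key would be normalized away\n");
	return 0;
}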
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 7ca7ce394135cef68ed81420a1d064a2148b1321..7486920475f0c6b2b909e014dcb10d79d8265570 100644 (file)
@@ -2,19 +2,20 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "backpointers.h"
 #include "bkey_buf.h"
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
-#include "buckets.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "errcode.h"
+#include "error.h"
 #include "inode.h"
 #include "io.h"
 #include "journal_reclaim.h"
 #include "move.h"
 #include "replicas.h"
-#include "subvolume.h"
 #include "super-io.h"
 #include "keylist.h"
 
 
 #include <trace/events/bcachefs.h>
 
-#define SECTORS_IN_FLIGHT_PER_DEVICE   2048
+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
+{
+       mutex_lock(&c->data_progress_lock);
+       list_add(&stats->list, &c->data_progress_list);
+       mutex_unlock(&c->data_progress_lock);
+}
+
+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
+{
+       mutex_lock(&c->data_progress_lock);
+       list_del(&stats->list);
+       mutex_unlock(&c->data_progress_lock);
+}
 
 struct moving_io {
        struct list_head        list;
@@ -35,415 +48,30 @@ struct moving_io {
 
        struct bch_read_bio     rbio;
 
-       struct migrate_write    write;
+       struct data_update      write;
        /* Must be last since it is variable size */
        struct bio_vec          bi_inline_vecs[0];
 };
 
-struct moving_context {
-       /* Closure for waiting on all reads and writes to complete */
-       struct closure          cl;
-
-       struct bch_move_stats   *stats;
-
-       struct list_head        reads;
-
-       /* in flight sectors: */
-       atomic_t                read_sectors;
-       atomic_t                write_sectors;
-
-       wait_queue_head_t       wait;
-};
-
-static int insert_snapshot_whiteouts(struct btree_trans *trans,
-                                    enum btree_id id,
-                                    struct bpos old_pos,
-                                    struct bpos new_pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter, update_iter;
-       struct bkey_s_c k;
-       struct snapshots_seen s;
-       int ret;
-
-       if (!btree_type_has_snapshots(id))
-               return 0;
-
-       snapshots_seen_init(&s);
-
-       if (!bkey_cmp(old_pos, new_pos))
-               return 0;
-
-       if (!snapshot_t(c, old_pos.snapshot)->children[0])
-               return 0;
-
-       bch2_trans_iter_init(trans, &iter, id, old_pos,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       while (1) {
-next:
-               k = bch2_btree_iter_prev(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               if (bkey_cmp(old_pos, k.k->p))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
-                       struct bkey_i *update;
-                       size_t i;
-
-                       for (i = 0; i < s.nr; i++)
-                               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
-                                       goto next;
-
-                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-
-                       ret = PTR_ERR_OR_ZERO(update);
-                       if (ret)
-                               break;
-
-                       bkey_init(&update->k);
-                       update->k.p = new_pos;
-                       update->k.p.snapshot = k.k->p.snapshot;
-
-                       bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
-                                            BTREE_ITER_NOT_EXTENTS|
-                                            BTREE_ITER_ALL_SNAPSHOTS|
-                                            BTREE_ITER_INTENT);
-                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
-                               bch2_trans_update(trans, &update_iter, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-                       bch2_trans_iter_exit(trans, &update_iter);
-                       if (ret)
-                               break;
-
-                       ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
-                       if (ret)
-                               break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-       kfree(s.d);
-
-       return ret;
-}
-
-static int bch2_migrate_index_update(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct migrate_write *m =
-               container_of(op, struct migrate_write, op);
-       struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
-       struct keylist *keys = &op->insert_keys;
-       struct bkey_buf _new, _insert;
-       int ret = 0;
-
-       bch2_bkey_buf_init(&_new);
-       bch2_bkey_buf_init(&_insert);
-       bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
-       bch2_trans_iter_init(&trans, &iter, m->btree_id,
-                            bkey_start_pos(&bch2_keylist_front(keys)->k),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
-       while (1) {
-               struct bkey_s_c k;
-               struct bkey_i *insert;
-               struct bkey_i_extent *new;
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-               struct bpos next_pos;
-               bool did_work = false;
-               bool should_check_enospc;
-               s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-
-               bch2_trans_begin(&trans);
-
-               k = bch2_btree_iter_peek_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               new = bkey_i_to_extent(bch2_keylist_front(keys));
-
-               if (bversion_cmp(k.k->version, new->k.version) ||
-                   !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
-                       goto nomatch;
-
-               bkey_reassemble(_insert.k, k);
-               insert = _insert.k;
-
-               bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
-               new = bkey_i_to_extent(_new.k);
-               bch2_cut_front(iter.pos, &new->k_i);
-
-               bch2_cut_front(iter.pos,        insert);
-               bch2_cut_back(new->k.p,         insert);
-               bch2_cut_back(insert->k.p,      &new->k_i);
-
-               if (m->data_cmd == DATA_REWRITE) {
-                       struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
-                               bch2_bkey_has_device(bkey_i_to_s_c(insert),
-                                                    m->data_opts.rewrite_dev);
-                       if (!old_ptr)
-                               goto nomatch;
-
-                       if (old_ptr->cached)
-                               extent_for_each_ptr(extent_i_to_s(new), new_ptr)
-                                       new_ptr->cached = true;
-
-                       __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
-               }
-
-               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
-                       if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
-                               /*
-                                * raced with another move op? extent already
-                                * has a pointer to the device we just wrote
-                                * data to
-                                */
-                               continue;
-                       }
-
-                       bch2_extent_ptr_decoded_append(insert, &p);
-                       did_work = true;
-               }
-
-               if (!did_work)
-                       goto nomatch;
-
-               bch2_bkey_narrow_crcs(insert,
-                               (struct bch_extent_crc_unpacked) { 0 });
-               bch2_extent_normalize(c, bkey_i_to_s(insert));
-               bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
-                                              op->opts.background_target,
-                                              op->opts.data_replicas);
-
-               ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
-                                                &should_check_enospc,
-                                                &i_sectors_delta,
-                                                &disk_sectors_delta);
-               if (ret)
-                       goto err;
-
-               if (disk_sectors_delta > (s64) op->res.sectors) {
-                       ret = bch2_disk_reservation_add(c, &op->res,
-                                               disk_sectors_delta - op->res.sectors,
-                                               !should_check_enospc
-                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
-                       if (ret)
-                               goto out;
-               }
-
-               next_pos = insert->k.p;
-
-               ret   = insert_snapshot_whiteouts(&trans, m->btree_id,
-                                                 k.k->p, insert->k.p) ?:
-                       bch2_trans_update(&trans, &iter, insert,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-                       bch2_trans_commit(&trans, &op->res,
-                               op_journal_seq(op),
-                               BTREE_INSERT_NOFAIL|
-                               m->data_opts.btree_insert_flags);
-               if (!ret) {
-                       bch2_btree_iter_set_pos(&iter, next_pos);
-                       atomic_long_inc(&c->extent_migrate_done);
-                       if (ec_ob)
-                               bch2_ob_add_backpointer(c, ec_ob, &insert->k);
-               }
-err:
-               if (ret == -EINTR)
-                       ret = 0;
-               if (ret)
-                       break;
-next:
-               while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
-                       bch2_keylist_pop_front(keys);
-                       if (bch2_keylist_empty(keys))
-                               goto out;
-               }
-               continue;
-nomatch:
-               if (m->ctxt) {
-                       BUG_ON(k.k->p.offset <= iter.pos.offset);
-                       atomic64_inc(&m->ctxt->stats->keys_raced);
-                       atomic64_add(k.k->p.offset - iter.pos.offset,
-                                    &m->ctxt->stats->sectors_raced);
-               }
-               atomic_long_inc(&c->extent_migrate_raced);
-               trace_move_race(&new->k);
-               bch2_btree_iter_advance(&iter);
-               goto next;
-       }
-out:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&_insert, c);
-       bch2_bkey_buf_exit(&_new, c);
-       BUG_ON(ret == -EINTR);
-       return ret;
-}
-
-void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
-{
-       /* write bio must own pages: */
-       BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
-       m->ptr          = rbio->pick.ptr;
-       m->offset       = rbio->data_pos.offset - rbio->pick.crc.offset;
-       m->op.devs_have = rbio->devs_have;
-       m->op.pos       = rbio->data_pos;
-       m->op.version   = rbio->version;
-       m->op.crc       = rbio->pick.crc;
-       m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
-       if (m->data_cmd == DATA_REWRITE)
-               bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
-}
-
-int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
-                           struct write_point_specifier wp,
-                           struct bch_io_opts io_opts,
-                           enum data_cmd data_cmd,
-                           struct data_opts data_opts,
-                           enum btree_id btree_id,
-                           struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       struct extent_ptr_decoded p;
-       int ret;
-
-       m->btree_id     = btree_id;
-       m->data_cmd     = data_cmd;
-       m->data_opts    = data_opts;
-       m->nr_ptrs_reserved = 0;
-
-       bch2_write_op_init(&m->op, c, io_opts);
-
-       if (!bch2_bkey_is_incompressible(k))
-               m->op.compression_type =
-                       bch2_compression_opt_to_type[io_opts.background_compression ?:
-                                                    io_opts.compression];
-       else
-               m->op.incompressible = true;
-
-       m->op.target    = data_opts.target,
-       m->op.write_point = wp;
-
-       /*
-        * op->csum_type is normally initialized from the fs/file's current
-        * options - but if an extent is encrypted, we require that it stays
-        * encrypted:
-        */
-       bkey_for_each_crc(k.k, ptrs, crc, entry)
-               if (bch2_csum_type_is_encryption(crc.csum_type)) {
-                       m->op.nonce     = crc.nonce + crc.offset;
-                       m->op.csum_type = crc.csum_type;
-                       break;
-               }
-
-       if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
-               m->op.alloc_reserve = RESERVE_MOVINGGC;
-               m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
-       } else {
-               /* XXX: this should probably be passed in */
-               m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
-       }
-
-       m->op.flags |= BCH_WRITE_PAGES_STABLE|
-               BCH_WRITE_PAGES_OWNED|
-               BCH_WRITE_DATA_ENCODED|
-               BCH_WRITE_FROM_INTERNAL;
-
-       m->op.nr_replicas       = data_opts.nr_replicas;
-       m->op.nr_replicas_required = data_opts.nr_replicas;
-       m->op.index_update_fn   = bch2_migrate_index_update;
-
-       switch (data_cmd) {
-       case DATA_ADD_REPLICAS: {
-               /*
-                * DATA_ADD_REPLICAS is used for moving data to a different
-                * device in the background, and due to compression the new copy
-                * might take up more space than the old copy:
-                */
-#if 0
-               int nr = (int) io_opts.data_replicas -
-                       bch2_bkey_nr_ptrs_allocated(k);
-#endif
-               int nr = (int) io_opts.data_replicas;
-
-               if (nr > 0) {
-                       m->op.nr_replicas = m->nr_ptrs_reserved = nr;
-
-                       ret = bch2_disk_reservation_get(c, &m->op.res,
-                                       k.k->size, m->op.nr_replicas, 0);
-                       if (ret)
-                               return ret;
-               }
-               break;
-       }
-       case DATA_REWRITE: {
-               unsigned compressed_sectors = 0;
-
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       if (p.ptr.dev == data_opts.rewrite_dev) {
-                               if (p.ptr.cached)
-                                       m->op.flags |= BCH_WRITE_CACHED;
-
-                               if (!p.ptr.cached &&
-                                   crc_is_compressed(p.crc))
-                                       compressed_sectors += p.crc.compressed_size;
-                       }
-
-               if (compressed_sectors) {
-                       ret = bch2_disk_reservation_add(c, &m->op.res,
-                                       k.k->size * m->op.nr_replicas,
-                                       BCH_DISK_RESERVATION_NOFAIL);
-                       if (ret)
-                               return ret;
-               }
-               break;
-       }
-       case DATA_PROMOTE:
-               m->op.flags     |= BCH_WRITE_ALLOC_NOWAIT;
-               m->op.flags     |= BCH_WRITE_CACHED;
-               break;
-       default:
-               BUG();
-       }
-
-       return 0;
-}
-
 static void move_free(struct closure *cl)
 {
        struct moving_io *io = container_of(cl, struct moving_io, cl);
        struct moving_context *ctxt = io->write.ctxt;
-       struct bvec_iter_all iter;
-       struct bio_vec *bv;
-
-       bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
-
-       bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
-               if (bv->bv_page)
-                       __free_page(bv->bv_page);
+       struct bch_fs *c = ctxt->c;
 
+       bch2_data_update_exit(&io->write);
        wake_up(&ctxt->wait);
-
+       percpu_ref_put(&c->writes);
        kfree(io);
 }
 
 static void move_write_done(struct closure *cl)
 {
        struct moving_io *io = container_of(cl, struct moving_io, cl);
+       struct moving_context *ctxt = io->write.ctxt;
+
+       if (io->write.op.error)
+               ctxt->write_error = true;
 
        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        closure_return_with_destructor(cl, move_free);
@@ -458,10 +86,9 @@ static void move_write(struct closure *cl)
                return;
        }
 
-       bch2_migrate_read_done(&io->write, &io->rbio);
-
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
-       closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+
+       bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl);
        continue_at(cl, move_write_done, NULL);
 }
 
@@ -481,9 +108,7 @@ static void move_read_endio(struct bio *bio)
        atomic_sub(io->read_sectors, &ctxt->read_sectors);
        io->read_completed = true;
 
-       if (next_pending_write(ctxt))
-               wake_up(&ctxt->wait);
-
+       wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
 }
 
@@ -520,14 +145,103 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
                atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
 
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+       move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+       closure_sync(&ctxt->cl);
+       EBUG_ON(atomic_read(&ctxt->write_sectors));
+
+       if (ctxt->stats) {
+               progress_list_del(ctxt->c, ctxt->stats);
+
+               trace_move_data(ctxt->c,
+                               atomic64_read(&ctxt->stats->sectors_moved),
+                               atomic64_read(&ctxt->stats->keys_moved));
+       }
+}
+
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+                          struct bch_fs *c,
+                          struct bch_ratelimit *rate,
+                          struct bch_move_stats *stats,
+                          struct write_point_specifier wp,
+                          bool wait_on_copygc)
+{
+       memset(ctxt, 0, sizeof(*ctxt));
+
+       ctxt->c         = c;
+       ctxt->rate      = rate;
+       ctxt->stats     = stats;
+       ctxt->wp        = wp;
+       ctxt->wait_on_copygc = wait_on_copygc;
+
+       closure_init_stack(&ctxt->cl);
+       INIT_LIST_HEAD(&ctxt->reads);
+       init_waitqueue_head(&ctxt->wait);
+
+       if (stats) {
+               progress_list_add(c, stats);
+               stats->data_type = BCH_DATA_user;
+       }
+}
+
+void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+       memset(stats, 0, sizeof(*stats));
+       scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
+static int bch2_extent_drop_ptrs(struct btree_trans *trans,
+                                struct btree_iter *iter,
+                                struct bkey_s_c k,
+                                struct data_update_opts data_opts)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i *n;
+       int ret;
+
+       n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+       ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               return ret;
+
+       bkey_reassemble(n, k);
+
+       while (data_opts.kill_ptrs) {
+               unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+               struct bch_extent_ptr *ptr;
+
+               bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+               data_opts.kill_ptrs ^= 1U << drop;
+       }
+
+       /*
+        * If the new extent no longer has any pointers, bch2_extent_normalize()
+        * will do the appropriate thing with it (turning it into a
+        * KEY_TYPE_error key, or just a discard if it was a cached extent)
+        */
+       bch2_extent_normalize(c, bkey_i_to_s(n));
+
+       /*
+        * Since we're not inserting through an extent iterator
+        * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+        * we aren't using the extent overwrite path to delete, we're
+        * just using the normal key deletion path:
+        */
+       if (bkey_deleted(&n->k))
+               n->k.size = 0;
+
+       return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
 static int bch2_move_extent(struct btree_trans *trans,
+                           struct btree_iter *iter,
                            struct moving_context *ctxt,
-                           struct write_point_specifier wp,
                            struct bch_io_opts io_opts,
                            enum btree_id btree_id,
                            struct bkey_s_c k,
-                           enum data_cmd data_cmd,
-                           struct data_opts data_opts)
+                           struct data_update_opts data_opts)
 {
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -537,6 +251,18 @@ static int bch2_move_extent(struct btree_trans *trans,
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;
 
+       bch2_data_update_opts_normalize(k, &data_opts);
+
+       if (!data_opts.rewrite_ptrs &&
+           !data_opts.extra_replicas) {
+               if (data_opts.kill_ptrs)
+                       return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+               return 0;
+       }
+
+       if (!percpu_ref_tryget_live(&c->writes))
+               return -EROFS;
+
        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
@@ -551,7 +277,7 @@ static int bch2_move_extent(struct btree_trans *trans,
        io->read_sectors        = k.k->size;
        io->write_sectors       = k.k->size;
 
-       bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
+       bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 
@@ -561,7 +287,7 @@ static int bch2_move_extent(struct btree_trans *trans,
 
        io->rbio.c              = c;
        io->rbio.opts           = io_opts;
-       bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+       bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        io->rbio.bio.bi_vcnt = pages;
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -570,15 +296,18 @@ static int bch2_move_extent(struct btree_trans *trans,
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
-       ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
-                                     data_cmd, data_opts, btree_id, k);
+       ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
+                                   data_opts, btree_id, k);
        if (ret)
                goto err_free_pages;
 
+       io->write.ctxt = ctxt;
+
        atomic64_inc(&ctxt->stats->keys_moved);
        atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
-
-       trace_move_extent(k.k);
+       this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+       this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+       trace_move_extent_read(k.k);
 
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        list_add_tail(&io->list, &ctxt->reads);
@@ -599,7 +328,8 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
-       trace_move_alloc_fail(k.k);
+       percpu_ref_put(&c->writes);
+       trace_and_count(c, move_extent_alloc_mem_fail, k.k);
        return ret;
 }
 
@@ -634,72 +364,108 @@ err:
        return ret;
 }
 
-static int __bch2_move_data(struct bch_fs *c,
-               struct moving_context *ctxt,
-               struct bch_ratelimit *rate,
-               struct write_point_specifier wp,
-               struct bpos start,
-               struct bpos end,
-               move_pred_fn pred, void *arg,
-               struct bch_move_stats *stats,
-               enum btree_id btree_id)
+static int move_ratelimit(struct btree_trans *trans,
+                         struct moving_context *ctxt)
 {
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
+       struct bch_fs *c = trans->c;
+       u64 delay;
+
+       if (ctxt->wait_on_copygc) {
+               bch2_trans_unlock(trans);
+               wait_event_killable(c->copygc_running_wq,
+                                   !c->copygc_running ||
+                                   kthread_should_stop());
+       }
+
+       do {
+               delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+               if (delay) {
+                       bch2_trans_unlock(trans);
+                       set_current_state(TASK_INTERRUPTIBLE);
+               }
+
+               if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       return 1;
+               }
+
+               if (delay)
+                       schedule_timeout(delay);
+
+               if (unlikely(freezing(current))) {
+                       move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+                       try_to_freeze();
+               }
+       } while (delay);
+
+       move_ctxt_wait_event(ctxt, trans,
+               atomic_read(&ctxt->write_sectors) <
+               c->opts.move_bytes_in_flight >> 9);
+
+       move_ctxt_wait_event(ctxt, trans,
+               atomic_read(&ctxt->read_sectors) <
+               c->opts.move_bytes_in_flight >> 9);
+
+       return 0;
+}
+
+static int move_get_io_opts(struct btree_trans *trans,
+                           struct bch_io_opts *io_opts,
+                           struct bkey_s_c k, u64 *cur_inum)
+{
+       struct bch_inode_unpacked inode;
+       int ret;
+
+       if (*cur_inum == k.k->p.inode)
+               return 0;
+
+       *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+
+       ret = lookup_inode(trans,
+                          SPOS(0, k.k->p.inode, k.k->p.snapshot),
+                          &inode);
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               return ret;
+
+       if (!ret)
+               bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode));
+
+       *cur_inum = k.k->p.inode;
+       return 0;
+}
+
+static int __bch2_move_data(struct moving_context *ctxt,
+                           struct bpos start,
+                           struct bpos end,
+                           move_pred_fn pred, void *arg,
+                           enum btree_id btree_id)
+{
+       struct bch_fs *c = ctxt->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct bkey_buf sk;
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct data_opts data_opts;
-       enum data_cmd data_cmd;
-       u64 delay, cur_inum = U64_MAX;
+       struct data_update_opts data_opts;
+       u64 cur_inum = U64_MAX;
        int ret = 0, ret2;
 
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
-       stats->data_type = BCH_DATA_user;
-       stats->btree_id = btree_id;
-       stats->pos      = start;
+       ctxt->stats->data_type  = BCH_DATA_user;
+       ctxt->stats->btree_id   = btree_id;
+       ctxt->stats->pos        = start;
 
        bch2_trans_iter_init(&trans, &iter, btree_id, start,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_ALL_SNAPSHOTS);
 
-       if (rate)
-               bch2_ratelimit_reset(rate);
-
-       while (1) {
-               do {
-                       delay = rate ? bch2_ratelimit_delay(rate) : 0;
-
-                       if (delay) {
-                               bch2_trans_unlock(&trans);
-                               set_current_state(TASK_INTERRUPTIBLE);
-                       }
-
-                       if (kthread && (ret = kthread_should_stop())) {
-                               __set_current_state(TASK_RUNNING);
-                               goto out;
-                       }
-
-                       if (delay)
-                               schedule_timeout(delay);
-
-                       if (unlikely(freezing(current))) {
-                               move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads));
-                               try_to_freeze();
-                       }
-               } while (delay);
-
-               move_ctxt_wait_event(ctxt, &trans,
-                       atomic_read(&ctxt->write_sectors) <
-                       SECTORS_IN_FLIGHT_PER_DEVICE);
-
-               move_ctxt_wait_event(ctxt, &trans,
-                       atomic_read(&ctxt->read_sectors) <
-                       SECTORS_IN_FLIGHT_PER_DEVICE);
+       if (ctxt->rate)
+               bch2_ratelimit_reset(ctxt->rate);
 
+       while (!move_ratelimit(&trans, ctxt)) {
                bch2_trans_begin(&trans);
 
                k = bch2_btree_iter_peek(&iter);
@@ -707,7 +473,7 @@ static int __bch2_move_data(struct bch_fs *c,
                        break;
 
                ret = bkey_err(k);
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;
@@ -715,53 +481,30 @@ static int __bch2_move_data(struct bch_fs *c,
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
-               stats->pos = iter.pos;
+               ctxt->stats->pos = iter.pos;
 
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
-               if (btree_id == BTREE_ID_extents &&
-                   cur_inum != k.k->p.inode) {
-                       struct bch_inode_unpacked inode;
-
-                       io_opts = bch2_opts_to_inode_opts(c->opts);
-
-                       ret = lookup_inode(&trans,
-                                       SPOS(0, k.k->p.inode, k.k->p.snapshot),
-                                       &inode);
-                       if (ret == -EINTR)
-                               continue;
-
-                       if (!ret)
-                               bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
-
-                       cur_inum = k.k->p.inode;
-               }
+               ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+               if (ret)
+                       continue;
 
-               switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
-               case DATA_SKIP:
+               memset(&data_opts, 0, sizeof(data_opts));
+               if (!pred(c, arg, k, &io_opts, &data_opts))
                        goto next;
-               case DATA_SCRUB:
-                       BUG();
-               case DATA_ADD_REPLICAS:
-               case DATA_REWRITE:
-               case DATA_PROMOTE:
-                       break;
-               default:
-                       BUG();
-               }
 
                /*
                 * The iterator gets unlocked by __bch2_read_extent - need to
                 * save a copy of @k elsewhere:
-                 */
+                */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
 
-               ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
-                                       data_cmd, data_opts);
+               ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+                                       btree_id, k, data_opts);
                if (ret2) {
-                       if (ret2 == -EINTR)
+                       if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;
 
                        if (ret2 == -ENOMEM) {
@@ -774,14 +517,13 @@ static int __bch2_move_data(struct bch_fs *c,
                        goto next;
                }
 
-               if (rate)
-                       bch2_ratelimit_increment(rate, k.k->size);
+               if (ctxt->rate)
+                       bch2_ratelimit_increment(ctxt->rate, k.k->size);
 next:
-               atomic64_add(k.k->size, &stats->sectors_seen);
+               atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
 next_nondata:
                bch2_btree_iter_advance(&iter);
        }
-out:
 
        bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
@@ -790,48 +532,20 @@ out:
        return ret;
 }
 
-inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
-{
-       memset(stats, 0, sizeof(*stats));
-
-       scnprintf(stats->name, sizeof(stats->name),
-                       "%s", name);
-}
-
-static inline void progress_list_add(struct bch_fs *c,
-                                    struct bch_move_stats *stats)
-{
-       mutex_lock(&c->data_progress_lock);
-       list_add(&stats->list, &c->data_progress_list);
-       mutex_unlock(&c->data_progress_lock);
-}
-
-static inline void progress_list_del(struct bch_fs *c,
-                                    struct bch_move_stats *stats)
-{
-       mutex_lock(&c->data_progress_lock);
-       list_del(&stats->list);
-       mutex_unlock(&c->data_progress_lock);
-}
-
 int bch2_move_data(struct bch_fs *c,
                   enum btree_id start_btree_id, struct bpos start_pos,
                   enum btree_id end_btree_id,   struct bpos end_pos,
                   struct bch_ratelimit *rate,
+                  struct bch_move_stats *stats,
                   struct write_point_specifier wp,
-                  move_pred_fn pred, void *arg,
-                  struct bch_move_stats *stats)
+                  bool wait_on_copygc,
+                  move_pred_fn pred, void *arg)
 {
-       struct moving_context ctxt = { .stats = stats };
+       struct moving_context ctxt;
        enum btree_id id;
        int ret;
 
-       progress_list_add(c, stats);
-       closure_init_stack(&ctxt.cl);
-       INIT_LIST_HEAD(&ctxt.reads);
-       init_waitqueue_head(&ctxt.wait);
-
-       stats->data_type = BCH_DATA_user;
+       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 
        for (id = start_btree_id;
             id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
@@ -842,31 +556,205 @@ int bch2_move_data(struct bch_fs *c,
                    id != BTREE_ID_reflink)
                        continue;
 
-               ret = __bch2_move_data(c, &ctxt, rate, wp,
+               ret = __bch2_move_data(&ctxt,
                                       id == start_btree_id ? start_pos : POS_MIN,
                                       id == end_btree_id   ? end_pos   : POS_MAX,
-                                      pred, arg, stats, id);
+                                      pred, arg, id);
                if (ret)
                        break;
        }
 
+       bch2_moving_ctxt_exit(&ctxt);
 
-       move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
-       closure_sync(&ctxt.cl);
+       return ret;
+}
+
+static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            bucket, BTREE_ITER_CACHED);
+again:
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
 
-       EBUG_ON(atomic_read(&ctxt.write_sectors));
+       if (!ret && k.k->type == KEY_TYPE_alloc_v4) {
+               struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
 
-       trace_move_data(c,
-                       atomic64_read(&stats->sectors_moved),
-                       atomic64_read(&stats->keys_moved));
+               if (a.v->gen == gen &&
+                   a.v->dirty_sectors) {
+                       struct printbuf buf = PRINTBUF;
+
+                       if (a.v->data_type == BCH_DATA_btree) {
+                               bch2_trans_unlock(trans);
+                               if (bch2_btree_interior_updates_flush(c))
+                                       goto again;
+                       }
+
+                       prt_str(&buf, "failed to evacuate bucket ");
+                       bch2_bkey_val_to_text(&buf, c, k);
+
+                       bch_err(c, "%s", buf.buf);
+                       printbuf_exit(&buf);
+               }
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
+                          struct bpos bucket, int gen,
+                          struct data_update_opts _data_opts)
+{
+       struct bch_fs *c = ctxt->c;
+       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       struct bch_backpointer bp;
+       struct data_update_opts data_opts;
+       u64 bp_offset = 0, cur_inum = U64_MAX;
+       int ret = 0;
+
+       bch2_bkey_buf_init(&sk);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       while (!(ret = move_ratelimit(&trans, ctxt))) {
+               bch2_trans_begin(&trans);
+
+               ret = bch2_get_next_backpointer(&trans, bucket, gen,
+                                               &bp_offset, &bp,
+                                               BTREE_ITER_CACHED);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       goto err;
+               if (bp_offset == U64_MAX)
+                       break;
+
+               if (!bp.level) {
+                       const struct bch_extent_ptr *ptr;
+                       struct bkey_s_c k;
+                       unsigned i = 0;
+
+                       k = bch2_backpointer_get_key(&trans, &iter,
+                                               bucket, bp_offset, bp);
+                       ret = bkey_err(k);
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                               continue;
+                       if (ret)
+                               goto err;
+                       if (!k.k)
+                               continue;
+
+                       bch2_bkey_buf_reassemble(&sk, c, k);
+                       k = bkey_i_to_s_c(sk.k);
+
+                       ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+                       if (ret) {
+                               bch2_trans_iter_exit(&trans, &iter);
+                               continue;
+                       }
+
+                       data_opts = _data_opts;
+                       data_opts.target        = io_opts.background_target;
+                       data_opts.rewrite_ptrs = 0;
+
+                       bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+                               if (ptr->dev == bucket.inode)
+                                       data_opts.rewrite_ptrs |= 1U << i;
+                               i++;
+                       }
+
+                       ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+                                              bp.btree_id, k, data_opts);
+                       bch2_trans_iter_exit(&trans, &iter);
+
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                               continue;
+                       if (ret == -ENOMEM) {
+                               /* memory allocation failure, wait for some IO to finish */
+                               bch2_move_ctxt_wait_for_io(ctxt, &trans);
+                               continue;
+                       }
+                       if (ret)
+                               goto err;
+
+                       if (ctxt->rate)
+                               bch2_ratelimit_increment(ctxt->rate, k.k->size);
+                       atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+               } else {
+                       struct btree *b;
+
+                       b = bch2_backpointer_get_node(&trans, &iter,
+                                               bucket, bp_offset, bp);
+                       ret = PTR_ERR_OR_ZERO(b);
+                       if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+                               continue;
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                               continue;
+                       if (ret)
+                               goto err;
+                       if (!b)
+                               continue;
+
+                       ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
+                       bch2_trans_iter_exit(&trans, &iter);
+
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                               continue;
+                       if (ret)
+                               goto err;
+
+                       if (ctxt->rate)
+                               bch2_ratelimit_increment(ctxt->rate,
+                                                        c->opts.btree_node_size >> 9);
+                       atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+                       atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+               }
+
+               bp_offset++;
+       }
+
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
+               bch2_trans_unlock(&trans);
+               move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+               closure_sync(&ctxt->cl);
+               if (!ctxt->write_error)
+                       lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen));
+       }
+err:
+       bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&sk, c);
+       return ret;
+}
+
+int bch2_evacuate_bucket(struct bch_fs *c,
+                        struct bpos bucket, int gen,
+                        struct data_update_opts data_opts,
+                        struct bch_ratelimit *rate,
+                        struct bch_move_stats *stats,
+                        struct write_point_specifier wp,
+                        bool wait_on_copygc)
+{
+       struct moving_context ctxt;
+       int ret;
+
+       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+       ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
+       bch2_moving_ctxt_exit(&ctxt);
 
-       progress_list_del(c, stats);
        return ret;
 }
 
-typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
-                                        struct btree *, struct bch_io_opts *,
-                                        struct data_opts *);
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+                               struct btree *, struct bch_io_opts *,
+                               struct data_update_opts *);
 
 static int bch2_move_btree(struct bch_fs *c,
                           enum btree_id start_btree_id, struct bpos start_pos,
@@ -880,8 +768,7 @@ static int bch2_move_btree(struct bch_fs *c,
        struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
-       struct data_opts data_opts;
-       enum data_cmd cmd;
+       struct data_update_opts data_opts;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -910,27 +797,18 @@ retry:
 
                        stats->pos = iter.pos;
 
-                       switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
-                       case DATA_SKIP:
+                       if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;
-                       case DATA_SCRUB:
-                               BUG();
-                       case DATA_ADD_REPLICAS:
-                       case DATA_REWRITE:
-                               break;
-                       default:
-                               BUG();
-                       }
 
                        ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
-                       if (ret == -EINTR)
+                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                break;
 next:
                        bch2_btree_iter_next_node(&iter);
                }
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
                bch2_trans_iter_exit(&trans, &iter);
@@ -942,30 +820,18 @@ next:
        bch2_trans_exit(&trans);
 
        if (ret)
-               bch_err(c, "error %i in bch2_move_btree", ret);
+               bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
 
-       /* flush relevant btree updates */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        progress_list_del(c, stats);
        return ret;
 }
 
-#if 0
-static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
-                               struct bkey_s_c k,
-                               struct bch_io_opts *io_opts,
-                               struct data_opts *data_opts)
-{
-       return DATA_SCRUB;
-}
-#endif
-
-static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
-                                     struct bkey_s_c k,
-                                     struct bch_io_opts *io_opts,
-                                     struct data_opts *data_opts)
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+                            struct bkey_s_c k,
+                            struct bch_io_opts *io_opts,
+                            struct data_update_opts *data_opts)
 {
        unsigned nr_good = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
@@ -973,43 +839,50 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
                : io_opts->data_replicas;
 
        if (!nr_good || nr_good >= replicas)
-               return DATA_SKIP;
+               return false;
 
        data_opts->target               = 0;
-       data_opts->nr_replicas          = 1;
+       data_opts->extra_replicas       = replicas - nr_good;
        data_opts->btree_insert_flags   = 0;
-       return DATA_ADD_REPLICAS;
+       return true;
 }
 
-static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
-                                 struct bkey_s_c k,
-                                 struct bch_io_opts *io_opts,
-                                 struct data_opts *data_opts)
+static bool migrate_pred(struct bch_fs *c, void *arg,
+                        struct bkey_s_c k,
+                        struct bch_io_opts *io_opts,
+                        struct data_update_opts *data_opts)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
        struct bch_ioctl_data *op = arg;
+       unsigned i = 0;
 
-       if (!bch2_bkey_has_device(k, op->migrate.dev))
-               return DATA_SKIP;
-
+       data_opts->rewrite_ptrs         = 0;
        data_opts->target               = 0;
-       data_opts->nr_replicas          = 1;
+       data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;
-       data_opts->rewrite_dev          = op->migrate.dev;
-       return DATA_REWRITE;
+
+       bkey_for_each_ptr(ptrs, ptr) {
+               if (ptr->dev == op->migrate.dev)
+                       data_opts->rewrite_ptrs |= 1U << i;
+               i++;
+       }
+
+       return data_opts->rewrite_ptrs != 0;
 }
 
-static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
-                                           struct btree *b,
-                                           struct bch_io_opts *io_opts,
-                                           struct data_opts *data_opts)
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+                                  struct btree *b,
+                                  struct bch_io_opts *io_opts,
+                                  struct data_update_opts *data_opts)
 {
        return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
 
-static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
-                                       struct btree *b,
-                                       struct bch_io_opts *io_opts,
-                                       struct data_opts *data_opts)
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+                              struct btree *b,
+                              struct bch_io_opts *io_opts,
+                              struct data_update_opts *data_opts)
 {
        return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
@@ -1038,21 +911,21 @@ static bool bformat_needs_redo(struct bkey_format *f)
        return false;
 }
 
-static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
-                                           struct btree *b,
-                                           struct bch_io_opts *io_opts,
-                                           struct data_opts *data_opts)
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+                                  struct btree *b,
+                                  struct bch_io_opts *io_opts,
+                                  struct data_update_opts *data_opts)
 {
        if (b->version_ondisk != c->sb.version ||
            btree_node_need_rewrite(b) ||
            bformat_needs_redo(&b->format)) {
                data_opts->target               = 0;
-               data_opts->nr_replicas          = 1;
+               data_opts->extra_replicas       = 0;
                data_opts->btree_insert_flags   = 0;
-               return DATA_REWRITE;
+               return true;
        }
 
-       return DATA_SKIP;
+       return false;
 }
 
 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
@@ -1096,8 +969,11 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_move_data(c,
                                     op.start_btree,    op.start_pos,
                                     op.end_btree,      op.end_pos,
-                                    NULL, writepoint_hashed((unsigned long) current),
-                                    rereplicate_pred, c, stats) ?: ret;
+                                    NULL,
+                                    stats,
+                                    writepoint_hashed((unsigned long) current),
+                                    true,
+                                    rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_MIGRATE:
@@ -1117,8 +993,11 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_move_data(c,
                                     op.start_btree,    op.start_pos,
                                     op.end_btree,      op.end_pos,
-                                    NULL, writepoint_hashed((unsigned long) current),
-                                    migrate_pred, &op, stats) ?: ret;
+                                    NULL,
+                                    stats,
+                                    writepoint_hashed((unsigned long) current),
+                                    true,
+                                    migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_REWRITE_OLD_NODES:
index 2a789a1158ca22e4e7efb1cd4298b4115da3216d..c0fec69bbb6a1d6becca6bf6bdffc8eb32febc7e 100644 (file)
@@ -4,53 +4,37 @@
 
 #include "btree_iter.h"
 #include "buckets.h"
-#include "io_types.h"
+#include "data_update.h"
 #include "move_types.h"
 
 struct bch_read_bio;
-struct moving_context;
 
-enum data_cmd {
-       DATA_SKIP,
-       DATA_SCRUB,
-       DATA_ADD_REPLICAS,
-       DATA_REWRITE,
-       DATA_PROMOTE,
-};
-
-struct data_opts {
-       u16             target;
-       u8              rewrite_dev;
-       u8              nr_replicas;
-       int             btree_insert_flags;
-};
+struct moving_context {
+       struct bch_fs           *c;
+       struct bch_ratelimit    *rate;
+       struct bch_move_stats   *stats;
+       struct write_point_specifier wp;
+       bool                    wait_on_copygc;
+       bool                    write_error;
 
-struct migrate_write {
-       enum btree_id           btree_id;
-       enum data_cmd           data_cmd;
-       struct data_opts        data_opts;
+       /* For waiting on outstanding reads and writes: */
+       struct closure          cl;
+       struct list_head        reads;
 
-       unsigned                nr_ptrs_reserved;
+       /* in flight sectors: */
+       atomic_t                read_sectors;
+       atomic_t                write_sectors;
 
-       struct moving_context   *ctxt;
-
-       /* what we read: */
-       struct bch_extent_ptr   ptr;
-       u64                     offset;
-
-       struct bch_write_op     op;
+       wait_queue_head_t       wait;
 };
 
-void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
-int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
-                           struct write_point_specifier,
-                           struct bch_io_opts,
-                           enum data_cmd, struct data_opts,
-                           enum btree_id, struct bkey_s_c);
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+                            struct bch_io_opts *, struct data_update_opts *);
 
-typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
-                               struct bkey_s_c,
-                               struct bch_io_opts *, struct data_opts *);
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+                          struct bch_ratelimit *, struct bch_move_stats *,
+                          struct write_point_specifier, bool);
 
 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
 
@@ -58,10 +42,20 @@ int bch2_move_data(struct bch_fs *,
                   enum btree_id, struct bpos,
                   enum btree_id, struct bpos,
                   struct bch_ratelimit *,
+                  struct bch_move_stats *,
                   struct write_point_specifier,
-                  move_pred_fn, void *,
-                  struct bch_move_stats *);
-
+                  bool,
+                  move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct moving_context *,
+                          struct bpos, int,
+                          struct data_update_opts);
+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
+                        struct data_update_opts,
+                        struct bch_ratelimit *,
+                        struct bch_move_stats *,
+                        struct write_point_specifier,
+                        bool);
 int bch2_data_job(struct bch_fs *,
                  struct bch_move_stats *,
                  struct bch_ioctl_data);
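
As a sketch of the new convention, a predicate conforming to the bool-returning move_pred_fn type might look like the following; it is illustrative only (the function name is made up) and simply requests one extra replica for every key:

        static bool add_one_replica_pred(struct bch_fs *c, void *arg,
                                         struct bkey_s_c k,
                                         struct bch_io_opts *io_opts,
                                         struct data_update_opts *data_opts)
        {
                data_opts->target               = 0;
                data_opts->extra_replicas       = 1;
                data_opts->btree_insert_flags   = 0;
                return true;    /* returning false means "skip this key" */
        }

Such a predicate would be passed to bch2_move_data() in the same position rereplicate_pred and migrate_pred are passed elsewhere in this commit.
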
index c82ecff3efe2b198eb541616e2f13fd4e4f4564e..044eca879afced62016f63076c6fe47f39eba069 100644 (file)
@@ -13,6 +13,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "disk_groups.h"
+#include "errcode.h"
 #include "error.h"
 #include "extents.h"
 #include "eytzinger.h"
 #include <linux/sort.h>
 #include <linux/wait.h>
 
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca)                                    \
-       ((ca)->free[RESERVE_MOVINGGC].size / 2)
-
-static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
-{
-       const struct copygc_heap_entry *l = _l;
-       const struct copygc_heap_entry *r = _r;
-
-       return  cmp_int(l->dev,    r->dev) ?:
-               cmp_int(l->offset, r->offset);
-}
-
-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
-                                struct bkey_s_c k,
-                                struct bch_io_opts *io_opts,
-                                struct data_opts *data_opts)
-{
-       copygc_heap *h = &c->copygc_heap;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p = { 0 };
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-               struct copygc_heap_entry search = {
-                       .dev    = p.ptr.dev,
-                       .offset = p.ptr.offset,
-               };
-               ssize_t i;
-
-               if (p.ptr.cached)
-                       continue;
-
-               i = eytzinger0_find_le(h->data, h->used,
-                                      sizeof(h->data[0]),
-                                      bucket_offset_cmp, &search);
-#if 0
-               /* eytzinger search verify code: */
-               ssize_t j = -1, k;
-
-               for (k = 0; k < h->used; k++)
-                       if (h->data[k].offset <= ptr->offset &&
-                           (j < 0 || h->data[k].offset > h->data[j].offset))
-                               j = k;
-
-               BUG_ON(i != j);
-#endif
-               if (i >= 0 &&
-                   p.ptr.dev == h->data[i].dev &&
-                   p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
-                   p.ptr.gen == h->data[i].gen) {
-                       /*
-                        * We need to use the journal reserve here, because
-                        *  - journal reclaim depends on btree key cache
-                        *    flushing to make forward progress,
-                        *  - which has to make forward progress when the
-                        *    journal is pre-reservation full,
-                        *  - and depends on allocation - meaning allocator and
-                        *    copygc
-                        */
-
-                       data_opts->target               = io_opts->background_target;
-                       data_opts->nr_replicas          = 1;
-                       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE|
-                               BTREE_INSERT_JOURNAL_RESERVED;
-                       data_opts->rewrite_dev          = p.ptr.dev;
-
-                       if (p.has_ec)
-                               data_opts->nr_replicas += p.ec.redundancy;
-
-                       return DATA_REWRITE;
-               }
-       }
-
-       return DATA_SKIP;
-}
-
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
-       bool ret;
-
-       spin_lock(&ca->fs->freelist_lock);
-       ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
-               ca->allocator_state != ALLOCATOR_running;
-       spin_unlock(&ca->fs->freelist_lock);
-
-       return ret;
-}
-
 static inline int fragmentation_cmp(copygc_heap *heap,
                                   struct copygc_heap_entry l,
                                   struct copygc_heap_entry r)
@@ -138,37 +38,46 @@ static inline int fragmentation_cmp(copygc_heap *heap,
        return cmp_int(l.fragmentation, r.fragmentation);
 }
 
-static int walk_buckets_to_copygc(struct bch_fs *c)
+static int find_buckets_to_copygc(struct bch_fs *c)
 {
        copygc_heap *h = &c->copygc_heap;
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_alloc_unpacked u;
+       struct bch_alloc_v4 a;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
+       /*
+        * Find buckets with lowest sector counts, skipping completely
+        * empty buckets, by building a maxheap sorted by sector count,
+        * and repeatedly replacing the maximum element until all
+        * buckets have been visited.
+        */
+       h->used = 0;
+
        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
                struct copygc_heap_entry e;
 
-               u = bch2_alloc_unpack(k);
+               bch2_alloc_to_v4(k, &a);
 
-               if (u.data_type != BCH_DATA_user ||
-                   u.dirty_sectors >= ca->mi.bucket_size ||
+               if ((a.data_type != BCH_DATA_btree &&
+                    a.data_type != BCH_DATA_user) ||
+                   a.dirty_sectors >= ca->mi.bucket_size ||
                    bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
                        continue;
 
                e = (struct copygc_heap_entry) {
                        .dev            = iter.pos.inode,
-                       .gen            = u.gen,
-                       .replicas       = 1 + u.stripe_redundancy,
-                       .fragmentation  = u.dirty_sectors * (1U << 15)
-                               / ca->mi.bucket_size,
-                       .sectors        = u.dirty_sectors,
-                       .offset         = bucket_to_sector(ca, iter.pos.offset),
+                       .gen            = a.gen,
+                       .replicas       = 1 + a.stripe_redundancy,
+                       .fragmentation  = div_u64((u64) a.dirty_sectors * (1ULL << 31),
+                                                 ca->mi.bucket_size),
+                       .sectors        = a.dirty_sectors,
+                       .bucket         = iter.pos.offset,
                };
                heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
 
@@ -179,77 +88,22 @@ static int walk_buckets_to_copygc(struct bch_fs *c)
        return ret;
 }
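
The fragmentation field is now a 31-bit fixed-point fill fraction (previously 15-bit), computed with div_u64() so the 64-bit intermediate cannot overflow. A worked example with illustrative numbers:

        /*
         * dirty_sectors = 128, bucket_size = 512 sectors:
         *
         *   fragmentation = 128 * 2^31 / 512 = 2^29 = 536870912
         *
         * i.e. a quarter of the 2^31 scale, matching a quarter-full bucket;
         * emptier buckets get smaller values and make better copygc targets.
         */
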
 
-static int bucket_inorder_cmp(const void *_l, const void *_r)
-{
-       const struct copygc_heap_entry *l = _l;
-       const struct copygc_heap_entry *r = _r;
-
-       return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
-}
-
-static int check_copygc_was_done(struct bch_fs *c,
-                                u64 *sectors_not_moved,
-                                u64 *buckets_not_moved)
-{
-       copygc_heap *h = &c->copygc_heap;
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_alloc_unpacked u;
-       struct copygc_heap_entry *i;
-       int ret = 0;
-
-       sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
-
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
-
-       for (i = h->data; i < h->data + h->used; i++) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
-
-               bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
-
-               ret = lockrestart_do(&trans,
-                               bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-               if (ret)
-                       break;
-
-               u = bch2_alloc_unpack(k);
-
-               if (u.gen == i->gen && u.dirty_sectors) {
-                       *sectors_not_moved += u.dirty_sectors;
-                       *buckets_not_moved += 1;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-       return ret;
-}
-
 static int bch2_copygc(struct bch_fs *c)
 {
        copygc_heap *h = &c->copygc_heap;
-       struct copygc_heap_entry e, *i;
+       struct copygc_heap_entry e;
        struct bch_move_stats move_stats;
-       u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
-       u64 sectors_reserved = 0;
-       u64 buckets_to_move, buckets_not_moved = 0;
        struct bch_dev *ca;
        unsigned dev_idx;
        size_t heap_size = 0;
-       int ret;
+       struct moving_context ctxt;
+       struct data_update_opts data_opts = {
+               .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
+       };
+       int ret = 0;
 
        bch_move_stats_init(&move_stats, "copygc");
 
-       /*
-        * Find buckets with lowest sector counts, skipping completely
-        * empty buckets, by building a maxheap sorted by sector count,
-        * and repeatedly replacing the maximum element until all
-        * buckets have been visited.
-        */
-       h->used = 0;
-
        for_each_rw_member(ca, c, dev_idx)
                heap_size += ca->mi.nbuckets >> 7;
 
@@ -261,87 +115,58 @@ static int bch2_copygc(struct bch_fs *c)
                }
        }
 
-       for_each_rw_member(ca, c, dev_idx) {
-               closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
-
-               spin_lock(&ca->fs->freelist_lock);
-               sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
-               spin_unlock(&ca->fs->freelist_lock);
-       }
-
-       ret = walk_buckets_to_copygc(c);
+       ret = find_buckets_to_copygc(c);
        if (ret) {
                bch2_fs_fatal_error(c, "error walking buckets to copygc!");
                return ret;
        }
 
        if (!h->used) {
-               bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
+               s64 wait = S64_MAX, dev_wait;
+               u64 dev_min_wait_fragmented = 0;
+               u64 dev_min_wait_allowed = 0;
+               int dev_min_wait = -1;
+
+               for_each_rw_member(ca, c, dev_idx) {
+                       struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+                       s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+                                              ca->mi.bucket_size) >> 1);
+                       s64 fragmented = usage.d[BCH_DATA_user].fragmented;
+
+                       dev_wait = max(0LL, allowed - fragmented);
+
+                       if (dev_min_wait < 0 || dev_wait < wait) {
+                               dev_min_wait = dev_idx;
+                               dev_min_wait_fragmented = fragmented;
+                               dev_min_wait_allowed    = allowed;
+                       }
+               }
+
+               bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu",
+                                   dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed);
                return 0;
        }
 
-       /*
-        * Our btree node allocations also come out of RESERVE_MOVINGGC:
-        */
-       sectors_reserved = (sectors_reserved * 3) / 4;
-       if (!sectors_reserved) {
-               bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
-               return -1;
-       }
+       heap_resort(h, fragmentation_cmp, NULL);
 
-       for (i = h->data; i < h->data + h->used; i++) {
-               sectors_to_move += i->sectors;
-               sectors_to_write += i->sectors * i->replicas;
-       }
+       bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+                             writepoint_ptr(&c->copygc_write_point),
+                             false);
 
-       while (sectors_to_write > sectors_reserved) {
+       /* not correct w.r.t. device removal */
+       while (h->used && !ret) {
                BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
-               sectors_to_write -= e.sectors * e.replicas;
+               ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen,
+                                            data_opts);
        }
 
-       buckets_to_move = h->used;
-
-       if (!buckets_to_move) {
-               bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
-                                   sectors_reserved);
-               return 0;
-       }
-
-       eytzinger0_sort(h->data, h->used,
-                       sizeof(h->data[0]),
-                       bucket_offset_cmp, NULL);
-
-       ret = bch2_move_data(c,
-                            0,                 POS_MIN,
-                            BTREE_ID_NR,       POS_MAX,
-                            NULL,
-                            writepoint_ptr(&c->copygc_write_point),
-                            copygc_pred, NULL,
-                            &move_stats);
-       if (ret) {
-               bch_err(c, "error %i from bch2_move_data() in copygc", ret);
-               return ret;
-       }
+       bch2_moving_ctxt_exit(&ctxt);
 
-       ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
-       if (ret) {
-               bch_err(c, "error %i from check_copygc_was_done()", ret);
-               return ret;
-       }
+       if (ret < 0 && ret != -EROFS)
+               bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
 
-       if (sectors_not_moved)
-               bch_warn_ratelimited(c,
-                       "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
-                        sectors_not_moved, sectors_to_move,
-                        buckets_not_moved, buckets_to_move,
-                        atomic64_read(&move_stats.sectors_moved),
-                        atomic64_read(&move_stats.keys_raced),
-                        atomic64_read(&move_stats.sectors_raced));
-
-       trace_copygc(c,
-                    atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
-                    buckets_to_move, buckets_not_moved);
-       return 0;
+       trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
+       return ret;
 }
 
 /*
@@ -367,8 +192,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
        for_each_rw_member(ca, c, dev_idx) {
                struct bch_dev_usage usage = bch2_dev_usage_read(ca);
 
-               fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
-                                       ca->mi.bucket_size) >> 1);
+               fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+                                      ca->mi.bucket_size) >> 1);
                fragmented = usage.d[BCH_DATA_user].fragmented;
 
                wait = min(wait, max(0LL, fragmented_allowed - fragmented));
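
With the allowance now derived from __dev_buckets_available() at RESERVE_none, the per-device arithmetic works out as follows (numbers invented for illustration):

        /*
         * 1000 available buckets of 512 sectors each:
         *   fragmented_allowed = (1000 * 512) >> 1 = 256000 sectors
         *
         * If BCH_DATA_user currently has 100000 fragmented sectors on the
         * device, it contributes max(0, 256000 - 100000) = 156000; the
         * minimum across rw members becomes the copygc wait amount.
         */
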
@@ -382,10 +207,11 @@ static int bch2_copygc_thread(void *arg)
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
        u64 last, wait;
+       int ret = 0;
 
        set_freezable();
 
-       while (!kthread_should_stop()) {
+       while (!ret && !kthread_should_stop()) {
                cond_resched();
 
                if (kthread_wait_freezable(c->copy_gc_enabled))
@@ -395,7 +221,7 @@ static int bch2_copygc_thread(void *arg)
                wait = bch2_copygc_wait_amount(c);
 
                if (wait > clock->max_slop) {
-                       trace_copygc_wait(c, wait, last + wait);
+                       trace_and_count(c, copygc_wait, c, wait, last + wait);
                        c->copygc_wait = last + wait;
                        bch2_kthread_io_clock_wait(clock, last + wait,
                                        MAX_SCHEDULE_TIMEOUT);
@@ -404,8 +230,11 @@ static int bch2_copygc_thread(void *arg)
 
                c->copygc_wait = 0;
 
-               if (bch2_copygc(c))
-                       break;
+               c->copygc_running = true;
+               ret = bch2_copygc(c);
+               c->copygc_running = false;
+
+               wake_up(&c->copygc_running_wq);
        }
 
        return 0;
@@ -423,6 +252,7 @@ void bch2_copygc_stop(struct bch_fs *c)
 int bch2_copygc_start(struct bch_fs *c)
 {
        struct task_struct *t;
+       int ret;
 
        if (c->copygc_thread)
                return 0;
@@ -434,9 +264,10 @@ int bch2_copygc_start(struct bch_fs *c)
                return -ENOMEM;
 
        t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
-       if (IS_ERR(t)) {
-               bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
-               return PTR_ERR(t);
+       ret = PTR_ERR_OR_ZERO(t);
+       if (ret) {
+               bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+               return ret;
        }
 
        get_task_struct(t);
@@ -449,4 +280,6 @@ int bch2_copygc_start(struct bch_fs *c)
 
 void bch2_fs_copygc_init(struct bch_fs *c)
 {
+       init_waitqueue_head(&c->copygc_running_wq);
+       c->copygc_running = false;
 }
index 922738247d03967b439d0ceb75ed451218f6d31d..e85c8136a46e9b7ad52e987c9d299c4d4c9cc581 100644 (file)
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_MOVINGGC_H
 #define _BCACHEFS_MOVINGGC_H
 
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
 void bch2_copygc_stop(struct bch_fs *);
 int bch2_copygc_start(struct bch_fs *);
 void bch2_fs_copygc_init(struct bch_fs *);
index 71bf26eb13d5dbcc767cced43f14f55a4cfa9b5c..407b221e8f6c9ced8d00e3c9db66c9d225abc3fc 100644 (file)
@@ -9,7 +9,14 @@
 #include "super-io.h"
 #include "util.h"
 
-#define x(t, n) #t,
+#include <linux/pretty-printers.h>
+
+#define x(t, n) [n] = #t,
+
+const char * const bch2_metadata_versions[] = {
+       BCH_METADATA_VERSIONS()
+       NULL
+};
 
 const char * const bch2_error_actions[] = {
        BCH_ERROR_ACTIONS()
@@ -28,6 +35,7 @@ const char * const bch2_sb_compat[] = {
 
 const char * const bch2_btree_ids[] = {
        BCH_BTREE_IDS()
+       "interior btree node",
        NULL
 };
 
@@ -96,6 +104,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
        [DT_SUBVOL]     = "subvol",
 };
 
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+       BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+       BUG();
+}
+
 void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
 {
 #define x(_name, ...)                                          \
@@ -209,62 +227,74 @@ static int bch2_mount_opt_lookup(const char *name)
        return bch2_opt_lookup(name);
 }
 
-static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v)
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
 {
        if (v < opt->min) {
-               if (msg)
-                       pr_err("invalid %s%s: too small (min %llu)",
-                              msg, opt->attr.name, opt->min);
+               if (err)
+                       prt_printf(err, "%s: too small (min %llu)",
+                              opt->attr.name, opt->min);
                return -ERANGE;
        }
 
        if (opt->max && v >= opt->max) {
-               if (msg)
-                       pr_err("invalid %s%s: too big (max %llu)",
-                              msg, opt->attr.name, opt->max);
+               if (err)
+                       prt_printf(err, "%s: too big (max %llu)",
+                              opt->attr.name, opt->max);
                return -ERANGE;
        }
 
        if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
-               if (msg)
-                       pr_err("invalid %s %s: not a multiple of 512",
-                              msg, opt->attr.name);
+               if (err)
+                       prt_printf(err, "%s: not a multiple of 512",
+                              opt->attr.name);
                return -EINVAL;
        }
 
        if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
-               if (msg)
-                       pr_err("invalid %s%s: must be a power of two",
-                              msg, opt->attr.name);
+               if (err)
+                       prt_printf(err, "%s: must be a power of two",
+                              opt->attr.name);
                return -EINVAL;
        }
 
        return 0;
 }
 
-int bch2_opt_parse(struct bch_fs *c, const char *msg,
+int bch2_opt_parse(struct bch_fs *c,
                   const struct bch_option *opt,
-                  const char *val, u64 *res)
+                  const char *val, u64 *res,
+                  struct printbuf *err)
 {
        ssize_t ret;
 
        switch (opt->type) {
        case BCH_OPT_BOOL:
                ret = kstrtou64(val, 10, res);
-               if (ret < 0)
+               if (ret < 0 || (*res != 0 && *res != 1)) {
+                       prt_printf(err, "%s: must be bool",
+                              opt->attr.name);
                        return ret;
+               }
                break;
        case BCH_OPT_UINT:
                ret = opt->flags & OPT_HUMAN_READABLE
                        ? bch2_strtou64_h(val, res)
                        : kstrtou64(val, 10, res);
-               if (ret < 0)
+               if (ret < 0) {
+                       if (err)
+                               prt_printf(err, "%s: must be a number",
+                                      opt->attr.name);
                        return ret;
+               }
                break;
        case BCH_OPT_STR:
                ret = match_string(opt->choices, -1, val);
-               if (ret < 0)
+               if (ret < 0) {
+                       if (err)
+                               prt_printf(err, "%s: invalid selection",
+                                      opt->attr.name);
                        return ret;
+               }
 
                *res = ret;
                break;
@@ -273,44 +303,49 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg,
                        return 0;
 
                ret = opt->parse(c, val, res);
-               if (ret < 0)
+               if (ret < 0) {
+                       if (err)
+                               prt_printf(err, "%s: parse error",
+                                      opt->attr.name);
                        return ret;
+               }
        }
 
-       return bch2_opt_validate(opt, msg, *res);
+       return bch2_opt_validate(opt, *res, err);
 }
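
With the message-prefix argument dropped, callers pass a printbuf and report or free it themselves, as bch2_parse_mount_opts() does below. A minimal sketch of the new calling convention (the option name and value are placeholders, and error handling is elided):

        struct printbuf err = PRINTBUF;
        u64 v;
        int id = bch2_opt_lookup("metadata_replicas");
        int ret = bch2_opt_parse(c, &bch2_opt_table[id], "2", &v, &err);

        if (ret < 0)
                pr_err("invalid option: %s", err.buf);
        printbuf_exit(&err);
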
 
-void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
+void bch2_opt_to_text(struct printbuf *out,
+                     struct bch_fs *c, struct bch_sb *sb,
                      const struct bch_option *opt, u64 v,
                      unsigned flags)
 {
        if (flags & OPT_SHOW_MOUNT_STYLE) {
                if (opt->type == BCH_OPT_BOOL) {
-                       pr_buf(out, "%s%s",
+                       prt_printf(out, "%s%s",
                               v ? "" : "no",
                               opt->attr.name);
                        return;
                }
 
-               pr_buf(out, "%s=", opt->attr.name);
+               prt_printf(out, "%s=", opt->attr.name);
        }
 
        switch (opt->type) {
        case BCH_OPT_BOOL:
        case BCH_OPT_UINT:
                if (opt->flags & OPT_HUMAN_READABLE)
-                       bch2_hprint(out, v);
+                       prt_human_readable_u64(out, v);
                else
-                       pr_buf(out, "%lli", v);
+                       prt_printf(out, "%lli", v);
                break;
        case BCH_OPT_STR:
                if (flags & OPT_SHOW_FULL_LIST)
-                       bch2_string_opt_to_text(out, opt->choices, v);
+                       prt_string_option(out, opt->choices, v);
                else
-                       pr_buf(out, opt->choices[v]);
+                       prt_printf(out, "%s", opt->choices[v]);
                break;
        case BCH_OPT_FN:
-               opt->to_text(out, c, v);
+               opt->to_text(out, c, sb, v);
                break;
        default:
                BUG();
@@ -356,6 +391,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
        char *copied_opts, *copied_opts_start;
        char *opt, *name, *val;
        int ret, id;
+       struct printbuf err = PRINTBUF;
        u64 v;
 
        if (!options)
@@ -375,8 +411,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
                        if (id < 0)
                                goto bad_opt;
 
-                       ret = bch2_opt_parse(c, "mount option ",
-                                            &bch2_opt_table[id], val, &v);
+                       ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
                        if (ret < 0)
                                goto bad_val;
                } else {
@@ -419,7 +454,7 @@ bad_opt:
        ret = -1;
        goto out;
 bad_val:
-       pr_err("Invalid value %s for mount option %s", val, name);
+       pr_err("Invalid mount option %s", err.buf);
        ret = -1;
        goto out;
 no_val:
@@ -428,9 +463,26 @@ no_val:
        goto out;
 out:
        kfree(copied_opts_start);
+       printbuf_exit(&err);
        return ret;
 }
 
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+       const struct bch_option *opt = bch2_opt_table + id;
+       u64 v;
+
+       v = opt->get_sb(sb);
+
+       if (opt->flags & OPT_SB_FIELD_ILOG2)
+               v = 1ULL << v;
+
+       if (opt->flags & OPT_SB_FIELD_SECTORS)
+               v <<= 9;
+
+       return v;
+}
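
bch2_opt_from_sb() is now a standalone helper so callers can decode a single superblock option; the ILOG2 and SECTORS flag handling is unchanged. For example, with values chosen purely for illustration:

        /*
         * A raw superblock field of 7:
         *   with OPT_SB_FIELD_ILOG2:             v = 1ULL << 7 = 128
         *   and with OPT_SB_FIELD_SECTORS too:   v = 128 << 9  = 65536
         *
         * so an ilog2-of-sectors encoding of 7 reads back as 64KiB.
         */
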
+
 /*
  * Initial options from superblock - here we don't want any options undefined,
  * any options the superblock doesn't specify are set to 0:
@@ -438,28 +490,14 @@ out:
 int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
 {
        unsigned id;
-       int ret;
 
        for (id = 0; id < bch2_opts_nr; id++) {
                const struct bch_option *opt = bch2_opt_table + id;
-               u64 v;
 
-               if (opt->get_sb == NO_SB_OPT)
+               if (opt->get_sb == BCH2_NO_SB_OPT)
                        continue;
 
-               v = opt->get_sb(sb);
-
-               if (opt->flags & OPT_SB_FIELD_ILOG2)
-                       v = 1ULL << v;
-
-               if (opt->flags & OPT_SB_FIELD_SECTORS)
-                       v <<= 9;
-
-               ret = bch2_opt_validate(opt, "superblock option ", v);
-               if (ret)
-                       return ret;
-
-               bch2_opt_set_by_id(opts, id, v);
+               bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
        }
 
        return 0;
@@ -467,7 +505,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
 
 void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
 {
-       if (opt->set_sb == SET_NO_SB_OPT)
+       if (opt->set_sb == SET_BCH2_NO_SB_OPT)
                return;
 
        if (opt->flags & OPT_SB_FIELD_SECTORS)
@@ -481,7 +519,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
 
 void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
 {
-       if (opt->set_sb == SET_NO_SB_OPT)
+       if (opt->set_sb == SET_BCH2_NO_SB_OPT)
                return;
 
        mutex_lock(&c->sb_lock);
index affe9233d708094c0ab032b170bda23f3deba763..5b8586ecb37431150a4321a73876f52691f8e768 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/sysfs.h>
 #include "bcachefs_format.h"
 
+extern const char * const bch2_metadata_versions[];
 extern const char * const bch2_error_actions[];
 extern const char * const bch2_sb_features[];
 extern const char * const bch2_sb_compat[];
@@ -42,7 +43,8 @@ static inline const char *bch2_d_type_str(unsigned d_type)
  */
 
 /* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT,                struct bch_sb, flags[0], 0, 0);
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
 
 /* When can be set: */
 enum opt_flags {
@@ -163,22 +165,22 @@ enum opt_type {
          OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_METADATA_TARGET,       0,                              \
-         "(target)",   "Device or disk group for metadata writes")     \
+         "(target)",   "Device or label for metadata writes")          \
        x(foreground_target,            u16,                            \
          OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_FOREGROUND_TARGET,     0,                              \
-         "(target)",   "Device or disk group for foreground writes")   \
+         "(target)",   "Device or label for foreground writes")        \
        x(background_target,            u16,                            \
          OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_BACKGROUND_TARGET,     0,                              \
-         "(target)",   "Device or disk group to move data to in the background")\
+         "(target)",   "Device or label to move data to in the background")\
        x(promote_target,               u16,                            \
          OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_PROMOTE_TARGET,        0,                              \
-         "(target)",   "Device or disk group to promote data to on read")\
+         "(target)",   "Device or label to promote data to on read")   \
        x(erasure_code,                 u16,                            \
          OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_BOOL(),                                                   \
@@ -202,7 +204,7 @@ enum opt_type {
        x(btree_node_mem_ptr_optimization, u8,                          \
          OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    true,                           \
+         BCH2_NO_SB_OPT,               true,                           \
          NULL,         "Stash pointer to in memory btree node in btree ptr")\
        x(gc_reserve_percent,           u8,                             \
          OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
@@ -229,7 +231,7 @@ enum opt_type {
        x(inline_data,                  u8,                             \
          OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    true,                           \
+         BCH2_NO_SB_OPT,               true,                           \
          NULL,         "Enable inline data extents")                   \
        x(acl,                          u8,                             \
          OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
@@ -254,26 +256,26 @@ enum opt_type {
        x(degraded,                     u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Allow mounting in degraded mode")              \
        x(very_degraded,                u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Allow mounting in when data will be missing")  \
        x(discard,                      u8,                             \
          OPT_FS|OPT_MOUNT|OPT_DEVICE,                                  \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               true,                           \
          NULL,         "Enable discard/TRIM support")                  \
        x(verbose,                      u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Extra debugging information during mount/recovery")\
        x(journal_flush_delay,          u32,                            \
          OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_UINT(0, U32_MAX),                                         \
+         OPT_UINT(1, U32_MAX),                                         \
          BCH_SB_JOURNAL_FLUSH_DELAY,   1000,                           \
          NULL,         "Delay in milliseconds before automatic journal commits")\
        x(journal_flush_disabled,       u8,                             \
@@ -288,107 +290,112 @@ enum opt_type {
          OPT_UINT(0, U32_MAX),                                         \
          BCH_SB_JOURNAL_RECLAIM_DELAY, 100,                            \
          NULL,         "Delay in milliseconds before automatic journal reclaim")\
+       x(move_bytes_in_flight,         u32,                            \
+         OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,              \
+         OPT_UINT(1024, U32_MAX),                                      \
+         BCH2_NO_SB_OPT,               1U << 20,                       \
+         NULL,         "Amount of IO in flight to keep in flight by the move path")\
        x(fsck,                         u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Run fsck on mount")                            \
        x(fix_errors,                   u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Fix errors during fsck without asking")        \
        x(ratelimit_errors,             u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    RATELIMIT_ERRORS_DEFAULT,       \
+         BCH2_NO_SB_OPT,               RATELIMIT_ERRORS_DEFAULT,       \
          NULL,         "Ratelimit error messages during fsck")         \
        x(nochanges,                    u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Super read only mode - no writes at all will be issued,\n"\
                        "even if we have to replay the journal")        \
        x(norecovery,                   u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Don't replay the journal")                     \
-       x(rebuild_replicas,             u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
-         NULL,         "Rebuild the superblock replicas section")      \
        x(keep_journal,                 u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Don't free journal entries/keys after startup")\
        x(read_entire_journal,          u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Read all journal entries, not just dirty ones")\
-       x(journal_transaction_names,    u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
+       x(read_journal_only,            u8,                             \
+         0,                                                            \
          OPT_BOOL(),                                                   \
-         BCH_SB_JOURNAL_TRANSACTION_NAMES, true,                       \
-         NULL,         "Log transaction function names in journal")    \
+         BCH2_NO_SB_OPT,                       false,                          \
+         NULL,         "Only read the journal, skip the rest of recovery")\
        x(noexcl,                       u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Don't open device in exclusive mode")          \
+       x(direct_io,                    u8,                             \
+         OPT_FS|OPT_MOUNT,                                             \
+         OPT_BOOL(),                                                   \
+         BCH2_NO_SB_OPT,                       true,                   \
+         NULL,         "Use O_DIRECT (userspace only)")                \
        x(sb,                           u64,                            \
          OPT_MOUNT,                                                    \
          OPT_UINT(0, S64_MAX),                                         \
-         NO_SB_OPT,                    BCH_SB_SECTOR,                  \
+         BCH2_NO_SB_OPT,                       BCH_SB_SECTOR,                  \
          "offset",     "Sector offset of superblock")                  \
        x(read_only,                    u8,                             \
          OPT_FS,                                                       \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         NULL)                                           \
        x(nostart,                      u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Don\'t start filesystem, only open devices")   \
        x(reconstruct_alloc,            u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Reconstruct alloc btree")                      \
        x(version_upgrade,              u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Set superblock to latest version,\n"           \
                        "allowing any new features to be used")         \
        x(buckets_nouse,                u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Allocate the buckets_nouse bitmap")            \
        x(project,                      u8,                             \
          OPT_INODE,                                                    \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH2_NO_SB_OPT,                       false,                          \
          NULL,         NULL)                                           \
        x(fs_size,                      u64,                            \
          OPT_DEVICE,                                                   \
          OPT_UINT(0, S64_MAX),                                         \
-         NO_SB_OPT,                    0,                              \
+         BCH2_NO_SB_OPT,               0,                              \
          "size",       "Size of filesystem on device")                 \
        x(bucket,                       u32,                            \
          OPT_DEVICE,                                                   \
          OPT_UINT(0, S64_MAX),                                         \
-         NO_SB_OPT,                    0,                              \
+         BCH2_NO_SB_OPT,               0,                              \
          "size",       "Size of filesystem on device")                 \
        x(durability,                   u8,                             \
          OPT_DEVICE,                                                   \
          OPT_UINT(0, BCH_REPLICAS_MAX),                                \
-         NO_SB_OPT,                    1,                              \
+         BCH2_NO_SB_OPT,               1,                              \
          "n",          "Data written to this device will be considered\n"\
                        "to have already been replicated n times")
 
@@ -447,17 +454,9 @@ struct bch_option {
        enum opt_flags          flags;
        u64                     min, max;
 
-       union {
-       struct {
-       };
-       struct {
-               const char * const *choices;
-       };
-       struct {
-               int (*parse)(struct bch_fs *, const char *, u64 *);
-               void (*to_text)(struct printbuf *, struct bch_fs *, u64);
-       };
-       };
+       const char * const *choices;
+       int (*parse)(struct bch_fs *, const char *, u64 *);
+       void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
 
        const char              *hint;
        const char              *help;
@@ -470,18 +469,20 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
 u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
 void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
 
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
 int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
 void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
 void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
 
 int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *,
-                  const char *, u64 *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+                  const char *, u64 *, struct printbuf *);
 
 #define OPT_SHOW_FULL_LIST     (1 << 0)
 #define OPT_SHOW_MOUNT_STYLE   (1 << 1)
 
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
                      const struct bch_option *, u64, unsigned);
 
 int bch2_opt_check_may_set(struct bch_fs *, int, u64);
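
The opts.h hunk above drops the anonymous union from struct bch_option, so the choices/parse/to_text members now sit side by side, and it threads a struct printbuf through bch2_opt_parse() and bch2_opt_to_text() so parse errors come back as formatted text. Below is a minimal standalone sketch of that shape, using invented names (opt_desc, errbuf) rather than the bcachefs types: an option descriptor whose parse hook and choices list are both optional, and whose errors are formatted into a caller-supplied buffer.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy analogue of a printbuf: errors are formatted into a caller buffer. */
struct errbuf { char msg[128]; };

/* Toy analogue of struct bch_option after the union was flattened:
 * choices and parse sit side by side and may each be NULL.          */
struct opt_desc {
	const char		*name;
	unsigned long long	min, max;
	const char * const	*choices;
	int (*parse)(const char *, unsigned long long *, struct errbuf *);
};

static int opt_parse(const struct opt_desc *opt, const char *val,
		     unsigned long long *res, struct errbuf *err)
{
	if (opt->parse)
		return opt->parse(val, res, err);

	if (opt->choices) {
		for (unsigned i = 0; opt->choices[i]; i++)
			if (!strcmp(val, opt->choices[i])) {
				*res = i;
				return 0;
			}
		snprintf(err->msg, sizeof(err->msg),
			 "%s: '%s' is not a valid choice", opt->name, val);
		return -EINVAL;
	}

	char *end;
	*res = strtoull(val, &end, 10);
	if (*end || *res < opt->min || *res > opt->max) {
		snprintf(err->msg, sizeof(err->msg),
			 "%s: %s is not in range %llu-%llu",
			 opt->name, val, opt->min, opt->max);
		return -ERANGE;
	}
	return 0;
}

int main(void)
{
	static const char * const compress[] = { "none", "lz4", "zstd", NULL };
	struct opt_desc o = { .name = "compression", .choices = compress };
	struct errbuf err = { "" };
	unsigned long long v;

	if (opt_parse(&o, "gzip", &v, &err))
		fprintf(stderr, "parse error: %s\n", err.msg);
	return 0;
}
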
index 6fb8224f565e3a00a2960a5dde41f2182a0dbe5a..db817273652771e73943a22629122313df9cd79b 100644 (file)
@@ -1,44 +1,81 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "btree_update.h"
+#include "errcode.h"
 #include "inode.h"
 #include "quota.h"
 #include "subvolume.h"
 #include "super-io.h"
 
-static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f,
+static const char * const bch2_quota_types[] = {
+       "user",
+       "group",
+       "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+       "space",
+       "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
                                  struct printbuf *err)
 {
        struct bch_sb_field_quota *q = field_to_type(f, quota);
 
        if (vstruct_bytes(&q->field) < sizeof(*q)) {
-               pr_buf(err, "wrong size (got %llu should be %zu)",
+               prt_printf(err, "wrong size (got %zu should be %zu)",
                       vstruct_bytes(&q->field), sizeof(*q));
+               return -EINVAL;
        }
 
        return 0;
 }
 
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_quota *q = field_to_type(f, quota);
+       unsigned qtyp, counter;
+
+       for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+               prt_printf(out, "%s: flags %llx",
+                      bch2_quota_types[qtyp],
+                      le64_to_cpu(q->q[qtyp].flags));
+
+               for (counter = 0; counter < Q_COUNTERS; counter++)
+                       prt_printf(out, " %s timelimit %u warnlimit %u",
+                              bch2_quota_counters[counter],
+                              le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+                              le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+               prt_newline(out);
+       }
+}
+
 const struct bch_sb_field_ops bch_sb_field_ops_quota = {
-       .validate       = bch2_sb_validate_quota,
+       .validate       = bch2_sb_quota_validate,
+       .to_text        = bch2_sb_quota_to_text,
 };
 
-const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                      int rw, struct printbuf *err)
 {
-       if (k.k->p.inode >= QTYP_NR)
-               return "invalid quota type";
+       if (k.k->p.inode >= QTYP_NR) {
+               prt_printf(err, "invalid quota type (%llu >= %u)",
+                      k.k->p.inode, QTYP_NR);
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
-               return "incorrect value size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_quota));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
-static const char * const bch2_quota_counters[] = {
-       "space",
-       "inodes",
-};
-
 void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
                        struct bkey_s_c k)
 {
@@ -46,7 +83,7 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
        unsigned i;
 
        for (i = 0; i < Q_COUNTERS; i++)
-               pr_buf(out, "%s hardlimit %llu softlimit %llu",
+               prt_printf(out, "%s hardlimit %llu softlimit %llu",
                       bch2_quota_counters[i],
                       le64_to_cpu(dq.v->c[i].hardlimit),
                       le64_to_cpu(dq.v->c[i].softlimit));
@@ -58,6 +95,113 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
 #include <linux/fs.h>
 #include <linux/quota.h>
 
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 20);
+
+       prt_str(out, "i_fieldmask");
+       prt_tab(out);
+       prt_printf(out, "%x", i->i_fieldmask);
+       prt_newline(out);
+
+       prt_str(out, "i_flags");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_flags);
+       prt_newline(out);
+
+       prt_str(out, "i_spc_timelimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_spc_timelimit);
+       prt_newline(out);
+
+       prt_str(out, "i_ino_timelimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_ino_timelimit);
+       prt_newline(out);
+
+       prt_str(out, "i_rt_spc_timelimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_rt_spc_timelimit);
+       prt_newline(out);
+
+       prt_str(out, "i_spc_warnlimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_spc_warnlimit);
+       prt_newline(out);
+
+       prt_str(out, "i_ino_warnlimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_ino_warnlimit);
+       prt_newline(out);
+
+       prt_str(out, "i_rt_spc_warnlimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+       prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 20);
+
+       prt_str(out, "d_fieldmask");
+       prt_tab(out);
+       prt_printf(out, "%x", q->d_fieldmask);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_hardlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_spc_hardlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_softlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_spc_softlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_hardlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_hardlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_softlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_softlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_space");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_space);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_count");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_count);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_timer");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_timer);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_timer");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_spc_timer);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_warns");
+       prt_tab(out);
+       prt_printf(out, "%i", q->d_ino_warns);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_warns");
+       prt_tab(out);
+       prt_printf(out, "%i", q->d_spc_warns);
+       prt_newline(out);
+}
+
 static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
 {
        qtypes >>= i;
@@ -188,34 +332,20 @@ static int bch2_quota_check_limit(struct bch_fs *c,
        if (qc->hardlimit &&
            qc->hardlimit < n &&
            !ignore_hardlimit(q)) {
-               if (mode == KEY_TYPE_QUOTA_PREALLOC)
-                       return -EDQUOT;
-
                prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+               return -EDQUOT;
        }
 
        if (qc->softlimit &&
-           qc->softlimit < n &&
-           qc->timer &&
-           ktime_get_real_seconds() >= qc->timer &&
-           !ignore_hardlimit(q)) {
-               if (mode == KEY_TYPE_QUOTA_PREALLOC)
-                       return -EDQUOT;
-
-               prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
-       }
-
-       if (qc->softlimit &&
-           qc->softlimit < n &&
-           qc->timer == 0) {
-               if (mode == KEY_TYPE_QUOTA_PREALLOC)
+           qc->softlimit < n) {
+               if (qc->timer == 0) {
+                       qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+                       prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+               } else if (ktime_get_real_seconds() >= qc->timer &&
+                          !ignore_hardlimit(q)) {
+                       prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
                        return -EDQUOT;
-
-               prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
-
-               /* XXX is this the right one? */
-               qc->timer = ktime_get_real_seconds() +
-                       q->limits[counter].warnlimit;
+               }
        }
 
        return 0;
@@ -325,7 +455,8 @@ err:
        return ret;
 }
 
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+                           struct qc_dqblk *qdq)
 {
        struct bkey_s_c_quota dq;
        struct bch_memquota_type *q;
@@ -334,6 +465,9 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
 
        BUG_ON(k.k->p.inode >= QTYP_NR);
 
+       if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+               return 0;
+
        switch (k.k->type) {
        case KEY_TYPE_quota:
                dq = bkey_s_c_to_quota(k);
@@ -351,36 +485,21 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
                        mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
                }
 
+               if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+                       mq->c[Q_SPC].timer      = cpu_to_le64(qdq->d_spc_timer);
+               if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+                       mq->c[Q_SPC].warns      = cpu_to_le64(qdq->d_spc_warns);
+               if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+                       mq->c[Q_INO].timer      = cpu_to_le64(qdq->d_ino_timer);
+               if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+                       mq->c[Q_INO].warns      = cpu_to_le64(qdq->d_ino_warns);
+
                mutex_unlock(&q->lock);
        }
 
        return 0;
 }
 
-static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0),
-                          BTREE_ITER_PREFETCH, k, ret) {
-               if (k.k->p.inode != type)
-                       break;
-
-               ret = __bch2_quota_set(c, k);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-       return ret;
-}
-
 void bch2_fs_quota_exit(struct bch_fs *c)
 {
        unsigned i;
@@ -397,6 +516,26 @@ void bch2_fs_quota_init(struct bch_fs *c)
                mutex_init(&c->quotas[i].lock);
 }
 
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+       struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb);
+
+       if (sb_quota)
+               return sb_quota;
+
+       sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64));
+       if (sb_quota) {
+               unsigned qtype, qc;
+
+               for (qtype = 0; qtype < QTYP_NR; qtype++)
+                       for (qc = 0; qc < Q_COUNTERS; qc++)
+                               sb_quota->q[qtype].c[qc].timelimit =
+                                       cpu_to_le32(7 * 24 * 60 * 60);
+       }
+
+       return sb_quota;
+}
+
 static void bch2_sb_quota_read(struct bch_fs *c)
 {
        struct bch_sb_field_quota *sb_quota;
@@ -419,22 +558,14 @@ static void bch2_sb_quota_read(struct bch_fs *c)
 }
 
 static int bch2_fs_quota_read_inode(struct btree_trans *trans,
-                                   struct btree_iter *iter)
+                                   struct btree_iter *iter,
+                                   struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
        struct bch_inode_unpacked u;
        struct bch_subvolume subvolume;
-       struct bkey_s_c k;
        int ret;
 
-       k = bch2_btree_iter_peek(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (!k.k)
-               return 1;
-
        ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
        if (ret)
                return ret;
@@ -463,36 +594,35 @@ advance:
 
 int bch2_fs_quota_read(struct bch_fs *c)
 {
-       unsigned i, qtypes = enabled_qtypes(c);
-       struct bch_memquota_type *q;
+       struct bch_sb_field_quota *sb_quota;
        struct btree_trans trans;
        struct btree_iter iter;
+       struct bkey_s_c k;
        int ret;
 
        mutex_lock(&c->sb_lock);
+       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+       if (!sb_quota) {
+               mutex_unlock(&c->sb_lock);
+               return -BCH_ERR_ENOSPC_sb_quota;
+       }
+
        bch2_sb_quota_read(c);
        mutex_unlock(&c->sb_lock);
 
-       for_each_set_qtype(c, i, q, qtypes) {
-               ret = bch2_quota_init_type(c, i);
-               if (ret)
-                       return ret;
-       }
-
        bch2_trans_init(&trans, c, 0, 0);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       do {
-               ret = lockrestart_do(&trans,
-                                    bch2_fs_quota_read_inode(&trans, &iter));
-       } while (!ret);
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+                       POS_MIN, BTREE_ITER_PREFETCH, k,
+               __bch2_quota_set(c, k, NULL)) ?:
+             for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+                       POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+               bch2_fs_quota_read_inode(&trans, &iter, k));
+       if (ret)
+               bch_err(c, "err in quota_read: %s", bch2_err_str(ret));
 
        bch2_trans_exit(&trans);
-       return ret < 0 ? ret : 0;
+       return ret;
 }
 
 /* Enable/disable/delete quotas for an entire filesystem: */
@@ -500,6 +630,8 @@ int bch2_fs_quota_read(struct bch_fs *c)
 static int bch2_quota_enable(struct super_block        *sb, unsigned uflags)
 {
        struct bch_fs *c = sb->s_fs_info;
+       struct bch_sb_field_quota *sb_quota;
+       int ret = 0;
 
        if (sb->s_flags & SB_RDONLY)
                return -EROFS;
@@ -519,6 +651,12 @@ static int bch2_quota_enable(struct super_block    *sb, unsigned uflags)
                return -EINVAL;
 
        mutex_lock(&c->sb_lock);
+       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+       if (!sb_quota) {
+               ret = -BCH_ERR_ENOSPC_sb_quota;
+               goto unlock;
+       }
+
        if (uflags & FS_QUOTA_UDQ_ENFD)
                SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
 
@@ -529,9 +667,10 @@ static int bch2_quota_enable(struct super_block    *sb, unsigned uflags)
                SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
 
        bch2_write_super(c);
+unlock:
        mutex_unlock(&c->sb_lock);
 
-       return 0;
+       return bch2_err_class(ret);
 }
 
 static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
@@ -643,6 +782,15 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        struct bch_fs *c = sb->s_fs_info;
        struct bch_sb_field_quota *sb_quota;
        struct bch_memquota_type *q;
+       int ret = 0;
+
+       if (0) {
+               struct printbuf buf = PRINTBUF;
+
+               qc_info_to_text(&buf, info);
+               pr_info("setting:\n%s", buf.buf);
+               printbuf_exit(&buf);
+       }
 
        if (sb->s_flags & SB_RDONLY)
                return -EROFS;
@@ -660,12 +808,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        q = &c->quotas[type];
 
        mutex_lock(&c->sb_lock);
-       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
        if (!sb_quota) {
-               sb_quota = bch2_sb_resize_quota(&c->disk_sb,
-                                       sizeof(*sb_quota) / sizeof(u64));
-               if (!sb_quota)
-                       return -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_sb_quota;
+               goto unlock;
        }
 
        if (info->i_fieldmask & QC_SPC_TIMER)
@@ -687,9 +833,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        bch2_sb_quota_read(c);
 
        bch2_write_super(c);
+unlock:
        mutex_unlock(&c->sb_lock);
 
-       return 0;
+       return bch2_err_class(ret);
 }
 
 /* Get/set individual quotas: */
@@ -794,6 +941,14 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
        struct bkey_i_quota new_quota;
        int ret;
 
+       if (0) {
+               struct printbuf buf = PRINTBUF;
+
+               qc_dqblk_to_text(&buf, qdq);
+               pr_info("setting:\n%s", buf.buf);
+               printbuf_exit(&buf);
+       }
+
        if (sb->s_flags & SB_RDONLY)
                return -EROFS;
 
@@ -802,7 +957,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
                            bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
-               __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+               __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
 
        return ret;
 }
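
The quota.c hunks above rework bch2_quota_check_limit(): crossing the soft limit now arms the per-counter timer from the superblock timelimit (which bch2_sb_get_or_create_quota() defaults to 7 * 24 * 60 * 60 = 604800 seconds), and -EDQUOT is only returned once that grace period has expired or the hard limit is exceeded. A standalone sketch of that state machine follows; the names are invented, the clock is plain time(), and the reset to timer = 0 when usage drops back under the soft limit is a simplification added here, not something shown in the hunk.

#include <errno.h>
#include <stdio.h>
#include <time.h>

/* Invented, simplified counter state: the hard/soft/timer triple checked
 * by the rewritten quota limit logic above.                            */
struct qcounter {
	unsigned long long	hardlimit;
	unsigned long long	softlimit;
	long long		timer;		/* 0 = grace period not running */
};

/* Returns 0 if the allocation may proceed, -EDQUOT otherwise. */
static int quota_check(struct qcounter *qc, unsigned long long new_usage,
		       long long now, long long grace_seconds)
{
	if (qc->hardlimit && new_usage > qc->hardlimit)
		return -EDQUOT;			/* hard limit: always refuse */

	if (qc->softlimit && new_usage > qc->softlimit) {
		if (!qc->timer) {
			/* first time over the soft limit: start grace period */
			qc->timer = now + grace_seconds;
		} else if (now >= qc->timer) {
			/* grace period expired: behave like a hard limit */
			return -EDQUOT;
		}
	} else {
		qc->timer = 0;			/* back under the soft limit */
	}

	return 0;
}

int main(void)
{
	struct qcounter qc = { .hardlimit = 1000, .softlimit = 500 };
	long long now = time(NULL);
	long long week = 7 * 24 * 60 * 60;

	printf("%d\n", quota_check(&qc, 600, now, week));		/* 0, timer armed */
	printf("%d\n", quota_check(&qc, 600, now + week + 1, week));	/* -EDQUOT */
	return 0;
}
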
index 51e4f9713ef0bd7904b7aea90ee72dcafcaf5ad9..8c67ae1da7c75806fff2ee4a22182bdd704799aa 100644 (file)
@@ -7,7 +7,7 @@
 
 extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
 
-const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
 void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_quota (struct bkey_ops) {                \
index a573fede05b11fba7a5ada92b9bbfae322608612..17b289b051f290bbce8fb6053ce20d07375f0218 100644 (file)
@@ -6,6 +6,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "disk_groups.h"
+#include "errcode.h"
 #include "extents.h"
 #include "io.h"
 #include "move.h"
  * returns -1 if it should not be moved, or
  * device of pointer that should be moved, if known, or INT_MAX if unknown
  */
-static int __bch2_rebalance_pred(struct bch_fs *c,
-                                struct bkey_s_c k,
-                                struct bch_io_opts *io_opts)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+                          struct bkey_s_c k,
+                          struct bch_io_opts *io_opts,
+                          struct data_update_opts *data_opts)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
+       unsigned i;
+
+       data_opts->rewrite_ptrs         = 0;
+       data_opts->target               = io_opts->background_target;
+       data_opts->extra_replicas       = 0;
+       data_opts->btree_insert_flags   = 0;
 
        if (io_opts->background_compression &&
-           !bch2_bkey_is_incompressible(k))
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+           !bch2_bkey_is_incompressible(k)) {
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+
+               i = 0;
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        if (!p.ptr.cached &&
                            p.crc.compression_type !=
                            bch2_compression_opt_to_type[io_opts->background_compression])
-                               return p.ptr.dev;
+                               data_opts->rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
 
-       if (io_opts->background_target)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       if (!p.ptr.cached &&
-                           !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
-                               return p.ptr.dev;
+       if (io_opts->background_target) {
+               const struct bch_extent_ptr *ptr;
+
+               i = 0;
+               bkey_for_each_ptr(ptrs, ptr) {
+                       if (!ptr->cached &&
+                           !bch2_dev_in_target(c, ptr->dev, io_opts->background_target))
+                               data_opts->rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
 
-       return -1;
+       return data_opts->rewrite_ptrs != 0;
 }
 
 void bch2_rebalance_add_key(struct bch_fs *c,
                            struct bkey_s_c k,
                            struct bch_io_opts *io_opts)
 {
-       atomic64_t *counter;
-       int dev;
+       struct data_update_opts update_opts = { 0 };
+       struct bkey_ptrs_c ptrs;
+       const struct bch_extent_ptr *ptr;
+       unsigned i;
 
-       dev = __bch2_rebalance_pred(c, k, io_opts);
-       if (dev < 0)
+       if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
                return;
 
-       counter = dev < INT_MAX
-               ? &bch_dev_bkey_exists(c, dev)->rebalance_work
-               : &c->rebalance.work_unknown_dev;
-
-       if (atomic64_add_return(k.k->size, counter) == k.k->size)
-               rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
-                                   struct bkey_s_c k,
-                                   struct bch_io_opts *io_opts,
-                                   struct data_opts *data_opts)
-{
-       if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
-               data_opts->target               = io_opts->background_target;
-               data_opts->nr_replicas          = 1;
-               data_opts->btree_insert_flags   = 0;
-               return DATA_ADD_REPLICAS;
-       } else {
-               return DATA_SKIP;
+       i = 0;
+       ptrs = bch2_bkey_ptrs_c(k);
+       bkey_for_each_ptr(ptrs, ptr) {
+               if ((1U << i) && update_opts.rewrite_ptrs)
+                       if (atomic64_add_return(k.k->size,
+                                       &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
+                           k.k->size)
+                               rebalance_wakeup(c);
+               i++;
        }
 }
 
@@ -245,9 +254,10 @@ static int bch2_rebalance_thread(void *arg)
                               BTREE_ID_NR,     POS_MAX,
                               /* ratelimiting disabled for now */
                               NULL, /*  &r->pd.rate, */
+                              &move_stats,
                               writepoint_ptr(&c->rebalance_write_point),
-                              rebalance_pred, NULL,
-                              &move_stats);
+                              true,
+                              rebalance_pred, NULL);
        }
 
        return 0;
@@ -257,35 +267,48 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct bch_fs_rebalance *r = &c->rebalance;
        struct rebalance_work w = rebalance_work(c);
-       char h1[21], h2[21];
 
-       bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
-       bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
-       pr_buf(out, "fullest_dev (%i):\t%s/%s\n",
-              w.dev_most_full_idx, h1, h2);
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 20);
+
+       prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
+       prt_tab(out);
+
+       prt_human_readable_u64(out, w.dev_most_full_work << 9);
+       prt_printf(out, "/");
+       prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
+       prt_newline(out);
+
+       prt_printf(out, "total work:");
+       prt_tab(out);
 
-       bch2_hprint(&PBUF(h1), w.total_work << 9);
-       bch2_hprint(&PBUF(h2), c->capacity << 9);
-       pr_buf(out, "total work:\t\t%s/%s\n", h1, h2);
+       prt_human_readable_u64(out, w.total_work << 9);
+       prt_printf(out, "/");
+       prt_human_readable_u64(out, c->capacity << 9);
+       prt_newline(out);
 
-       pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate);
+       prt_printf(out, "rate:");
+       prt_tab(out);
+       prt_printf(out, "%u", r->pd.rate.rate);
+       prt_newline(out);
 
        switch (r->state) {
        case REBALANCE_WAITING:
-               pr_buf(out, "waiting\n");
+               prt_printf(out, "waiting");
                break;
        case REBALANCE_THROTTLED:
-               bch2_hprint(&PBUF(h1),
+               prt_printf(out, "throttled for %lu sec or ",
+                      (r->throttled_until_cputime - jiffies) / HZ);
+               prt_human_readable_u64(out,
                            (r->throttled_until_iotime -
                             atomic64_read(&c->io_clock[WRITE].now)) << 9);
-               pr_buf(out, "throttled for %lu sec or %s io\n",
-                      (r->throttled_until_cputime - jiffies) / HZ,
-                      h1);
+               prt_printf(out, " io");
                break;
        case REBALANCE_RUNNING:
-               pr_buf(out, "running\n");
+               prt_printf(out, "running");
                break;
        }
+       prt_newline(out);
 }
 
 void bch2_rebalance_stop(struct bch_fs *c)
@@ -310,6 +333,7 @@ void bch2_rebalance_stop(struct bch_fs *c)
 int bch2_rebalance_start(struct bch_fs *c)
 {
        struct task_struct *p;
+       int ret;
 
        if (c->rebalance.thread)
                return 0;
@@ -318,9 +342,10 @@ int bch2_rebalance_start(struct bch_fs *c)
                return 0;
 
        p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
-       if (IS_ERR(p)) {
-               bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
-               return PTR_ERR(p);
+       ret = PTR_ERR_OR_ZERO(p);
+       if (ret) {
+               bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+               return ret;
        }
 
        get_task_struct(p);
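
In the rebalance.c hunks above, rebalance_pred() no longer picks a single device to move; it fills data_update_opts.rewrite_ptrs with one bit per extent pointer that needs rewriting (wrong compression type, or not on the background target), and bch2_rebalance_add_key() walks the pointers again to credit rebalance_work to each marked device. Note that the per-pointer test there is written (1U << i) && update_opts.rewrite_ptrs, a boolean AND; a bitwise test, as in the sketch below, is presumably the intent. The sketch uses invented types and plain counters, not the bcachefs ones.

#include <stdbool.h>
#include <stdio.h>

/* Invented, minimal stand-in for an extent's pointers. */
struct ptr {
	unsigned	dev;
	bool		cached;
	bool		compressed_ok;	/* already in the wanted compression format? */
	bool		in_target;	/* already on the wanted device group? */
};

/* Build a bitmask of pointers that should be rewritten. */
static unsigned rewrite_ptrs_mask(const struct ptr *ptrs, unsigned nr)
{
	unsigned mask = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (ptrs[i].cached)
			continue;
		if (!ptrs[i].compressed_ok || !ptrs[i].in_target)
			mask |= 1U << i;
	}
	return mask;
}

/* Credit pending work to each device whose pointer is marked. */
static void add_rebalance_work(const struct ptr *ptrs, unsigned nr,
			       unsigned mask, unsigned long long sectors,
			       unsigned long long work_per_dev[])
{
	for (unsigned i = 0; i < nr; i++)
		if (mask & (1U << i))		/* bitwise test, one bit per pointer */
			work_per_dev[ptrs[i].dev] += sectors;
}

int main(void)
{
	struct ptr ptrs[] = {
		{ .dev = 0, .compressed_ok = true,  .in_target = true  },
		{ .dev = 1, .compressed_ok = false, .in_target = true  },
		{ .dev = 2, .cached = true },
	};
	unsigned long long work[3] = { 0 };
	unsigned mask = rewrite_ptrs_mask(ptrs, 3);

	add_rebalance_work(ptrs, 3, mask, 128, work);
	printf("mask=%x work[1]=%llu\n", mask, work[1]);	/* mask=2 work[1]=128 */
	return 0;
}
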
index 543db58ff4d6a087b57aae732da116ccd993998f..ea7810a1797500c826ce1b40b162d7725e15d2b9 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "backpointers.h"
 #include "bkey_buf.h"
 #include "alloc_background.h"
 #include "btree_gc.h"
 #include "buckets.h"
 #include "dirent.h"
 #include "ec.h"
+#include "errcode.h"
 #include "error.h"
 #include "fs-common.h"
 #include "fsck.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
+#include "lru.h"
 #include "move.h"
 #include "quota.h"
 #include "recovery.h"
@@ -71,40 +74,119 @@ static int journal_key_cmp(const struct journal_key *l, const struct journal_key
        return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
 }
 
-size_t bch2_journal_key_search(struct journal_keys *journal_keys,
-                              enum btree_id id, unsigned level,
-                              struct bpos pos)
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
 {
-       size_t l = 0, r = journal_keys->nr, m;
+       size_t gap_size = keys->size - keys->nr;
+
+       if (idx >= keys->gap)
+               idx += gap_size;
+       return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+       return keys->d + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+                                       enum btree_id id, unsigned level,
+                                       struct bpos pos)
+{
+       size_t l = 0, r = keys->nr, m;
 
        while (l < r) {
                m = l + ((r - l) >> 1);
-               if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
+               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
                        l = m + 1;
                else
                        r = m;
        }
 
-       BUG_ON(l < journal_keys->nr &&
-              __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
+       BUG_ON(l < keys->nr &&
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
 
        BUG_ON(l &&
-              __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
 
        return l;
 }
 
-static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+                                     enum btree_id id, unsigned level,
+                                     struct bpos pos)
 {
-       struct bkey_i *n = iter->keys->d[idx].k;
-       struct btree_and_journal_iter *biter =
-               container_of(iter, struct btree_and_journal_iter, journal);
-
-       if (iter->idx > idx ||
-           (iter->idx == idx &&
-            biter->last &&
-            bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
-               iter->idx++;
+       return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+                                          unsigned level, struct bpos pos,
+                                          struct bpos end_pos, size_t *idx)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       unsigned iters = 0;
+       struct journal_key *k;
+search:
+       if (!*idx)
+               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+       while (*idx < keys->nr &&
+              (k = idx_to_key(keys, *idx),
+               k->btree_id == btree_id &&
+               k->level == level &&
+               bpos_cmp(k->k->k.p, end_pos) <= 0)) {
+               if (bpos_cmp(k->k->k.p, pos) >= 0 &&
+                   !k->overwritten)
+                       return k->k;
+
+               (*idx)++;
+               iters++;
+               if (iters == 10) {
+                       *idx = 0;
+                       goto search;
+               }
+       }
+
+       return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+                                          unsigned level, struct bpos pos)
+{
+       size_t idx = 0;
+
+       return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       /* The key we just inserted is immediately before the gap: */
+       size_t gap_end = keys->gap + (keys->size - keys->nr);
+       struct btree_and_journal_iter *iter;
+
+       /*
+        * If an iterator points one after the key we just inserted, decrement
+        * the iterator so it points at the key we just inserted - if the
+        * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+        * handle that:
+        */
+       list_for_each_entry(iter, &c->journal_iters, journal.list)
+               if (iter->journal.idx == gap_end)
+                       iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_iter *iter;
+       size_t gap_size = keys->size - keys->nr;
+
+       list_for_each_entry(iter, &c->journal_iters, list) {
+               if (iter->idx > old_gap)
+                       iter->idx -= gap_size;
+               if (iter->idx >= new_gap)
+                       iter->idx += gap_size;
+       }
 }
 
 int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
@@ -122,12 +204,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                .journal_seq    = U32_MAX,
        };
        struct journal_keys *keys = &c->journal_keys;
-       struct journal_iter *iter;
        size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
 
        BUG_ON(test_bit(BCH_FS_RW, &c->flags));
 
-       if (idx < keys->nr &&
+       if (idx < keys->size &&
            journal_key_cmp(&n, &keys->d[idx]) == 0) {
                if (keys->d[idx].allocated)
                        kfree(keys->d[idx].k);
@@ -135,29 +216,40 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                return 0;
        }
 
+       if (idx > keys->gap)
+               idx -= keys->size - keys->nr;
+
        if (keys->nr == keys->size) {
                struct journal_keys new_keys = {
                        .nr                     = keys->nr,
-                       .size                   = keys->size * 2,
-                       .journal_seq_base       = keys->journal_seq_base,
+                       .size                   = max_t(size_t, keys->size, 8) * 2,
                };
 
-               new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+               new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
                if (!new_keys.d) {
                        bch_err(c, "%s: error allocating new key array (size %zu)",
                                __func__, new_keys.size);
                        return -ENOMEM;
                }
 
+               /* Since @keys was full, there was no gap: */
                memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
                kvfree(keys->d);
                *keys = new_keys;
+
+               /* And now the gap is at the end: */
+               keys->gap = keys->nr;
        }
 
-       array_insert_item(keys->d, keys->nr, idx, n);
+       journal_iters_move_gap(c, keys->gap, idx);
 
-       list_for_each_entry(iter, &c->journal_iters, list)
-               journal_iter_fix(c, iter, idx);
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+       keys->gap = idx;
+
+       keys->nr++;
+       keys->d[keys->gap++] = n;
+
+       journal_iters_fix(c);
 
        return 0;
 }
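
The recovery.c hunks above turn the sorted journal-keys array into a gap buffer: the spare capacity (keys->size - keys->nr) is kept as a movable gap starting at keys->gap, idx_to_pos() maps a logical index to its array slot by skipping over that gap, and bch2_journal_key_insert_take() inserts by moving the gap to the insertion point instead of shifting the whole tail. A standalone sketch of that index mapping and gap move, over a small array of ints with invented names:

#include <stdio.h>
#include <string.h>

/* A tiny gap buffer over ints: nr live elements in an array of 'size' slots,
 * with the (size - nr) unused slots forming a gap at logical index 'gap'.  */
struct gapbuf {
	int	d[16];
	size_t	nr, size, gap;
};

/* Map a logical index (0..nr-1) to its physical slot. */
static size_t idx_to_pos(const struct gapbuf *b, size_t idx)
{
	return idx < b->gap ? idx : idx + (b->size - b->nr);
}

/* Move the gap so that it starts at logical index 'new_gap'. */
static void move_gap(struct gapbuf *b, size_t new_gap)
{
	size_t gap_size = b->size - b->nr;

	if (new_gap < b->gap)
		memmove(b->d + new_gap + gap_size, b->d + new_gap,
			(b->gap - new_gap) * sizeof(b->d[0]));
	else if (new_gap > b->gap)
		memmove(b->d + b->gap, b->d + b->gap + gap_size,
			(new_gap - b->gap) * sizeof(b->d[0]));
	b->gap = new_gap;
}

/* Insert at logical index idx: move the gap there, fill its first slot. */
static void insert_at(struct gapbuf *b, size_t idx, int v)
{
	move_gap(b, idx);
	b->d[b->gap++] = v;
	b->nr++;
}

int main(void)
{
	struct gapbuf b = { .d = { 10, 20, 40, 50 }, .nr = 4, .size = 16, .gap = 4 };

	insert_at(&b, 2, 30);		/* keeps the array sorted: 10 20 30 40 50 */

	for (size_t i = 0; i < b.nr; i++)
		printf("%d ", b.d[idx_to_pos(&b, i)]);
	printf("\n");
	return 0;
}
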
@@ -201,34 +293,37 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
        struct journal_keys *keys = &c->journal_keys;
        size_t idx = bch2_journal_key_search(keys, btree, level, pos);
 
-       if (idx < keys->nr &&
+       if (idx < keys->size &&
            keys->d[idx].btree_id       == btree &&
            keys->d[idx].level          == level &&
            !bpos_cmp(keys->d[idx].k->k.p, pos))
                keys->d[idx].overwritten = true;
 }
 
-static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+       if (iter->idx < iter->keys->size) {
+               iter->idx++;
+               if (iter->idx == iter->keys->gap)
+                       iter->idx += iter->keys->size - iter->keys->nr;
+       }
+}
+
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
 {
        struct journal_key *k = iter->keys->d + iter->idx;
 
-       while (k < iter->keys->d + iter->keys->nr &&
+       while (k < iter->keys->d + iter->keys->size &&
               k->btree_id      == iter->btree_id &&
               k->level         == iter->level) {
                if (!k->overwritten)
-                       return k->k;
+                       return bkey_i_to_s_c(k->k);
 
-               iter->idx++;
+               bch2_journal_iter_advance(iter);
                k = iter->keys->d + iter->idx;
        }
 
-       return NULL;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-       if (iter->idx < iter->keys->nr)
-               iter->idx++;
+       return bkey_s_c_null;
 }
 
 static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -260,71 +355,49 @@ static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
 {
-       switch (iter->last) {
-       case none:
-               break;
-       case btree:
-               bch2_journal_iter_advance_btree(iter);
-               break;
-       case journal:
-               bch2_journal_iter_advance(&iter->journal);
-               break;
-       }
-
-       iter->last = none;
+       if (!bpos_cmp(iter->pos, SPOS_MAX))
+               iter->at_end = true;
+       else
+               iter->pos = bpos_successor(iter->pos);
 }
 
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
 {
-       struct bkey_s_c ret;
-
-       while (1) {
-               struct bkey_s_c btree_k         =
-                       bch2_journal_iter_peek_btree(iter);
-               struct bkey_s_c journal_k       =
-                       bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
+       struct bkey_s_c btree_k, journal_k, ret;
+again:
+       if (iter->at_end)
+               return bkey_s_c_null;
 
-               if (btree_k.k && journal_k.k) {
-                       int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
+       while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+              bpos_cmp(btree_k.k->p, iter->pos) < 0)
+               bch2_journal_iter_advance_btree(iter);
 
-                       if (!cmp)
-                               bch2_journal_iter_advance_btree(iter);
+       while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+              bpos_cmp(journal_k.k->p, iter->pos) < 0)
+               bch2_journal_iter_advance(&iter->journal);
 
-                       iter->last = cmp < 0 ? btree : journal;
-               } else if (btree_k.k) {
-                       iter->last = btree;
-               } else if (journal_k.k) {
-                       iter->last = journal;
-               } else {
-                       iter->last = none;
-                       return bkey_s_c_null;
-               }
+       ret = journal_k.k &&
+               (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0)
+               ? journal_k
+               : btree_k;
 
-               ret = iter->last == journal ? journal_k : btree_k;
+       if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0)
+               ret = bkey_s_c_null;
 
-               if (iter->b &&
-                   bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
-                       iter->journal.idx = iter->journal.keys->nr;
-                       iter->last = none;
-                       return bkey_s_c_null;
+       if (ret.k) {
+               iter->pos = ret.k->p;
+               if (bkey_deleted(ret.k)) {
+                       bch2_btree_and_journal_iter_advance(iter);
+                       goto again;
                }
-
-               if (!bkey_deleted(ret.k))
-                       break;
-
-               bch2_btree_and_journal_iter_advance(iter);
+       } else {
+               iter->pos = SPOS_MAX;
+               iter->at_end = true;
        }
 
        return ret;
 }
 
-struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
-{
-       bch2_btree_and_journal_iter_advance(iter);
-
-       return bch2_btree_and_journal_iter_peek(iter);
-}
-
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
 {
        bch2_journal_iter_exit(&iter->journal);
@@ -342,6 +415,8 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
        iter->node_iter = node_iter;
        bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
        INIT_LIST_HEAD(&iter->journal.list);
+       iter->pos = b->data->min_key;
+       iter->at_end = false;
 }
 
 /*
@@ -361,16 +436,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i
 
 /* sort and dedup all keys in the journal: */
 
-void bch2_journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct bch_fs *c)
 {
-
-       while (!list_empty(list)) {
-               struct journal_replay *i =
-                       list_first_entry(list, struct journal_replay, list);
-               list_del(&i->list);
-               kvpfree(i, offsetof(struct journal_replay, j) +
-                       vstruct_bytes(&i->j));
-       }
+       struct journal_replay **i;
+       struct genradix_iter iter;
+
+       genradix_for_each(&c->journal_entries, iter, i)
+               if (*i)
+                       kvpfree(*i, offsetof(struct journal_replay, j) +
+                               vstruct_bytes(&(*i)->j));
+       genradix_free(&c->journal_entries);
 }
 
 /*
@@ -390,66 +465,68 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 {
        struct journal_key *i;
 
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
        for (i = keys->d; i < keys->d + keys->nr; i++)
                if (i->allocated)
                        kfree(i->k);
 
        kvfree(keys->d);
        keys->d = NULL;
-       keys->nr = 0;
+       keys->nr = keys->gap = keys->size = 0;
 }
 
-static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
+static int journal_keys_sort(struct bch_fs *c)
 {
-       struct journal_replay *i;
+       struct genradix_iter iter;
+       struct journal_replay *i, **_i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
-       struct journal_keys keys = { NULL };
+       struct journal_keys *keys = &c->journal_keys;
        struct journal_key *src, *dst;
        size_t nr_keys = 0;
 
-       if (list_empty(journal_entries))
-               return keys;
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
 
-       list_for_each_entry(i, journal_entries, list) {
-               if (i->ignore)
+               if (!i || i->ignore)
                        continue;
 
-               if (!keys.journal_seq_base)
-                       keys.journal_seq_base = le64_to_cpu(i->j.seq);
-
                for_each_jset_key(k, _n, entry, &i->j)
                        nr_keys++;
        }
 
-       keys.size = roundup_pow_of_two(nr_keys);
+       if (!nr_keys)
+               return 0;
 
-       keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
-       if (!keys.d)
-               goto err;
+       keys->size = roundup_pow_of_two(nr_keys);
 
-       list_for_each_entry(i, journal_entries, list) {
-               if (i->ignore)
-                       continue;
+       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+       if (!keys->d)
+               return -ENOMEM;
 
-               BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
 
                for_each_jset_key(k, _n, entry, &i->j)
-                       keys.d[keys.nr++] = (struct journal_key) {
+                       keys->d[keys->nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
-                               .journal_seq    = le64_to_cpu(i->j.seq) -
-                                       keys.journal_seq_base,
+                               .journal_seq    = le64_to_cpu(i->j.seq),
                                .journal_offset = k->_data - i->j._data,
                        };
        }
 
-       sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
+       sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
 
-       src = dst = keys.d;
-       while (src < keys.d + keys.nr) {
-               while (src + 1 < keys.d + keys.nr &&
+       src = dst = keys->d;
+       while (src < keys->d + keys->nr) {
+               while (src + 1 < keys->d + keys->nr &&
                       src[0].btree_id  == src[1].btree_id &&
                       src[0].level     == src[1].level &&
                       !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
@@ -458,9 +535,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                *dst++ = *src++;
        }
 
-       keys.nr = dst - keys.d;
-err:
-       return keys;
+       keys->nr = dst - keys->d;
+       keys->gap = keys->nr;
+       return 0;
 }
 
 /* journal replay: */
@@ -468,7 +545,8 @@ err:
 static void replay_now_at(struct journal *j, u64 seq)
 {
        BUG_ON(seq < j->replay_journal_seq);
-       BUG_ON(seq > j->replay_journal_seq_end);
+
+       seq = min(seq, j->replay_journal_seq_end);
 
        while (j->replay_journal_seq < seq)
                bch2_journal_pin_put(j, j->replay_journal_seq++);
@@ -519,6 +597,9 @@ static int bch2_journal_replay(struct bch_fs *c)
        size_t i;
        int ret;
 
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
        keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
        if (!keys_sorted)
                return -ENOMEM;
@@ -530,22 +611,19 @@ static int bch2_journal_replay(struct bch_fs *c)
             sizeof(keys_sorted[0]),
             journal_sort_seq_cmp, NULL);
 
-       if (keys->nr)
-               replay_now_at(j, keys->journal_seq_base);
-
        for (i = 0; i < keys->nr; i++) {
                k = keys_sorted[i];
 
                cond_resched();
 
-               if (!k->allocated)
-                       replay_now_at(j, keys->journal_seq_base + k->journal_seq);
+               replay_now_at(j, k->journal_seq);
 
                ret = bch2_trans_do(c, NULL, NULL,
                                    BTREE_INSERT_LAZY_RW|
                                    BTREE_INSERT_NOFAIL|
-                                   BTREE_INSERT_JOURNAL_RESERVED|
-                                   (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+                                   (!k->allocated
+                                    ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved
+                                    : 0),
                             bch2_journal_replay_key(&trans, k));
                if (ret) {
                        bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
@@ -560,6 +638,9 @@ static int bch2_journal_replay(struct bch_fs *c)
        bch2_journal_set_replay_done(j);
        bch2_journal_flush_all_pins(j);
        ret = bch2_journal_error(j);
+
+       if (keys->nr && !ret)
+               bch2_journal_log_msg(&c->journal, "journal replay finished");
 err:
        kvfree(keys_sorted);
        return ret;
@@ -630,7 +711,6 @@ static int journal_replay_entry_early(struct bch_fs *c,
                unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
 
                ca->usage_base->buckets_ec              = le64_to_cpu(u->buckets_ec);
-               ca->usage_base->buckets_unavailable     = le64_to_cpu(u->buckets_unavailable);
 
                for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
                        ca->usage_base->d[i].buckets    = le64_to_cpu(u->d[i].buckets);
@@ -670,10 +750,8 @@ static int journal_replay_entry_early(struct bch_fs *c,
 }
 
 static int journal_replay_early(struct bch_fs *c,
-                               struct bch_sb_field_clean *clean,
-                               struct list_head *journal)
+                               struct bch_sb_field_clean *clean)
 {
-       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
@@ -686,8 +764,13 @@ static int journal_replay_early(struct bch_fs *c,
                                return ret;
                }
        } else {
-               list_for_each_entry(i, journal, list) {
-                       if (i->ignore)
+               struct genradix_iter iter;
+               struct journal_replay *i, **_i;
+
+               genradix_for_each(&c->journal_entries, iter, _i) {
+                       i = *_i;
+
+                       if (!i || i->ignore)
                                continue;
 
                        vstruct_for_each(&i->j, entry) {
@@ -742,6 +825,8 @@ static int verify_superblock_clean(struct bch_fs *c,
 {
        unsigned i;
        struct bch_sb_field_clean *clean = *cleanp;
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
        int ret = 0;
 
        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
@@ -754,7 +839,6 @@ static int verify_superblock_clean(struct bch_fs *c,
        }
 
        for (i = 0; i < BTREE_ID_NR; i++) {
-               char buf1[200], buf2[200];
                struct bkey_i *k1, *k2;
                unsigned l1 = 0, l2 = 0;
 
@@ -764,6 +848,19 @@ static int verify_superblock_clean(struct bch_fs *c,
                if (!k1 && !k2)
                        continue;
 
+               printbuf_reset(&buf1);
+               printbuf_reset(&buf2);
+
+               if (k1)
+                       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+               else
+                       prt_printf(&buf1, "(none)");
+
+               if (k2)
+                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+               else
+                       prt_printf(&buf2, "(none)");
+
                mustfix_fsck_err_on(!k1 || !k2 ||
                                    IS_ERR(k1) ||
                                    IS_ERR(k2) ||
@@ -773,10 +870,12 @@ static int verify_superblock_clean(struct bch_fs *c,
                        "superblock btree root %u doesn't match journal after clean shutdown\n"
                        "sb:      l=%u %s\n"
                        "journal: l=%u %s\n", i,
-                       l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
-                       l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
+                       l1, buf1.buf,
+                       l2, buf2.buf);
        }
 fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
        return ret;
 }
 
@@ -803,7 +902,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
                return ERR_PTR(-ENOMEM);
        }
 
-       ret = bch2_sb_clean_validate(c, clean, READ);
+       ret = bch2_sb_clean_validate_late(c, clean, READ);
        if (ret) {
                mutex_unlock(&c->sb_lock);
                return ERR_PTR(ret);
@@ -817,6 +916,19 @@ fsck_err:
        return ERR_PTR(ret);
 }
 
+static bool btree_id_is_alloc(enum btree_id id)
+{
+       switch (id) {
+       case BTREE_ID_alloc:
+       case BTREE_ID_backpointers:
+       case BTREE_ID_need_discard:
+       case BTREE_ID_freespace:
+               return true;
+       default:
+               return false;
+       }
+}
+
 static int read_btree_roots(struct bch_fs *c)
 {
        unsigned i;
@@ -828,14 +940,14 @@ static int read_btree_roots(struct bch_fs *c)
                if (!r->alive)
                        continue;
 
-               if (i == BTREE_ID_alloc &&
+               if (btree_id_is_alloc(i) &&
                    c->opts.reconstruct_alloc) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                        continue;
                }
 
                if (r->error) {
-                       __fsck_err(c, i == BTREE_ID_alloc
+                       __fsck_err(c, btree_id_is_alloc(i)
                                   ? FSCK_CAN_IGNORE : 0,
                                   "invalid btree root %s",
                                   bch2_btree_ids[i]);
@@ -845,7 +957,8 @@ static int read_btree_roots(struct bch_fs *c)
 
                ret = bch2_btree_root_read(c, i, &r->key, r->level);
                if (ret) {
-                       __fsck_err(c, i == BTREE_ID_alloc
+                       __fsck_err(c,
+                                  btree_id_is_alloc(i)
                                   ? FSCK_CAN_IGNORE : 0,
                                   "error reading btree root %s",
                                   bch2_btree_ids[i]);
@@ -881,7 +994,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
        if (ret)
                return ret;
 
-
        bkey_subvolume_init(&root_volume.k_i);
        root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
        root_volume.v.flags     = 0;
@@ -974,28 +1086,24 @@ int bch2_fs_recovery(struct bch_fs *c)
                c->opts.fix_errors = FSCK_OPT_YES;
        }
 
-       if (!c->replicas.entries ||
-           c->opts.rebuild_replicas) {
-               bch_info(c, "building replicas info");
-               set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
-       }
-
        if (!c->opts.nochanges) {
-               if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
-                       bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
+               if (c->sb.version < bcachefs_metadata_version_backpointers) {
+                       bch_info(c, "version prior to backpointers, upgrade and fsck required");
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
                        c->opts.fix_errors      = FSCK_OPT_YES;
-               } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
-                       bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
-                       c->opts.version_upgrade = true;
-                       c->opts.fsck            = true;
-               } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
-                       bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
-                       c->opts.version_upgrade = true;
+               } else if (c->sb.version < bcachefs_metadata_version_inode_v3) {
+                       bch_info(c, "version prior to inode_v3, upgrade required");
+                       c->opts.version_upgrade = true;
                }
        }
 
+       if (c->opts.fsck && c->opts.norecovery) {
+               bch_err(c, "cannot select both norecovery and fsck");
+               ret = -EINVAL;
+               goto err;
+       }
+
        ret = bch2_blacklist_table_initialize(c);
        if (ret) {
                bch_err(c, "error initializing blacklist table");
@@ -1003,17 +1111,17 @@ int bch2_fs_recovery(struct bch_fs *c)
        }
 
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct journal_replay *i;
+               struct genradix_iter iter;
+               struct journal_replay **i;
 
                bch_verbose(c, "starting journal read");
-               ret = bch2_journal_read(c, &c->journal_entries,
-                                       &blacklist_seq, &journal_seq);
+               ret = bch2_journal_read(c, &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               list_for_each_entry_reverse(i, &c->journal_entries, list)
-                       if (!i->ignore) {
-                               last_journal_entry = &i->j;
+               genradix_for_each_reverse(&c->journal_entries, iter, i)
+                       if (*i && !(*i)->ignore) {
+                               last_journal_entry = &(*i)->j;
                                break;
                        }
 
@@ -1031,11 +1139,9 @@ int bch2_fs_recovery(struct bch_fs *c)
                        goto use_clean;
                }
 
-               c->journal_keys = journal_keys_sort(&c->journal_entries);
-               if (!c->journal_keys.d) {
-                       ret = -ENOMEM;
+               ret = journal_keys_sort(c);
+               if (ret)
                        goto err;
-               }
 
                if (c->sb.clean && last_journal_entry) {
                        ret = verify_superblock_clean(c, &clean,
@@ -1047,7 +1153,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 use_clean:
                if (!clean) {
                        bch_err(c, "no superblock clean section found");
-                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+                       ret = -BCH_ERR_fsck_repair_impossible;
                        goto err;
 
                }
@@ -1061,7 +1167,7 @@ use_clean:
 
        zero_out_btree_mem_ptr(&c->journal_keys);
 
-       ret = journal_replay_early(c, clean, &c->journal_entries);
+       ret = journal_replay_early(c, clean);
        if (ret)
                goto err;
 
@@ -1084,11 +1190,24 @@ use_clean:
                }
        }
 
-       ret = bch2_fs_journal_start(&c->journal, journal_seq,
-                                   &c->journal_entries);
+       /*
+        * note: cmd_list_journal needs the blacklist table fully up to date so
+        * it can asterisk ignored journal entries:
+        */
+       if (c->opts.read_journal_only)
+               goto out;
+
+       ret = bch2_fs_journal_start(&c->journal, journal_seq);
        if (ret)
                goto err;
 
+       /*
+        * Skip past versions that might have possibly been used (as nonces),
+        * but hadn't had their pointers written:
+        */
+       if (c->sb.encryption_type && !c->sb.clean)
+               atomic64_add(1 << 16, &c->key_version);
+
        ret = read_btree_roots(c);
        if (ret)
                goto err;
@@ -1097,7 +1216,7 @@ use_clean:
        err = "error reading allocation information";
 
        down_read(&c->gc_lock);
-       ret = bch2_alloc_read(c, false, false);
+       ret = bch2_alloc_read(c);
        up_read(&c->gc_lock);
 
        if (ret)
@@ -1111,51 +1230,98 @@ use_clean:
                goto err;
        bch_verbose(c, "stripes_read done");
 
-       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
-       /*
-        * If we're not running fsck, this ensures bch2_fsck_err() calls are
-        * instead interpreted as bch2_inconsistent_err() calls:
-        */
-       if (!c->opts.fsck)
-               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+       bch2_stripes_heap_start(c);
 
-       if (c->opts.fsck ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
-           test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+       if (c->opts.fsck) {
                bool metadata_only = c->opts.norecovery;
 
                bch_info(c, "checking allocations");
-               err = "error in mark and sweep";
+               err = "error checking allocations";
                ret = bch2_gc(c, true, metadata_only);
                if (ret)
                        goto err;
                bch_verbose(c, "done checking allocations");
-       }
 
-       bch2_stripes_heap_start(c);
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
 
-       clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
-       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+               bch_info(c, "checking need_discard and freespace btrees");
+               err = "error checking need_discard and freespace btrees";
+               ret = bch2_check_alloc_info(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking need_discard and freespace btrees");
 
-       /*
-        * Skip past versions that might have possibly been used (as nonces),
-        * but hadn't had their pointers written:
-        */
-       if (c->sb.encryption_type && !c->sb.clean)
-               atomic64_add(1 << 16, &c->key_version);
+               set_bit(BCH_FS_MAY_GO_RW, &c->flags);
 
-       if (c->opts.norecovery)
-               goto out;
+               bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+               err = "journal replay failed";
+               ret = bch2_journal_replay(c);
+               if (ret)
+                       goto err;
+               if (c->opts.verbose || !c->sb.clean)
+                       bch_info(c, "journal replay done");
+
+               bch_info(c, "checking lrus");
+               err = "error checking lrus";
+               ret = bch2_check_lrus(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking lrus");
+               set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+
+               bch_info(c, "checking backpointers to alloc keys");
+               err = "error checking backpointers to alloc keys";
+               ret = bch2_check_btree_backpointers(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking backpointers to alloc keys");
+
+               bch_info(c, "checking backpointers to extents");
+               err = "error checking backpointers to extents";
+               ret = bch2_check_backpointers_to_extents(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking backpointers to extents");
+
+               bch_info(c, "checking extents to backpointers");
+               err = "error checking extents to backpointers";
+               ret = bch2_check_extents_to_backpointers(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking extents to backpointers");
+               set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+
+               bch_info(c, "checking alloc to lru refs");
+               err = "error checking alloc to lru refs";
+               ret = bch2_check_alloc_to_lru_refs(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking alloc to lru refs");
+               set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+       } else {
+               set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+               if (c->opts.norecovery)
+                       goto out;
+
+               bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+               err = "journal replay failed";
+               ret = bch2_journal_replay(c);
+               if (ret)
+                       goto err;
+               if (c->opts.verbose || !c->sb.clean)
+                       bch_info(c, "journal replay done");
+       }
 
-       bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
-       err = "journal replay failed";
-       ret = bch2_journal_replay(c);
+       err = "error initializing freespace";
+       ret = bch2_fs_freespace_init(c);
        if (ret)
                goto err;
-       if (c->opts.verbose || !c->sb.clean)
-               bch_info(c, "journal replay done");
 
        if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
                bch2_fs_lazy_rw(c);
@@ -1259,13 +1425,19 @@ out:
 
        if (!c->opts.keep_journal) {
                bch2_journal_keys_free(&c->journal_keys);
-               bch2_journal_entries_free(&c->journal_entries);
+               bch2_journal_entries_free(c);
        }
        kfree(clean);
+
+       if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+               bch2_fs_read_write_early(c);
+               bch2_delete_dead_snapshots_async(c);
+       }
+
        if (ret)
-               bch_err(c, "Error in recovery: %s (%i)", err, ret);
+               bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret));
        else
-               bch_verbose(c, "ret %i", ret);
+               bch_verbose(c, "ret %s", bch2_err_str(ret));
        return ret;
 err:
 fsck_err:
@@ -1280,7 +1452,6 @@ int bch2_fs_initialize(struct bch_fs *c)
        struct qstr lostfound = QSTR("lost+found");
        const char *err = "cannot allocate memory";
        struct bch_dev *ca;
-       LIST_HEAD(journal);
        unsigned i;
        int ret;
 
@@ -1290,6 +1461,9 @@ int bch2_fs_initialize(struct bch_fs *c)
        c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
        c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
 
+       if (c->sb.version < bcachefs_metadata_version_inode_v3)
+               c->opts.version_upgrade = true;
+
        if (c->opts.version_upgrade) {
                c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
                c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
@@ -1297,13 +1471,16 @@ int bch2_fs_initialize(struct bch_fs *c)
        }
        mutex_unlock(&c->sb_lock);
 
-       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
        set_bit(BCH_FS_FSCK_DONE, &c->flags);
 
        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);
 
+       for_each_online_member(ca, c, i)
+               bch2_dev_usage_init(ca);
+
        err = "unable to allocate journal buckets";
        for_each_online_member(ca, c, i) {
                ret = bch2_dev_journal_alloc(ca);
@@ -1317,7 +1494,7 @@ int bch2_fs_initialize(struct bch_fs *c)
         * journal_res_get() will crash if called before this has
         * set up the journal.pin FIFO and journal.cur pointer:
         */
-       bch2_fs_journal_start(&c->journal, 1, &journal);
+       bch2_fs_journal_start(&c->journal, 1);
        bch2_journal_set_replay_done(&c->journal);
 
        err = "error going read-write";
@@ -1329,6 +1506,7 @@ int bch2_fs_initialize(struct bch_fs *c)
         * Write out the superblock and journal buckets, now that we can do
         * btree updates
         */
+       bch_verbose(c, "marking superblocks");
        err = "error marking superblock and journal";
        for_each_member_device(ca, c, i) {
                ret = bch2_trans_mark_dev_sb(c, ca);
@@ -1340,6 +1518,12 @@ int bch2_fs_initialize(struct bch_fs *c)
                ca->new_fs_bucket_idx = 0;
        }
 
+       bch_verbose(c, "initializing freespace");
+       err = "error initializing freespace";
+       ret = bch2_fs_freespace_init(c);
+       if (ret)
+               goto err;
+
        err = "error creating root snapshot node";
        ret = bch2_fs_initialize_subvolumes(c);
        if (ret)
@@ -1356,7 +1540,7 @@ int bch2_fs_initialize(struct bch_fs *c)
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum      = BCACHEFS_ROOT_INO;
        root_inode.bi_subvol    = BCACHEFS_ROOT_SUBVOL;
-       bch2_inode_pack(c, &packed_inode, &root_inode);
+       bch2_inode_pack(&packed_inode, &root_inode);
        packed_inode.inode.k.p.snapshot = U32_MAX;
 
        err = "error creating root directory";
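
The recovery changes above replace the old list of journal entries with a genradix: c->journal_entries is now an on-demand radix tree of struct journal_replay pointers, scanned in reverse with genradix_for_each_reverse(), with NULL slots standing in for entries that were never read. A minimal sketch of that access pattern, assuming the bcachefs headers that define struct bch_fs, struct journal_replay and the genradix iterator macros (the helper name is made up for illustration):

	/* Return the newest journal entry that was not marked ignored. */
	static struct jset *newest_good_journal_entry(struct bch_fs *c)
	{
		struct genradix_iter iter;
		struct journal_replay **i;

		/* Slots may be NULL, so check *i before dereferencing: */
		genradix_for_each_reverse(&c->journal_entries, iter, i)
			if (*i && !(*i)->ignore)
				return &(*i)->j;

		return NULL;
	}
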
index 21bdad9db2493668b0f5c2a6bcf6f2913315b8db..8c0348e8b84cf00de92fcfebc588a64957d2253a 100644 (file)
@@ -2,9 +2,6 @@
 #ifndef _BCACHEFS_RECOVERY_H
 #define _BCACHEFS_RECOVERY_H
 
-#define for_each_journal_key(keys, i)                          \
-       for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
-
 struct journal_iter {
        struct list_head        list;
        enum btree_id           btree_id;
@@ -23,16 +20,14 @@ struct btree_and_journal_iter {
        struct bkey             unpacked;
 
        struct journal_iter     journal;
-
-       enum last_key_returned {
-               none,
-               btree,
-               journal,
-       }                       last;
+       struct bpos             pos;
+       bool                    at_end;
 };
 
-size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
-                              unsigned, struct bpos);
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+                               unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+                                          unsigned, struct bpos);
 
 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
                                 unsigned, struct bkey_i *);
@@ -45,7 +40,6 @@ void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
 
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
 void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
@@ -56,7 +50,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
                                                struct btree *);
 
 void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct list_head *);
+void bch2_journal_entries_free(struct bch_fs *);
 
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);
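
The header now exports bch2_journal_keys_peek_upto() and bch2_journal_keys_peek_slot() in place of the old bch2_journal_key_search(), and btree_and_journal_iter tracks a plain pos/at_end pair instead of the last-key-returned enum. A rough usage sketch of the peek interface; the parameter meanings (level, start and end positions, and the size_t cursor) are inferred from the prototypes above rather than documented here:

	u64 inum = 4096;
	struct bpos start = POS(inum, 0), end = POS(inum, U64_MAX);
	size_t idx = 0;
	struct bkey_i *k;

	/* Peek the next not-yet-replayed journal key for this btree in [start, end]: */
	k = bch2_journal_keys_peek_upto(c, BTREE_ID_extents, 0, start, end, &idx);
	if (k)
		pr_info("pending journal key at %llu:%llu\n",
			k->k.p.inode, k->k.p.offset);
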
index c8d6d73681e010c0ec00221a7a473070c4680b0a..d5c14bb2992d5d7fc4281a207140861a0cdefd1f 100644 (file)
@@ -25,18 +25,25 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
 
 /* reflink pointers */
 
-const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          int rw, struct printbuf *err)
 {
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
 
-       if (bkey_val_bytes(p.k) != sizeof(*p.v))
-               return "incorrect value size";
+       if (bkey_val_bytes(p.k) != sizeof(*p.v)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(p.k), sizeof(*p.v));
+               return -EINVAL;
+       }
 
        if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
-           le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
-               return "idx < front_pad";
+           le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
+               prt_printf(err, "idx < front_pad (%llu < %u)",
+                      le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
@@ -44,7 +51,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
 
-       pr_buf(out, "idx %llu front_pad %u back_pad %u",
+       prt_printf(out, "idx %llu front_pad %u back_pad %u",
               le64_to_cpu(p.v->idx),
               le32_to_cpu(p.v->front_pad),
               le32_to_cpu(p.v->back_pad));
@@ -70,14 +77,18 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
 /* indirect extents */
 
-const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          int rw, struct printbuf *err)
 {
        struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
 
-       if (bkey_val_bytes(r.k) < sizeof(*r.v))
-               return "incorrect value size";
+       if (bkey_val_bytes(r.k) < sizeof(*r.v)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(r.k), sizeof(*r.v));
+               return -EINVAL;
+       }
 
-       return bch2_bkey_ptrs_invalid(c, k);
+       return bch2_bkey_ptrs_invalid(c, k, rw, err);
 }
 
 void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
@@ -85,7 +96,7 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
 
-       pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+       prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
 
        bch2_bkey_ptrs_to_text(out, c, k);
 }
@@ -98,14 +109,37 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
        return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
 }
 
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c old, struct bkey_i *new,
+                             unsigned flags)
+{
+       if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+               struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
+
+               if (!r->v.refcount) {
+                       r->k.type = KEY_TYPE_deleted;
+                       r->k.size = 0;
+                       set_bkey_val_u64s(&r->k, 0);
+                       return 0;
+               }
+       }
+
+       return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+}
+
 /* indirect inline data */
 
-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
-                                             struct bkey_s_c k)
+int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                                     int rw, struct printbuf *err)
 {
-       if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data))
-               return "incorrect value size";
-       return NULL;
+       if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data));
+               return -EINVAL;
+       }
+
+       return 0;
 }
 
 void bch2_indirect_inline_data_to_text(struct printbuf *out,
@@ -114,11 +148,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
        struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
        unsigned datalen = bkey_inline_data_bytes(k.k);
 
-       pr_buf(out, "refcount %llu datalen %u: %*phN",
+       prt_printf(out, "refcount %llu datalen %u: %*phN",
               le64_to_cpu(d.v->refcount), datalen,
               min(datalen, 32U), d.v->data);
 }
 
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c old, struct bkey_i *new,
+                             unsigned flags)
+{
+       if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+               struct bkey_i_indirect_inline_data *r =
+                       bkey_i_to_indirect_inline_data(new);
+
+               if (!r->v.refcount) {
+                       r->k.type = KEY_TYPE_deleted;
+                       r->k.size = 0;
+                       set_bkey_val_u64s(&r->k, 0);
+               }
+       }
+
+       return 0;
+}
+
 static int bch2_make_extent_indirect(struct btree_trans *trans,
                                     struct btree_iter *extent_iter,
                                     struct bkey_i *orig)
@@ -229,7 +282,7 @@ s64 bch2_remap_range(struct bch_fs *c,
        u32 dst_snapshot, src_snapshot;
        int ret = 0, ret2 = 0;
 
-       if (!percpu_ref_tryget(&c->writes))
+       if (!percpu_ref_tryget_live(&c->writes))
                return -EROFS;
 
        bch2_check_set_feature(c, BCH_FEATURE_reflink);
@@ -246,7 +299,8 @@ s64 bch2_remap_range(struct bch_fs *c,
        bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
                             BTREE_ITER_INTENT);
 
-       while ((ret == 0 || ret == -EINTR) &&
+       while ((ret == 0 ||
+               bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
               bkey_cmp(dst_iter.pos, dst_end) < 0) {
                struct disk_reservation disk_res = { 0 };
 
@@ -356,7 +410,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                }
 
                bch2_trans_iter_exit(&trans, &inode_iter);
-       } while (ret2 == -EINTR);
+       } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
 
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&new_src, c);
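
The ->key_invalid implementations above now return an int and report the problem through a struct printbuf instead of returning a const char * message. A sketch of the new convention for a hypothetical key type (struct bch_example_val and the function name are placeholders; the prt_printf() call and -EINVAL return mirror the reflink checks above):

	int bch2_example_invalid(const struct bch_fs *c, struct bkey_s_c k,
				 int rw, struct printbuf *err)
	{
		if (bkey_val_bytes(k.k) < sizeof(struct bch_example_val)) {
			prt_printf(err, "incorrect value size (%zu < %zu)",
				   bkey_val_bytes(k.k), sizeof(struct bch_example_val));
			return -EINVAL;
		}

		return 0;
	}
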
index 3745873fd88d90947f610de256931cecec4d9181..f9848dc3eebbaeb770048d6c375d0a829a0f0d64 100644 (file)
@@ -2,7 +2,8 @@
 #ifndef _BCACHEFS_REFLINK_H
 #define _BCACHEFS_REFLINK_H
 
-const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+                          int, struct printbuf *);
 void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -10,27 +11,39 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 #define bch2_bkey_ops_reflink_p (struct bkey_ops) {            \
        .key_invalid    = bch2_reflink_p_invalid,               \
        .val_to_text    = bch2_reflink_p_to_text,               \
-       .key_merge      = bch2_reflink_p_merge,         \
+       .key_merge      = bch2_reflink_p_merge,                 \
+       .trans_trigger  = bch2_trans_mark_reflink_p,            \
+       .atomic_trigger = bch2_mark_reflink_p,                  \
 }
 
-const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+                          int, struct printbuf *);
 void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+                             struct bkey_s_c, struct bkey_i *, unsigned);
 
 #define bch2_bkey_ops_reflink_v (struct bkey_ops) {            \
        .key_invalid    = bch2_reflink_v_invalid,               \
        .val_to_text    = bch2_reflink_v_to_text,               \
        .swab           = bch2_ptr_swab,                        \
+       .trans_trigger  = bch2_trans_mark_reflink_v,            \
+       .atomic_trigger = bch2_mark_extent,                     \
 }
 
-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,
-                                             struct bkey_s_c);
+int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+                                     int, struct printbuf *);
 void bch2_indirect_inline_data_to_text(struct printbuf *,
                                struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+                                        enum btree_id, unsigned,
+                             struct bkey_s_c, struct bkey_i *,
+                             unsigned);
 
 #define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \
        .key_invalid    = bch2_indirect_inline_data_invalid,    \
        .val_to_text    = bch2_indirect_inline_data_to_text,    \
+       .trans_trigger  = bch2_trans_mark_indirect_inline_data, \
 }
 
 static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
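
The bkey_ops definitions above now wire up .trans_trigger and .atomic_trigger next to .key_invalid and .val_to_text. The same shape for a hypothetical key type, with every bch2_example_* name a placeholder for the per-type callbacks:

	#define bch2_bkey_ops_example (struct bkey_ops) {		\
		.key_invalid	= bch2_example_invalid,			\
		.val_to_text	= bch2_example_to_text,			\
		.trans_trigger	= bch2_trans_mark_example,		\
		.atomic_trigger	= bch2_mark_example,			\
	}
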
index 96994b7a75a555699fc0aa2c2745f8353cdeeaba..fcf73d72303505f760d56949cda3e7a4e2383114 100644 (file)
@@ -36,20 +36,36 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
 }
 
+void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+                                   struct bch_replicas_entry_v0 *e)
+{
+       unsigned i;
+
+       if (e->data_type < BCH_DATA_NR)
+               prt_printf(out, "%s", bch2_data_types[e->data_type]);
+       else
+               prt_printf(out, "(invalid data type %u)", e->data_type);
+
+       prt_printf(out, ": %u [", e->nr_devs);
+       for (i = 0; i < e->nr_devs; i++)
+               prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+       prt_printf(out, "]");
+}
+
 void bch2_replicas_entry_to_text(struct printbuf *out,
                                 struct bch_replicas_entry *e)
 {
        unsigned i;
 
        if (e->data_type < BCH_DATA_NR)
-               pr_buf(out, "%s", bch2_data_types[e->data_type]);
+               prt_printf(out, "%s", bch2_data_types[e->data_type]);
        else
-               pr_buf(out, "(invalid data type %u)", e->data_type);
+               prt_printf(out, "(invalid data type %u)", e->data_type);
 
-       pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+       prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
        for (i = 0; i < e->nr_devs; i++)
-               pr_buf(out, i ? " %u" : "%u", e->devs[i]);
-       pr_buf(out, "]");
+               prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+       prt_printf(out, "]");
 }
 
 void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -60,7 +76,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
 
        for_each_cpu_replicas_entry(r, e) {
                if (!first)
-                       pr_buf(out, " ");
+                       prt_printf(out, " ");
                first = false;
 
                bch2_replicas_entry_to_text(out, e);
@@ -462,7 +478,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
                    bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
                        n = cpu_replicas_add_entry(&c->replicas_gc, e);
                        if (!n.entries) {
-                               ret = -ENOSPC;
+                               ret = -ENOMEM;
                                goto err;
                        }
 
@@ -471,10 +487,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
                }
        }
 
-       if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
-               ret = -ENOSPC;
+       ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+       if (ret)
                goto err;
-       }
 
        ret = replicas_table_update(c, &c->replicas_gc);
 err:
@@ -577,10 +592,9 @@ retry:
 
        bch2_cpu_replicas_sort(&new);
 
-       if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
-               ret = -ENOSPC;
+       ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
+       if (ret)
                goto err;
-       }
 
        ret = replicas_table_update(c, &new);
 err:
@@ -735,7 +749,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
        sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
-               return -ENOSPC;
+               return -BCH_ERR_ENOSPC_sb_replicas;
 
        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
        sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
@@ -780,7 +794,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
        sb_r = bch2_sb_resize_replicas(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
-               return -ENOSPC;
+               return -BCH_ERR_ENOSPC_sb_replicas;
 
        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
        sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
@@ -818,27 +832,27 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
                        cpu_replicas_entry(cpu_r, i);
 
                if (e->data_type >= BCH_DATA_NR) {
-                       pr_buf(err, "invalid data type in entry ");
+                       prt_printf(err, "invalid data type in entry ");
                        bch2_replicas_entry_to_text(err, e);
                        return -EINVAL;
                }
 
                if (!e->nr_devs) {
-                       pr_buf(err, "no devices in entry ");
+                       prt_printf(err, "no devices in entry ");
                        bch2_replicas_entry_to_text(err, e);
                        return -EINVAL;
                }
 
                if (e->nr_required > 1 &&
                    e->nr_required >= e->nr_devs) {
-                       pr_buf(err, "bad nr_required in entry ");
+                       prt_printf(err, "bad nr_required in entry ");
                        bch2_replicas_entry_to_text(err, e);
                        return -EINVAL;
                }
 
                for (j = 0; j < e->nr_devs; j++)
                        if (!bch2_dev_exists(sb, mi, e->devs[j])) {
-                               pr_buf(err, "invalid device %u in entry ", e->devs[j]);
+                               prt_printf(err, "invalid device %u in entry ", e->devs[j]);
                                bch2_replicas_entry_to_text(err, e);
                                return -EINVAL;
                        }
@@ -850,7 +864,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
                        BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
 
                        if (!memcmp(e, n, cpu_r->entry_size)) {
-                               pr_buf(err, "duplicate replicas entry ");
+                               prt_printf(err, "duplicate replicas entry ");
                                bch2_replicas_entry_to_text(err, e);
                                return -EINVAL;
                        }
@@ -860,7 +874,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
        return 0;
 }
 
-static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f,
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
                                     struct printbuf *err)
 {
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
@@ -885,19 +899,20 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
 
        for_each_replicas_entry(r, e) {
                if (!first)
-                       pr_buf(out, " ");
+                       prt_printf(out, " ");
                first = false;
 
                bch2_replicas_entry_to_text(out, e);
        }
+       prt_newline(out);
 }
 
 const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
-       .validate       = bch2_sb_validate_replicas,
+       .validate       = bch2_sb_replicas_validate,
        .to_text        = bch2_sb_replicas_to_text,
 };
 
-static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
                                        struct printbuf *err)
 {
        struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
@@ -912,8 +927,27 @@ static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *
        return ret;
 }
 
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+                                       struct bch_sb *sb,
+                                       struct bch_sb_field *f)
+{
+       struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+       struct bch_replicas_entry_v0 *e;
+       bool first = true;
+
+       for_each_replicas_entry(sb_r, e) {
+               if (!first)
+                       prt_printf(out, " ");
+               first = false;
+
+               bch2_replicas_entry_v0_to_text(out, e);
+       }
+       prt_newline(out);
+}
+
 const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
-       .validate       = bch2_sb_validate_replicas_v0,
+       .validate       = bch2_sb_replicas_v0_validate,
+       .to_text        = bch2_sb_replicas_v0_to_text,
 };
 
 /* Query replicas: */
@@ -954,11 +988,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
 
                if (dflags & ~flags) {
                        if (print) {
-                               char buf[100];
+                               struct printbuf buf = PRINTBUF;
 
-                               bch2_replicas_entry_to_text(&PBUF(buf), e);
+                               bch2_replicas_entry_to_text(&buf, e);
                                bch_err(c, "insufficient devices online (%u) for replicas entry %s",
-                                       nr_online, buf);
+                                       nr_online, buf.buf);
+                               printbuf_exit(&buf);
                        }
                        ret = false;
                        break;
@@ -970,19 +1005,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
        return ret;
 }
 
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
 {
-       struct bch_replicas_entry *e;
-       unsigned i, ret = 0;
+       struct bch_sb_field_replicas *replicas;
+       struct bch_sb_field_replicas_v0 *replicas_v0;
+       unsigned i, data_has = 0;
+
+       replicas = bch2_sb_get_replicas(sb);
+       replicas_v0 = bch2_sb_get_replicas_v0(sb);
+
+       if (replicas) {
+               struct bch_replicas_entry *r;
+
+               for_each_replicas_entry(replicas, r)
+                       for (i = 0; i < r->nr_devs; i++)
+                               if (r->devs[i] == dev)
+                                       data_has |= 1 << r->data_type;
+       } else if (replicas_v0) {
+               struct bch_replicas_entry_v0 *r;
+
+               for_each_replicas_entry_v0(replicas_v0, r)
+                       for (i = 0; i < r->nr_devs; i++)
+                               if (r->devs[i] == dev)
+                                       data_has |= 1 << r->data_type;
+       }
 
-       percpu_down_read(&c->mark_lock);
 
-       for_each_cpu_replicas_entry(&c->replicas, e)
-               for (i = 0; i < e->nr_devs; i++)
-                       if (e->devs[i] == ca->dev_idx)
-                               ret |= 1 << e->data_type;
+       return data_has;
+}
 
-       percpu_up_read(&c->mark_lock);
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+       unsigned ret;
+
+       mutex_lock(&c->sb_lock);
+       ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+       mutex_unlock(&c->sb_lock);
 
        return ret;
 }
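
bch2_have_enough_devs() above trades the fixed char buf[100] and PBUF() for a heap-backed struct printbuf, a pattern this release uses throughout: initialize with PRINTBUF, print into buf.buf, and release with printbuf_exit() on every exit path. A minimal sketch of the pattern, mirroring the replicas-entry error report above:

	struct printbuf buf = PRINTBUF;

	bch2_replicas_entry_to_text(&buf, e);
	bch_err(c, "insufficient devices online (%u) for replicas entry %s",
		nr_online, buf.buf);
	printbuf_exit(&buf);
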
index d237d7c51ccb9b9faa771e72ba123fb505914c16..cc34b3809206fb1f5666ba41ad42b7958a39e307 100644 (file)
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_REPLICAS_H
 #define _BCACHEFS_REPLICAS_H
 
+#include "bkey.h"
 #include "eytzinger.h"
 #include "replicas_types.h"
 
@@ -64,6 +65,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
 bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
                           unsigned, bool);
 
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
 unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
 
 int bch2_replicas_gc_end(struct bch_fs *, int);
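
bch2_dev_has_data() now answers from the superblock's replicas sections (via the new bch2_sb_dev_has_data()) under sb_lock rather than from the in-memory CPU replicas table, and it returns a bitmask of BCH_DATA_* types present on the device. A hedged sketch of how a caller might use that mask, for instance before removing a device; the mask test is illustrative and not taken from this diff:

	unsigned data = bch2_dev_has_data(c, ca);

	/* Anything besides superblock and journal data still live on this device? */
	if (data & ~(BIT(BCH_DATA_sb)|BIT(BCH_DATA_journal)))
		return -EBUSY;
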
index c062edb3fbc24e6fd5889d1ac138b3e2c10ab9db..dc1a27cc31cd4de56cdc2e44026e4be7b789c34f 100644 (file)
@@ -160,7 +160,7 @@ u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
 
        r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
        memset(ctx, 0, sizeof(*ctx));
-       return (r);
+       return r;
 }
 
 u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
index 57d636740d2f81ffa8a19adf9dd3d3f036746a65..6178ae620ff1fdb8c1999cdbd5664538fc9a7578 100644 (file)
@@ -144,7 +144,9 @@ struct bch_hash_desc {
 static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
 {
        return k.k->type == desc.key_type &&
-               (!desc.is_visible || desc.is_visible(inum, k));
+               (!desc.is_visible ||
+                !inum.inum ||
+                desc.is_visible(inum, k));
 }
 
 static __always_inline int
@@ -163,12 +165,10 @@ bch2_hash_lookup(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+       for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
                           SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+                          POS(inum.inum, U64_MAX),
                           BTREE_ITER_SLOTS|flags, k, ret) {
-               if (iter->pos.inode != inum.inum)
-                       break;
-
                if (is_visible_key(desc, inum, k)) {
                        if (!desc.cmp_key(k, key))
                                return 0;
@@ -199,18 +199,15 @@ bch2_hash_hole(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+       for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
                           SPOS(inum.inum, desc.hash_key(info, key), snapshot),
-                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter->pos.inode != inum.inum)
-                       break;
-
+                          POS(inum.inum, U64_MAX),
+                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
                if (!is_visible_key(desc, inum, k))
                        return 0;
-       }
        bch2_trans_iter_exit(trans, iter);
 
-       return ret ?: -ENOSPC;
+       return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
 }
 
 static __always_inline
@@ -244,30 +241,25 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
 }
 
 static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
-                 const struct bch_hash_desc desc,
-                 const struct bch_hash_info *info,
-                 subvol_inum inum,
-                 struct bkey_i *insert, int flags)
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+                          const struct bch_hash_desc desc,
+                          const struct bch_hash_info *info,
+                          subvol_inum inum, u32 snapshot,
+                          struct bkey_i *insert,
+                          int flags,
+                          int update_flags)
 {
        struct btree_iter iter, slot = { NULL };
        struct bkey_s_c k;
        bool found = false;
-       u32 snapshot;
        int ret;
 
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               return ret;
-
-       for_each_btree_key_norestart(trans, iter, desc.btree_id,
-                          SPOS(inum.inum,
+       for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+                          SPOS(insert->k.p.inode,
                                desc.hash_bkey(info, bkey_i_to_s_c(insert)),
                                snapshot),
+                          POS(insert->k.p.inode, U64_MAX),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter.pos.inode != inum.inum)
-                       break;
-
                if (is_visible_key(desc, inum, k)) {
                        if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
                                goto found;
@@ -285,7 +277,7 @@ int bch2_hash_set(struct btree_trans *trans,
        }
 
        if (!ret)
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_str_hash_create;
 out:
        bch2_trans_iter_exit(trans, &slot);
        bch2_trans_iter_exit(trans, &iter);
@@ -310,6 +302,26 @@ not_found:
        goto out;
 }
 
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+                 const struct bch_hash_desc desc,
+                 const struct bch_hash_info *info,
+                 subvol_inum inum,
+                 struct bkey_i *insert, int flags)
+{
+       u32 snapshot;
+       int ret;
+
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       insert->k.p.inode = inum.inum;
+
+       return bch2_hash_set_snapshot(trans, desc, info, inum,
+                                     snapshot, insert, flags, 0);
+}
+
 static __always_inline
 int bch2_hash_delete_at(struct btree_trans *trans,
                        const struct bch_hash_desc desc,
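
bch2_hash_lookup(), bch2_hash_hole() and the new bch2_hash_set_snapshot() above all switch to for_each_btree_key_upto_norestart() with an explicit POS(inum.inum, U64_MAX) end position, so iteration stops at the inode boundary by construction instead of relying on the old "iter->pos.inode != inum.inum" break. A rough fragment showing the caller-side shape of such an upper-bounded walk (surrounding declarations and the iterator cleanup are elided):

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
			SPOS(inum.inum, 0, snapshot),
			POS(inum.inum, U64_MAX),
			0, k, ret) {
		/* only keys with inode == inum.inum are visited here */
	}
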
index 69603327d93df6587f4e8713d249c240c8bc1fde..8c98bacca290b8301f421a16707eebf811e32126 100644 (file)
@@ -3,21 +3,19 @@
 #include "bcachefs.h"
 #include "btree_key_cache.h"
 #include "btree_update.h"
+#include "errcode.h"
 #include "error.h"
 #include "fs.h"
 #include "subvolume.h"
 
 /* Snapshot tree: */
 
-static void bch2_delete_dead_snapshots_work(struct work_struct *);
-static void bch2_delete_dead_snapshots(struct bch_fs *);
-
 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
                           struct bkey_s_c k)
 {
        struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
 
-       pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+       prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
               BCH_SNAPSHOT_SUBVOL(s.v),
               BCH_SNAPSHOT_DELETED(s.v),
               le32_to_cpu(s.v->parent),
@@ -26,39 +24,55 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
               le32_to_cpu(s.v->subvol));
 }
 
-const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
 {
        struct bkey_s_c_snapshot s;
        u32 i, id;
 
        if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
-           bkey_cmp(k.k->p, POS(0, 1)) < 0)
-               return "bad pos";
+           bkey_cmp(k.k->p, POS(0, 1)) < 0) {
+               prt_printf(err, "bad pos");
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
-               return "bad val size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
+               prt_printf(err, "bad val size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
+               return -EINVAL;
+       }
 
        s = bkey_s_c_to_snapshot(k);
 
        id = le32_to_cpu(s.v->parent);
-       if (id && id <= k.k->p.offset)
-               return "bad parent node";
+       if (id && id <= k.k->p.offset) {
+               prt_printf(err, "bad parent node (%u <= %llu)",
+                      id, k.k->p.offset);
+               return -EINVAL;
+       }
 
-       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
-               return "children not normalized";
+       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
+               prt_printf(err, "children not normalized");
+               return -EINVAL;
+       }
 
        if (s.v->children[0] &&
-           s.v->children[0] == s.v->children[1])
-               return "duplicate child nodes";
+           s.v->children[0] == s.v->children[1]) {
+               prt_printf(err, "duplicate child nodes");
+               return -EINVAL;
+       }
 
        for (i = 0; i < 2; i++) {
                id = le32_to_cpu(s.v->children[i]);
 
-               if (id >= k.k->p.offset)
-                       return "bad child node";
+               if (id >= k.k->p.offset) {
+                       prt_printf(err, "bad child node (%u >= %llu)",
+                              id, k.k->p.offset);
+                       return -EINVAL;
+               }
        }
 
-       return NULL;
+       return 0;
 }
 
 int bch2_mark_snapshot(struct btree_trans *trans,
@@ -118,7 +132,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id)
        if (!id)
                return 0;
 
-       ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+       ret = snapshot_lookup(trans, id, &v);
        if (ret == -ENOENT)
                bch_err(trans->c, "snapshot node %u not found", id);
        if (ret)
@@ -127,156 +141,206 @@ static int snapshot_live(struct btree_trans *trans, u32 id)
        return !BCH_SNAPSHOT_DELETED(&v);
 }
 
-static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       unsigned i, nr_live = 0, live_idx = 0;
        struct bkey_s_c_snapshot snap;
-       unsigned i;
-       int ret;
+       u32 id = k.k->p.offset, child[2];
 
-       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               u32 id = k.k->p.offset, child[2];
-               unsigned nr_live = 0, live_idx;
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
 
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
+       snap = bkey_s_c_to_snapshot(k);
 
-               snap = bkey_s_c_to_snapshot(k);
-               child[0] = le32_to_cpu(snap.v->children[0]);
-               child[1] = le32_to_cpu(snap.v->children[1]);
+       child[0] = le32_to_cpu(snap.v->children[0]);
+       child[1] = le32_to_cpu(snap.v->children[1]);
 
-               for (i = 0; i < 2; i++) {
-                       ret = snapshot_live(trans, child[i]);
-                       if (ret < 0)
-                               break;
-
-                       if (ret)
-                               live_idx = i;
-                       nr_live += ret;
-               }
+       for (i = 0; i < 2; i++) {
+               int ret = snapshot_live(trans, child[i]);
+               if (ret < 0)
+                       return ret;
 
-               snapshot_t(c, id)->equiv = nr_live == 1
-                       ? snapshot_t(c, child[live_idx])->equiv
-                       : id;
+               if (ret)
+                       live_idx = i;
+               nr_live += ret;
        }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret)
-               bch_err(c, "error walking snapshots: %i", ret);
 
-       return ret;
+       snapshot_t(c, id)->equiv = nr_live == 1
+               ? snapshot_t(c, child[live_idx])->equiv
+               : id;
+       return 0;
 }
 
 /* fsck: */
-static int bch2_snapshot_check(struct btree_trans *trans,
-                              struct bkey_s_c_snapshot s)
+static int check_snapshot(struct btree_trans *trans,
+                         struct btree_iter *iter,
+                         struct bkey_s_c k)
 {
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c_snapshot s;
        struct bch_subvolume subvol;
        struct bch_snapshot v;
+       struct printbuf buf = PRINTBUF;
+       bool should_have_subvol;
        u32 i, id;
-       int ret;
-
-       id = le32_to_cpu(s.v->subvol);
-       ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
-       if (ret == -ENOENT)
-               bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
-                       s.k->p.offset, id);
-       if (ret)
-               return ret;
+       int ret = 0;
 
-       if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
-               bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
-                       s.k->p.offset);
-               return -EINVAL;
-       }
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
 
+       s = bkey_s_c_to_snapshot(k);
        id = le32_to_cpu(s.v->parent);
        if (id) {
-               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               ret = snapshot_lookup(trans, id, &v);
                if (ret == -ENOENT)
-                       bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
-                               s.k->p.offset, id);
+                       bch_err(c, "snapshot with nonexistent parent:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
                if (ret)
-                       return ret;
+                       goto err;
 
                if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
                    le32_to_cpu(v.children[1]) != s.k->p.offset) {
-                       bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+                       bch_err(c, "snapshot parent %u missing pointer to child %llu",
                                id, s.k->p.offset);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
        }
 
        for (i = 0; i < 2 && s.v->children[i]; i++) {
                id = le32_to_cpu(s.v->children[i]);
 
-               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               ret = snapshot_lookup(trans, id, &v);
                if (ret == -ENOENT)
-                       bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+                       bch_err(c, "snapshot node %llu has nonexistent child %u",
                                s.k->p.offset, id);
                if (ret)
-                       return ret;
+                       goto err;
 
                if (le32_to_cpu(v.parent) != s.k->p.offset) {
-                       bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+                       bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
                                id, le32_to_cpu(v.parent), s.k->p.offset);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
        }
 
-       return 0;
+       should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
+               !BCH_SNAPSHOT_DELETED(s.v);
+
+       if (should_have_subvol) {
+               id = le32_to_cpu(s.v->subvol);
+               ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+               if (ret == -ENOENT)
+                       bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
+               if (ret)
+                       goto err;
+
+               if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+                       bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+                               s.k->p.offset);
+                       ret = -EINVAL;
+                       goto err;
+               }
+       } else {
+               if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+                       struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
+
+                       ret = PTR_ERR_OR_ZERO(u);
+                       if (ret)
+                               goto err;
+
+                       bkey_reassemble(&u->k_i, s.s_c);
+                       u->v.subvol = 0;
+                       ret = bch2_trans_update(trans, iter, &u->k_i, 0);
+                       if (ret)
+                               goto err;
+               }
+       }
+
+       if (BCH_SNAPSHOT_DELETED(s.v))
+               set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+err:
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
 }
 
-int bch2_fs_snapshots_check(struct bch_fs *c)
+int bch2_fs_check_snapshots(struct bch_fs *c)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bch_snapshot s;
-       unsigned id;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_snapshots, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_snapshot(&trans, &iter, k));
+
+       if (ret)
+               bch_err(c, "error %i checking snapshots", ret);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
 
-               ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+static int check_subvol(struct btree_trans *trans,
+                       struct btree_iter *iter,
+                       struct bkey_s_c k)
+{
+       struct bkey_s_c_subvolume subvol;
+       struct bch_snapshot snapshot;
+       unsigned snapid;
+       int ret;
+
+       if (k.k->type != KEY_TYPE_subvolume)
+               return 0;
+
+       subvol = bkey_s_c_to_subvolume(k);
+       snapid = le32_to_cpu(subvol.v->snapshot);
+       ret = snapshot_lookup(trans, snapid, &snapshot);
+
+       if (ret == -ENOENT)
+               bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
+                       k.k->p.offset, snapid);
+       if (ret)
+               return ret;
+
+       if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+               ret = bch2_subvolume_delete(trans, iter->pos.offset);
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       bch_err(trans->c, "error deleting subvolume %llu: %s",
+                               iter->pos.offset, bch2_err_str(ret));
                if (ret)
-                       break;
+                       return ret;
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
-       if (ret) {
-               bch_err(c, "error %i checking snapshots", ret);
-               goto err;
-       }
+       return 0;
+}
+
+int bch2_fs_check_subvols(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_subvol(&trans, &iter, k));
 
-       for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_subvolume)
-                       continue;
-again_2:
-               id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
-               ret = snapshot_lookup(&trans, id, &s);
-
-               if (ret == -EINTR) {
-                       k = bch2_btree_iter_peek(&iter);
-                       goto again_2;
-               } else if (ret == -ENOENT)
-                       bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
-                               k.k->p.offset, id);
-               else if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-err:
        bch2_trans_exit(&trans);
+
        return ret;
 }
 
@@ -290,49 +354,19 @@ int bch2_fs_snapshots_start(struct bch_fs *c)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       bool have_deleted = false;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-              if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
-                      break;
-
-               if (k.k->type != KEY_TYPE_snapshot) {
-                       bch_err(c, "found wrong key type %u in snapshot node table",
-                               k.k->type);
-                       continue;
-               }
-
-               if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
-                       have_deleted = true;
-
-               ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+       for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k,
+               bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?:
+               bch2_snapshot_set_equiv(&trans, k));
 
-       if (ret)
-               goto err;
-
-       ret = bch2_snapshots_set_equiv(&trans);
-       if (ret)
-               goto err;
-err:
        bch2_trans_exit(&trans);
 
-       if (!ret && have_deleted) {
-               bch_info(c, "restarting deletion of dead snapshots");
-               if (c->opts.fsck) {
-                       bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
-               } else {
-                       bch2_delete_dead_snapshots(c);
-               }
-       }
-
+       if (ret)
+               bch_err(c, "error starting snapshots: %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -369,8 +403,10 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
                goto err;
 
        bkey_reassemble(&s->k_i, k);
-
        SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+       SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+       s->v.subvol = 0;
+
        ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
        if (ret)
                goto err;
@@ -481,7 +517,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
                        goto err;
 
                if (!k.k || !k.k->p.offset) {
-                       ret = -ENOSPC;
+                       ret = -BCH_ERR_ENOSPC_snapshot_create;
                        goto err;
                }
 
@@ -534,6 +570,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
 
                n->v.children[0] = cpu_to_le32(new_snapids[0]);
                n->v.children[1] = cpu_to_le32(new_snapids[1]);
+               n->v.subvol = 0;
                SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
                ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
                if (ret)
@@ -544,141 +581,100 @@ err:
        return ret;
 }
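
Throughout this hunk, bare errno returns (-ENOSPC) and "%i" error logging give way to dedicated, named codes such as -BCH_ERR_ENOSPC_snapshot_create, rendered for log messages via bch2_err_str(). The standalone C sketch below illustrates that general idea only; the enum values, names, and the strerror() fallback are invented for illustration and are not the errcode tables used by this tree.

/*
 * Illustrative sketch (not code from this tree): private error codes
 * above the standard errno range, each with a descriptive name,
 * decoded back to a string for log messages.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum example_errcode {
	ERR_START = 2048,			/* above any standard errno */
	ERR_ENOSPC_snapshot_create,		/* hypothetical names */
	ERR_ENOSPC_subvolume_create,
	ERR_MAX,
};

static const char * const example_err_strs[] = {
	[ERR_ENOSPC_snapshot_create  - ERR_START] = "ENOSPC_snapshot_create",
	[ERR_ENOSPC_subvolume_create - ERR_START] = "ENOSPC_subvolume_create",
};

/* Decode either a private code or a standard errno into readable text: */
static const char *example_err_str(int err)
{
	err = abs(err);

	if (err > ERR_START && err < ERR_MAX)
		return example_err_strs[err - ERR_START];
	return strerror(err);
}

int main(void)
{
	printf("%s\n", example_err_str(-ERR_ENOSPC_snapshot_create));
	printf("%s\n", example_err_str(-ENOSPC));
	return 0;
}
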
 
-static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+static int snapshot_delete_key(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c k,
+                              snapshot_id_list *deleted,
+                              snapshot_id_list *equiv_seen,
+                              struct bpos *last_pos)
 {
-       BUG_ON(snapshot_list_has_id(s, id));
-
-       if (s->nr == s->size) {
-               size_t new_size = max(8U, s->size * 2);
-               void *n = krealloc(s->d,
-                                  new_size * sizeof(s->d[0]),
-                                  GFP_KERNEL);
-               if (!n) {
-                       pr_err("error allocating snapshot ID list");
-                       return -ENOMEM;
-               }
+       struct bch_fs *c = trans->c;
+       u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
 
-               s->d    = n;
-               s->size = new_size;
-       };
+       if (bkey_cmp(k.k->p, *last_pos))
+               equiv_seen->nr = 0;
+       *last_pos = k.k->p;
 
-       s->d[s->nr++] = id;
-       return 0;
+       if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+           snapshot_list_has_id(equiv_seen, equiv)) {
+               return bch2_btree_delete_at(trans, iter,
+                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       } else {
+               return snapshot_list_add(c, equiv_seen, equiv);
+       }
 }
 
-static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
-                                          struct snapshot_id_list *deleted,
-                                          enum btree_id btree_id)
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
+                                         struct bkey_s_c k)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct snapshot_id_list equiv_seen = { 0 };
-       struct bpos last_pos = POS_MIN;
-       int ret = 0;
+       struct bkey_s_c_snapshot snap;
+       u32 children[2];
+       int ret;
 
-       /*
-        * XXX: We should also delete whiteouts that no longer overwrite
-        * anything
-        */
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
 
-       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       while ((bch2_trans_begin(trans),
-               (k = bch2_btree_iter_peek(&iter)).k) &&
-              !(ret = bkey_err(k))) {
-               u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
-
-               if (bkey_cmp(k.k->p, last_pos))
-                       equiv_seen.nr = 0;
-               last_pos = k.k->p;
-
-               if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
-                   snapshot_list_has_id(&equiv_seen, equiv)) {
-                       if (btree_id == BTREE_ID_inodes &&
-                           bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
-                               continue;
-
-                       ret = __bch2_trans_do(trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL,
-                               bch2_btree_iter_traverse(&iter) ?:
-                               bch2_btree_delete_at(trans, &iter,
-                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
-                       if (ret)
-                               break;
-               } else {
-                       ret = snapshot_id_add(&equiv_seen, equiv);
-                       if (ret)
-                               break;
-               }
+       snap = bkey_s_c_to_snapshot(k);
+       if (BCH_SNAPSHOT_DELETED(snap.v) ||
+           BCH_SNAPSHOT_SUBVOL(snap.v))
+               return 0;
 
-               bch2_btree_iter_advance(&iter);
-       }
-       bch2_trans_iter_exit(trans, &iter);
+       children[0] = le32_to_cpu(snap.v->children[0]);
+       children[1] = le32_to_cpu(snap.v->children[1]);
 
-       kfree(equiv_seen.d);
+       ret   = snapshot_live(trans, children[0]) ?:
+               snapshot_live(trans, children[1]);
+       if (ret < 0)
+               return ret;
 
-       return ret;
+       if (!ret)
+               return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+       return 0;
 }
 
-static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+int bch2_delete_dead_snapshots(struct bch_fs *c)
 {
-       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_snapshot snap;
-       struct snapshot_id_list deleted = { 0 };
-       u32 i, id, children[2];
+       snapshot_id_list deleted = { 0 };
+       u32 i, id;
        int ret = 0;
 
+       if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
+               return 0;
+
+       if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+               ret = bch2_fs_read_write_early(c);
+               if (ret) {
+                       bch_err(c, "error deleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+                       return ret;
+               }
+       }
+
        bch2_trans_init(&trans, c, 0, 0);
 
        /*
         * For every snapshot node: If we have no live children and it's not
         * pointed to by a subvolume, delete it:
         */
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
-
-               snap = bkey_s_c_to_snapshot(k);
-               if (BCH_SNAPSHOT_DELETED(snap.v) ||
-                   BCH_SNAPSHOT_SUBVOL(snap.v))
-                       continue;
-
-               children[0] = le32_to_cpu(snap.v->children[0]);
-               children[1] = le32_to_cpu(snap.v->children[1]);
-
-               ret   = snapshot_live(&trans, children[0]) ?:
-                       snapshot_live(&trans, children[1]);
-               if (ret < 0)
-                       break;
-               if (ret)
-                       continue;
-
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
-               if (ret) {
-                       bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+                       POS_MIN, 0, k,
+                       NULL, NULL, 0,
+               bch2_delete_redundant_snapshot(&trans, &iter, k));
        if (ret) {
-               bch_err(c, "error walking snapshots: %i", ret);
+               bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
                goto err;
        }
 
-       ret = bch2_snapshots_set_equiv(&trans);
-       if (ret)
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k,
+               bch2_snapshot_set_equiv(&trans, k));
+       if (ret) {
+               bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
                goto err;
+       }
 
        for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
                           POS_MIN, 0, k, ret) {
@@ -687,7 +683,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
 
                snap = bkey_s_c_to_snapshot(k);
                if (BCH_SNAPSHOT_DELETED(snap.v)) {
-                       ret = snapshot_id_add(&deleted, k.k->p.offset);
+                       ret = snapshot_list_add(c, &deleted, k.k->p.offset);
                        if (ret)
                                break;
                }
@@ -695,39 +691,59 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
        bch2_trans_iter_exit(&trans, &iter);
 
        if (ret) {
-               bch_err(c, "error walking snapshots: %i", ret);
+               bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
                goto err;
        }
 
        for (id = 0; id < BTREE_ID_NR; id++) {
+               struct bpos last_pos = POS_MIN;
+               snapshot_id_list equiv_seen = { 0 };
+
                if (!btree_type_has_snapshots(id))
                        continue;
 
-               ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+               ret = for_each_btree_key_commit(&trans, iter,
+                               id, POS_MIN,
+                               BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               NULL, NULL, BTREE_INSERT_NOFAIL,
+                       snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
+
+               darray_exit(&equiv_seen);
+
                if (ret) {
-                       bch_err(c, "error deleting snapshot keys: %i", ret);
+                       bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
                        goto err;
                }
        }
 
        for (i = 0; i < deleted.nr; i++) {
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, deleted.d[i]));
+               ret = commit_do(&trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(&trans, deleted.data[i]));
                if (ret) {
-                       bch_err(c, "error deleting snapshot %u: %i",
-                               deleted.d[i], ret);
+                       bch_err(c, "error deleting snapshot %u: %s",
+                               deleted.data[i], bch2_err_str(ret));
                        goto err;
                }
        }
+
+       clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
 err:
-       kfree(deleted.d);
+       darray_exit(&deleted);
        bch2_trans_exit(&trans);
+       return ret;
+}
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+       bch2_delete_dead_snapshots(c);
        percpu_ref_put(&c->writes);
 }
 
-static void bch2_delete_dead_snapshots(struct bch_fs *c)
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
 {
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
+       if (!percpu_ref_tryget_live(&c->writes))
                return;
 
        if (!queue_work(system_long_wq, &c->snapshot_delete_work))
@@ -737,24 +753,35 @@ static void bch2_delete_dead_snapshots(struct bch_fs *c)
 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
                                           struct btree_trans_commit_hook *h)
 {
-       bch2_delete_dead_snapshots(trans->c);
+       struct bch_fs *c = trans->c;
+
+       set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+
+       if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
+               return 0;
+
+       bch2_delete_dead_snapshots_async(c);
        return 0;
 }
 
 /* Subvolumes: */
 
-const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          int rw, struct printbuf *err)
 {
-       if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
-               return "invalid pos";
-
-       if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
-               return "invalid pos";
+       if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 ||
+           bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) {
+               prt_printf(err, "invalid pos");
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
-               return "bad val size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
@@ -762,7 +789,7 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
 
-       pr_buf(out, "root %llu snapshot id %u",
+       prt_printf(out, "root %llu snapshot id %u",
               le64_to_cpu(s.v->inode),
               le32_to_cpu(s.v->snapshot));
 }
@@ -824,7 +851,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
        struct bkey_s_c k;
        struct bkey_s_c_subvolume subvol;
        struct btree_trans_commit_hook *h;
-       struct bkey_i *delete;
        u32 snapid;
        int ret = 0;
 
@@ -846,19 +872,14 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
        subvol = bkey_s_c_to_subvolume(k);
        snapid = le32_to_cpu(subvol.v->snapshot);
 
-       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
-       ret = PTR_ERR_OR_ZERO(delete);
+       ret = bch2_btree_delete_at(trans, &iter, 0);
        if (ret)
                goto err;
 
-       bkey_init(&delete->k);
-       delete->k.p = iter.pos;
-       ret = bch2_trans_update(trans, &iter, delete, 0);
+       ret = bch2_snapshot_node_set_deleted(trans, snapid);
        if (ret)
                goto err;
 
-       ret = bch2_snapshot_node_set_deleted(trans, snapid);
-
        h = bch2_trans_kmalloc(trans, sizeof(*h));
        ret = PTR_ERR_OR_ZERO(h);
        if (ret)
@@ -875,14 +896,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs,
                                snapshot_wait_for_pagecache_and_delete_work);
-       struct snapshot_id_list s;
+       snapshot_id_list s;
        u32 *id;
        int ret = 0;
 
        while (!ret) {
                mutex_lock(&c->snapshots_unlinked_lock);
                s = c->snapshots_unlinked;
-               memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+               darray_init(&c->snapshots_unlinked);
                mutex_unlock(&c->snapshots_unlinked_lock);
 
                if (!s.nr)
@@ -890,16 +911,16 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
 
                bch2_evict_subvolume_inodes(c, &s);
 
-               for (id = s.d; id < s.d + s.nr; id++) {
+               for (id = s.data; id < s.data + s.nr; id++) {
                        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                      bch2_subvolume_delete(&trans, *id));
                        if (ret) {
-                               bch_err(c, "error %i deleting subvolume %u", ret, *id);
+                               bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
                                break;
                        }
                }
 
-               kfree(s.d);
+               darray_exit(&s);
        }
 
        percpu_ref_put(&c->writes);
@@ -919,13 +940,13 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
 
        mutex_lock(&c->snapshots_unlinked_lock);
        if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
-               ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+               ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
        mutex_unlock(&c->snapshots_unlinked_lock);
 
        if (ret)
                return ret;
 
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
+       if (unlikely(!percpu_ref_tryget_live(&c->writes)))
                return -EROFS;
 
        if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
@@ -1010,7 +1031,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
        }
 
        if (!ret)
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_subvolume_create;
        goto err;
 found_slot:
        snapshot_subvols[0] = dst_iter.pos.offset;
index 4abe53df2788466f58fe347f4611a2370fcf1f60..02a636644988a4ba51327c071b20906821cc8f93 100644 (file)
@@ -2,10 +2,12 @@
 #ifndef _BCACHEFS_SUBVOLUME_H
 #define _BCACHEFS_SUBVOLUME_H
 
+#include "darray.h"
 #include "subvolume_types.h"
 
 void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
+                         int rw, struct printbuf *);
 
 #define bch2_bkey_ops_snapshot (struct bkey_ops) {             \
        .key_invalid    = bch2_snapshot_invalid,                \
@@ -25,6 +27,16 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
        return snapshot_t(c, id)->parent;
 }
 
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+       return snapshot_t(c, id)->equiv;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{
+       return id == snapshot_t(c, id)->equiv;
+}
+
 static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
 {
        struct snapshot_t *s = snapshot_t(c, id);
@@ -56,59 +68,45 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
        return id == ancestor;
 }
 
-struct snapshots_seen {
-       struct bpos                     pos;
-       size_t                          nr;
-       size_t                          size;
-       u32                             *d;
-};
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
 {
-       kfree(s->d);
-       s->d = NULL;
-}
+       u32 *i;
 
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
-       memset(s, 0, sizeof(*s));
+       darray_for_each(*s, i)
+               if (*i == id)
+                       return true;
+       return false;
 }
 
-static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
 {
-       if (s->nr == s->size) {
-               size_t new_size = max(s->size, (size_t) 128) * 2;
-               u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
-
-               if (!d) {
-                       bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
-                               new_size);
-                       return -ENOMEM;
-               }
-
-               s->size = new_size;
-               s->d    = d;
-       }
+       u32 *i;
 
-       s->d[s->nr++] = id;
-       return 0;
+       darray_for_each(*s, i)
+               if (bch2_snapshot_is_ancestor(c, id, *i))
+                       return true;
+       return false;
 }
 
-static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
 {
-       unsigned i;
+       int ret;
 
-       for (i = 0; i < s->nr; i++)
-               if (id == s->d[i])
-                       return true;
-       return false;
+       BUG_ON(snapshot_list_has_id(s, id));
+       ret = darray_push(s, id);
+       if (ret)
+               bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+       return ret;
 }
 
-int bch2_fs_snapshots_check(struct bch_fs *);
+int bch2_fs_check_snapshots(struct bch_fs *);
+int bch2_fs_check_subvols(struct bch_fs *);
+
 void bch2_fs_snapshots_exit(struct bch_fs *);
 int bch2_fs_snapshots_start(struct bch_fs *);
 
-const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
+                          int rw, struct printbuf *);
 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_subvolume (struct bkey_ops) {            \
@@ -126,6 +124,9 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 int bch2_snapshot_node_create(struct btree_trans *, u32,
                              u32 *, u32 *, unsigned);
 
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
 int bch2_subvolume_delete(struct btree_trans *, u32);
 int bch2_subvolume_unlink(struct btree_trans *, u32);
 int bch2_subvolume_create(struct btree_trans *, u64, u32,
index 9410b9587591e8c479765c911b3a586e5a6d0bef..f7562b5d51dff4ba1c07e52857fe793bcc75a794 100644 (file)
@@ -2,10 +2,8 @@
 #ifndef _BCACHEFS_SUBVOLUME_TYPES_H
 #define _BCACHEFS_SUBVOLUME_TYPES_H
 
-struct snapshot_id_list {
-       u32             nr;
-       u32             size;
-       u32             *d;
-};
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
 
 #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
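
The snapshot_id_list conversion above replaces the open-coded nr/size/d triple grown with krealloc() by a DARRAY(u32) typedef plus the generic darray_push()/darray_for_each()/darray_exit() helpers seen earlier in this diff. The standalone C sketch below mirrors the shape of that pattern; the helper bodies are simplified stand-ins written for illustration, not the darray.h implementation from this tree.

/*
 * Illustrative stand-in (not libbcachefs/darray.h): a typed growable
 * array with add / has_id / exit helpers, matching the shape of the
 * snapshot_id_list usage in this commit.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	uint32_t	*data;	/* heap-allocated element storage */
	size_t		nr;	/* elements currently stored */
	size_t		size;	/* elements allocated */
} snapshot_id_list;

static int snapshot_list_add(snapshot_id_list *s, uint32_t id)
{
	if (s->nr == s->size) {
		size_t new_size = s->size ? s->size * 2 : 8;
		uint32_t *n = realloc(s->data, new_size * sizeof(*n));

		if (!n)
			return -ENOMEM;
		s->data = n;
		s->size = new_size;
	}

	s->data[s->nr++] = id;
	return 0;
}

static bool snapshot_list_has_id(const snapshot_id_list *s, uint32_t id)
{
	for (size_t i = 0; i < s->nr; i++)
		if (s->data[i] == id)
			return true;
	return false;
}

static void snapshot_list_exit(snapshot_id_list *s)
{
	free(s->data);
	s->data = NULL;
	s->nr = s->size = 0;
}

int main(void)
{
	snapshot_id_list deleted = { 0 };

	for (uint32_t id = 1; id <= 5; id++)
		if (snapshot_list_add(&deleted, id))
			return 1;

	printf("has 3: %d, has 9: %d\n",
	       snapshot_list_has_id(&deleted, 3),
	       snapshot_list_has_id(&deleted, 9));

	snapshot_list_exit(&deleted);
	return 0;
}
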
index 49dafdad77cd9d4bb7990b9f73687a754970d808..60c1f03c05af48bd7badfa39e157a53b0bf1cc41 100644 (file)
 #include "io.h"
 #include "journal.h"
 #include "journal_io.h"
+#include "journal_sb.h"
 #include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "quota.h"
 #include "super-io.h"
 #include "super.h"
 #include "vstructs.h"
+#include "counters.h"
 
 #include <linux/backing-dev.h>
+#include <linux/pretty-printers.h>
 #include <linux/sort.h>
 
+#include <trace/events/bcachefs.h>
+
 const char * const bch2_sb_fields[] = {
 #define x(name, nr)    #name,
        BCH_SB_FIELDS()
@@ -95,8 +100,7 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
 
 void bch2_free_super(struct bch_sb_handle *sb)
 {
-       if (sb->bio)
-               bio_put(sb->bio);
+       kfree(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
                blkdev_put(sb->bdev, sb->mode);
 
@@ -123,11 +127,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
                u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
 
                if (new_bytes > max_bytes) {
-                       char buf[BDEVNAME_SIZE];
-
-                       pr_err("%s: superblock too big: want %zu but have %llu",
-                              bdevname(sb->bdev, buf), new_bytes, max_bytes);
-                       return -ENOSPC;
+                       pr_err("%pg: superblock too big: want %zu but have %llu",
+                              sb->bdev, new_bytes, max_bytes);
+                       return -BCH_ERR_ENOSPC_sb;
                }
        }
 
@@ -138,13 +140,15 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
                return -ENOMEM;
 
        if (sb->have_bio) {
-               bio = bio_kmalloc(GFP_KERNEL,
-                       DIV_ROUND_UP(new_buffer_size, PAGE_SIZE));
+               unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE);
+
+               bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
                if (!bio)
                        return -ENOMEM;
 
-               if (sb->bio)
-                       bio_put(sb->bio);
+               bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+               kfree(sb->bio);
                sb->bio = bio;
        }
 
@@ -208,23 +212,23 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
        unsigned i;
 
        if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
-               pr_buf(out, "Not a bcachefs superblock layout");
+               prt_printf(out, "Not a bcachefs superblock layout");
                return -EINVAL;
        }
 
        if (layout->layout_type != 0) {
-               pr_buf(out, "Invalid superblock layout type %u",
+               prt_printf(out, "Invalid superblock layout type %u",
                       layout->layout_type);
                return -EINVAL;
        }
 
        if (!layout->nr_superblocks) {
-               pr_buf(out, "Invalid superblock layout: no superblocks");
+               prt_printf(out, "Invalid superblock layout: no superblocks");
                return -EINVAL;
        }
 
        if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
-               pr_buf(out, "Invalid superblock layout: too many superblocks");
+               prt_printf(out, "Invalid superblock layout: too many superblocks");
                return -EINVAL;
        }
 
@@ -236,7 +240,7 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
                offset = le64_to_cpu(layout->sb_offset[i]);
 
                if (offset < prev_offset + max_sectors) {
-                       pr_buf(out, "Invalid superblock layout: superblocks overlap\n"
+                       prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
                               "  (sb %u ends at %llu next starts at %llu",
                               i - 1, prev_offset + max_sectors, offset);
                        return -EINVAL;
@@ -247,82 +251,111 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
        return 0;
 }
 
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+                           int rw)
 {
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
        struct bch_sb_field_members *mi;
+       enum bch_opt_id opt_id;
        u32 version, version_min;
        u16 block_size;
        int ret;
 
        version         = le16_to_cpu(sb->version);
-       version_min     = version >= bcachefs_metadata_version_new_versioning
+       version_min     = version >= bcachefs_metadata_version_bkey_renumber
                ? le16_to_cpu(sb->version_min)
                : version;
 
        if (version    >= bcachefs_metadata_version_max) {
-               pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+               prt_printf(out, "Unsupported superblock version %u (min %u, max %u)",
                       version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
                return -EINVAL;
        }
 
        if (version_min < bcachefs_metadata_version_min) {
-               pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+               prt_printf(out, "Unsupported superblock version %u (min %u, max %u)",
                       version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
                return -EINVAL;
        }
 
        if (version_min > version) {
-               pr_buf(out, "Bad minimum version %u, greater than version field %u",
+               prt_printf(out, "Bad minimum version %u, greater than version field %u",
                       version_min, version);
                return -EINVAL;
        }
 
        if (sb->features[1] ||
            (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
-               pr_buf(out, "Filesystem has incompatible features");
+               prt_printf(out, "Filesystem has incompatible features");
                return -EINVAL;
        }
 
        block_size = le16_to_cpu(sb->block_size);
 
        if (block_size > PAGE_SECTORS) {
-               pr_buf(out, "Block size too big (got %u, max %u)",
+               prt_printf(out, "Block size too big (got %u, max %u)",
                       block_size, PAGE_SECTORS);
                return -EINVAL;
        }
 
        if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) {
-               pr_buf(out, "Bad user UUID (got zeroes)");
+               prt_printf(out, "Bad user UUID (got zeroes)");
                return -EINVAL;
        }
 
        if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) {
-               pr_buf(out, "Bad intenal UUID (got zeroes)");
+               prt_printf(out, "Bad internal UUID (got zeroes)");
                return -EINVAL;
        }
 
        if (!sb->nr_devices ||
            sb->nr_devices > BCH_SB_MEMBERS_MAX) {
-               pr_buf(out, "Bad number of member devices %u (max %u)",
+               prt_printf(out, "Bad number of member devices %u (max %u)",
                       sb->nr_devices, BCH_SB_MEMBERS_MAX);
                return -EINVAL;
        }
 
        if (sb->dev_idx >= sb->nr_devices) {
-               pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)",
+               prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
                       sb->dev_idx, sb->nr_devices);
                return -EINVAL;
        }
 
        if (!sb->time_precision ||
            le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
-               pr_buf(out, "Invalid time precision: %u (min 1, max %lu)",
+               prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
                       le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
                return -EINVAL;
        }
 
+       if (rw == READ) {
+               /*
+                * Been seeing a bug where these are getting inexplicably
+                * zeroed, so we're now validating them, but we have to be
+                * careful not to prevent people's filesystems from mounting:
+                */
+               if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+                       SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+               if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+                       SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+       }
+
+       for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+               const struct bch_option *opt = bch2_opt_table + opt_id;
+
+               if (opt->get_sb != BCH2_NO_SB_OPT) {
+                       u64 v = bch2_opt_from_sb(sb, opt_id);
+
+                       prt_printf(out, "Invalid option ");
+                       ret = bch2_opt_validate(opt, v, out);
+                       if (ret)
+                               return ret;
+
+                       printbuf_reset(out);
+               }
+       }
+
        /* validate layout */
        ret = validate_sb_layout(&sb->layout, out);
        if (ret)
@@ -330,13 +363,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
 
        vstruct_for_each(sb, f) {
                if (!f->u64s) {
-                       pr_buf(out, "Invalid superblock: optional with size 0 (type %u)",
+                       prt_printf(out, "Invalid superblock: optional with size 0 (type %u)",
                               le32_to_cpu(f->type));
                        return -EINVAL;
                }
 
                if (vstruct_next(f) > vstruct_last(sb)) {
-                       pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+                       prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
                               le32_to_cpu(f->type));
                        return -EINVAL;
                }
@@ -345,7 +378,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
        /* members must be validated first: */
        mi = bch2_sb_get_members(sb);
        if (!mi) {
-               pr_buf(out, "Invalid superblock: member info area missing");
+               prt_printf(out, "Invalid superblock: member info area missing");
                return -EINVAL;
        }
 
@@ -424,7 +457,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
        memcpy(dst->compat,     src->compat,    sizeof(dst->compat));
 
        for (i = 0; i < BCH_SB_FIELD_NR; i++) {
-               if (i == BCH_SB_FIELD_journal)
+               if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
                        continue;
 
                src_f = bch2_sb_field_get(src, i);
@@ -455,9 +488,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
        __copy_super(&c->disk_sb, src);
 
-       if (BCH_SB_INITIALIZED(c->disk_sb.sb))
-               set_bit(BCH_FS_INITIALIZED, &c->flags);
-
        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
                return ret;
@@ -498,36 +528,34 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf
        size_t bytes;
        int ret;
 reread:
-       bio_reset(sb->bio);
-       bio_set_dev(sb->bio, sb->bdev);
+       bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
        sb->bio->bi_iter.bi_sector = offset;
-       bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
 
        ret = submit_bio_wait(sb->bio);
        if (ret) {
-               pr_buf(err, "IO error: %i", ret);
+               prt_printf(err, "IO error: %i", ret);
                return ret;
        }
 
        if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
-               pr_buf(err, "Not a bcachefs superblock");
+               prt_printf(err, "Not a bcachefs superblock");
                return -EINVAL;
        }
 
        version         = le16_to_cpu(sb->sb->version);
-       version_min     = version >= bcachefs_metadata_version_new_versioning
+       version_min     = version >= bcachefs_metadata_version_bkey_renumber
                ? le16_to_cpu(sb->sb->version_min)
                : version;
 
        if (version    >= bcachefs_metadata_version_max) {
-               pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+               prt_printf(err, "Unsupported superblock version %u (min %u, max %u)",
                       version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
                return -EINVAL;
        }
 
        if (version_min < bcachefs_metadata_version_min) {
-               pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+               prt_printf(err, "Unsupported superblock version %u (min %u, max %u)",
                       version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
                return -EINVAL;
        }
@@ -535,7 +563,7 @@ reread:
        bytes = vstruct_bytes(sb->sb);
 
        if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
-               pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+               prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
                       bytes, 512UL << sb->sb->layout.sb_max_size_bits);
                return -EINVAL;
        }
@@ -547,7 +575,7 @@ reread:
        }
 
        if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
-               pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+               prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
                return -EINVAL;
        }
 
@@ -556,7 +584,7 @@ reread:
                            null_nonce(), sb->sb);
 
        if (bch2_crc_cmp(csum, sb->sb->csum)) {
-               pr_buf(err, "bad checksum");
+               prt_printf(err, "bad checksum");
                return -EINVAL;
        }
 
@@ -570,16 +598,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 {
        u64 offset = opt_get(*opts, sb);
        struct bch_sb_layout layout;
-       char *_err;
-       struct printbuf err;
+       struct printbuf err = PRINTBUF;
        __le64 *i;
        int ret;
 
-       _err = kmalloc(4096, GFP_KERNEL);
-       if (!_err)
-               return -ENOMEM;
-       err = _PBUF(_err, 4096);
-
        pr_verbose_init(*opts, "");
 
        memset(sb, 0, sizeof(*sb));
@@ -610,12 +632,12 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 
        ret = bch2_sb_realloc(sb, 0);
        if (ret) {
-               pr_buf(&err, "error allocating memory for superblock");
+               prt_printf(&err, "error allocating memory for superblock");
                goto err;
        }
 
        if (bch2_fs_init_fault("read_super")) {
-               pr_buf(&err, "dynamic fault");
+               prt_printf(&err, "dynamic fault");
                ret = -EFAULT;
                goto err;
        }
@@ -628,17 +650,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
                goto err;
 
        printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
-              path, _err);
-       err = _PBUF(_err, 4096);
+              path, err.buf);
+       printbuf_reset(&err);
 
        /*
         * Error reading primary superblock - read location of backup
         * superblocks:
         */
-       bio_reset(sb->bio);
-       bio_set_dev(sb->bio, sb->bdev);
+       bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
        sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
-       bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        /*
         * use sb buffer to read layout, since sb buffer is page aligned but
         * layout won't be:
@@ -647,7 +667,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 
        ret = submit_bio_wait(sb->bio);
        if (ret) {
-               pr_buf(&err, "IO error: %i", ret);
+               prt_printf(&err, "IO error: %i", ret);
                goto err;
        }
 
@@ -673,7 +693,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 got_super:
        if (le16_to_cpu(sb->sb->block_size) << 9 <
            bdev_logical_block_size(sb->bdev)) {
-               pr_buf(&err, "block size (%u) smaller than device block size (%u)",
+               prt_printf(&err, "block size (%u) smaller than device block size (%u)",
                       le16_to_cpu(sb->sb->block_size) << 9,
                       bdev_logical_block_size(sb->bdev));
                ret = -EINVAL;
@@ -683,19 +703,19 @@ got_super:
        ret = 0;
        sb->have_layout = true;
 
-       ret = bch2_sb_validate(sb, &err);
+       ret = bch2_sb_validate(sb, &err, READ);
        if (ret) {
                printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
-                      path, _err);
+                      path, err.buf);
                goto err_no_print;
        }
 out:
        pr_verbose_init(*opts, "ret %i", ret);
-       kfree(_err);
+       printbuf_exit(&err);
        return ret;
 err:
        printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
-              path, _err);
+              path, err.buf);
 err_no_print:
        bch2_free_super(sb);
        goto out;
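
bch2_read_super() above now declares its error buffer on the stack with PRINTBUF, resets it between attempts with printbuf_reset(), and frees it once with printbuf_exit(), instead of kmalloc'ing a fixed 4096-byte buffer and wrapping it with _PBUF(). The userspace sketch below shows a minimal analog of that declare/print/reset/exit flow; the struct layout and growth logic are simplified assumptions, and none of the real printbuf features (tabstops, indentation, human-readable units) are reproduced.

/*
 * Illustrative userspace analog (not include/linux/printbuf.h): an
 * error buffer declared on the stack, printed into, reset between
 * attempts, and freed once at the end.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct printbuf {
	char	*buf;
	size_t	size;
	size_t	pos;
};

#define PRINTBUF ((struct printbuf) { 0 })

static void prt_printf(struct printbuf *out, const char *fmt, ...)
{
	va_list args;
	int len;

	/* First pass: measure the formatted length */
	va_start(args, fmt);
	len = vsnprintf(NULL, 0, fmt, args);
	va_end(args);
	if (len < 0)
		return;

	/* Grow the buffer as needed */
	if (out->pos + len + 1 > out->size) {
		size_t new_size = out->size ? out->size * 2 : 64;
		char *n;

		while (new_size < out->pos + len + 1)
			new_size *= 2;
		n = realloc(out->buf, new_size);
		if (!n)
			return;
		out->buf  = n;
		out->size = new_size;
	}

	/* Second pass: append the formatted text */
	va_start(args, fmt);
	vsnprintf(out->buf + out->pos, out->size - out->pos, fmt, args);
	va_end(args);
	out->pos += len;
}

static void printbuf_reset(struct printbuf *out)
{
	out->pos = 0;
	if (out->buf)
		out->buf[0] = '\0';
}

static void printbuf_exit(struct printbuf *out)
{
	free(out->buf);
	*out = PRINTBUF;
}

int main(void)
{
	struct printbuf err = PRINTBUF;

	prt_printf(&err, "IO error: %i", 5);
	printf("first attempt:  %s\n", err.buf);

	printbuf_reset(&err);
	prt_printf(&err, "bad checksum");
	printf("second attempt: %s\n", err.buf);

	printbuf_exit(&err);
	return 0;
}
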
@@ -722,12 +742,10 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
        struct bch_sb *sb = ca->disk_sb.sb;
        struct bio *bio = ca->disk_sb.bio;
 
-       bio_reset(bio);
-       bio_set_dev(bio, ca->disk_sb.bdev);
+       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
        bio->bi_iter.bi_sector  = le64_to_cpu(sb->layout.sb_offset[0]);
        bio->bi_end_io          = write_super_endio;
        bio->bi_private         = ca;
-       bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
 
        this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
@@ -748,12 +766,10 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
        sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
                                null_nonce(), sb);
 
-       bio_reset(bio);
-       bio_set_dev(bio, ca->disk_sb.bdev);
+       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
        bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
        bio->bi_end_io          = write_super_endio;
        bio->bi_private         = ca;
-       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
        bch2_bio_map(bio, sb,
                     roundup((size_t) vstruct_bytes(sb),
                             bdev_logical_block_size(ca->disk_sb.bdev)));
@@ -769,12 +785,15 @@ int bch2_write_super(struct bch_fs *c)
 {
        struct closure *cl = &c->sb_write;
        struct bch_dev *ca;
+       struct printbuf err = PRINTBUF;
        unsigned i, sb = 0, nr_wrote;
        struct bch_devs_mask sb_written;
        bool wrote, can_mount_without_written, can_mount_with_written;
        unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
        int ret = 0;
 
+       trace_and_count(c, write_super, c, _RET_IP_);
+
        if (c->opts.very_degraded)
                degraded_flags |= BCH_FORCE_IF_LOST;
 
@@ -792,22 +811,17 @@ int bch2_write_super(struct bch_fs *c)
 
        SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
 
+       bch2_sb_counters_from_cpu(c);
+
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
 
        for_each_online_member(ca, c, i) {
-               struct printbuf buf = { NULL, NULL };
+               printbuf_reset(&err);
 
-               ret = bch2_sb_validate(&ca->disk_sb, &buf);
+               ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
                if (ret) {
-                       char *_buf = kmalloc(4096, GFP_NOFS);
-                       if (_buf) {
-                               buf = _PBUF(_buf, 4096);
-                               bch2_sb_validate(&ca->disk_sb, &buf);
-                       }
-
-                       bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf);
-                       kfree(_buf);
+                       bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
                        percpu_ref_put(&ca->io_ref);
                        goto out;
                }
@@ -816,6 +830,13 @@ int bch2_write_super(struct bch_fs *c)
        if (c->opts.nochanges)
                goto out;
 
+       /*
+        * Defer writing the superblock until filesystem initialization is
+        * complete - don't write out a partly initialized superblock:
+        */
+       if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
+               goto out;
+
        for_each_online_member(ca, c, i) {
                __set_bit(ca->dev_idx, sb_written.d);
                ca->sb_write_error = 0;
@@ -898,6 +919,7 @@ int bch2_write_super(struct bch_fs *c)
 out:
        /* Make new options visible after they're persistent: */
        bch2_sb_update(c);
+       printbuf_exit(&err);
        return ret;
 }
 
@@ -912,75 +934,9 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
        mutex_unlock(&c->sb_lock);
 }
 
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
-       u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
-       return l < r ? -1 : l > r ? 1 : 0;
-}
-
-static int bch2_sb_validate_journal(struct bch_sb *sb,
-                                   struct bch_sb_field *f,
-                                   struct printbuf *err)
-{
-       struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
-       int ret = -EINVAL;
-       unsigned nr;
-       unsigned i;
-       u64 *b;
-
-       nr = bch2_nr_journal_buckets(journal);
-       if (!nr)
-               return 0;
-
-       b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
-       if (!b)
-               return -ENOMEM;
-
-       for (i = 0; i < nr; i++)
-               b[i] = le64_to_cpu(journal->buckets[i]);
-
-       sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
-       if (!b[0]) {
-               pr_buf(err, "journal bucket at sector 0");
-               goto err;
-       }
-
-       if (b[0] < le16_to_cpu(m->first_bucket)) {
-               pr_buf(err, "journal bucket %llu before first bucket %u",
-                      b[0], le16_to_cpu(m->first_bucket));
-               goto err;
-       }
-
-       if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
-               pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
-                      b[nr - 1], le64_to_cpu(m->nbuckets));
-               goto err;
-       }
-
-       for (i = 0; i + 1 < nr; i++)
-               if (b[i] == b[i + 1]) {
-                       pr_buf(err, "duplicate journal buckets %llu", b[i]);
-                       goto err;
-               }
-
-       ret = 0;
-err:
-       kfree(b);
-       return ret;
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
-       .validate       = bch2_sb_validate_journal,
-};
-
 /* BCH_SB_FIELD_members: */
 
-static int bch2_sb_validate_members(struct bch_sb *sb,
+static int bch2_sb_members_validate(struct bch_sb *sb,
                                    struct bch_sb_field *f,
                                    struct printbuf *err)
 {
@@ -989,7 +945,7 @@ static int bch2_sb_validate_members(struct bch_sb *sb,
 
        if ((void *) (mi->members + sb->nr_devices) >
            vstruct_end(&mi->field)) {
-               pr_buf(err, "too many devices for section size");
+               prt_printf(err, "too many devices for section size");
                return -EINVAL;
        }
 
@@ -1000,28 +956,28 @@ static int bch2_sb_validate_members(struct bch_sb *sb,
                        continue;
 
                if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
-                       pr_buf(err, "device %u: too many buckets (got %llu, max %lu)",
+                       prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
                               i, le64_to_cpu(m->nbuckets), LONG_MAX);
                        return -EINVAL;
                }
 
                if (le64_to_cpu(m->nbuckets) -
                    le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
-                       pr_buf(err, "device %u: not enough buckets (got %llu, max %u)",
+                       prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
                               i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
                        return -EINVAL;
                }
 
                if (le16_to_cpu(m->bucket_size) <
                    le16_to_cpu(sb->block_size)) {
-                       pr_buf(err, "device %u: bucket size %u smaller than block size %u",
+                       prt_printf(err, "device %u: bucket size %u smaller than block size %u",
                               i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
                        return -EINVAL;
                }
 
                if (le16_to_cpu(m->bucket_size) <
                    BCH_SB_BTREE_NODE_SIZE(sb)) {
-                       pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu",
+                       prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
                               i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
                        return -EINVAL;
                }
@@ -1030,39 +986,165 @@ static int bch2_sb_validate_members(struct bch_sb *sb,
        return 0;
 }
 
+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
+                                   struct bch_sb_field *f)
+{
+       struct bch_sb_field_members *mi = field_to_type(f, members);
+       struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
+       unsigned i;
+
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
+               unsigned data_have = bch2_sb_dev_has_data(sb, i);
+               u64 bucket_size = le16_to_cpu(m->bucket_size);
+               u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
+
+               if (!bch2_member_exists(m))
+                       continue;
+
+               prt_printf(out, "Device:");
+               prt_tab(out);
+               prt_printf(out, "%u", i);
+               prt_newline(out);
+
+               printbuf_indent_add(out, 2);
+
+               prt_printf(out, "UUID:");
+               prt_tab(out);
+               pr_uuid(out, m->uuid.b);
+               prt_newline(out);
+
+               prt_printf(out, "Size:");
+               prt_tab(out);
+               prt_units_u64(out, device_size << 9);
+               prt_newline(out);
+
+               prt_printf(out, "Bucket size:");
+               prt_tab(out);
+               prt_units_u64(out, bucket_size << 9);
+               prt_newline(out);
+
+               prt_printf(out, "First bucket:");
+               prt_tab(out);
+               prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
+               prt_newline(out);
+
+               prt_printf(out, "Buckets:");
+               prt_tab(out);
+               prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
+               prt_newline(out);
+
+               prt_printf(out, "Last mount:");
+               prt_tab(out);
+               if (m->last_mount)
+                       pr_time(out, le64_to_cpu(m->last_mount));
+               else
+                       prt_printf(out, "(never)");
+               prt_newline(out);
+
+               prt_printf(out, "State:");
+               prt_tab(out);
+               prt_printf(out, "%s",
+                      BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
+                      ? bch2_member_states[BCH_MEMBER_STATE(m)]
+                      : "unknown");
+               prt_newline(out);
+
+               prt_printf(out, "Label:");
+               prt_tab(out);
+               if (BCH_MEMBER_GROUP(m)) {
+                       unsigned idx = BCH_MEMBER_GROUP(m) - 1;
+
+                       if (idx < disk_groups_nr(gi))
+                               prt_printf(out, "%s (%u)",
+                                      gi->entries[idx].label, idx);
+                       else
+                               prt_printf(out, "(bad disk labels section)");
+               } else {
+                       prt_printf(out, "(none)");
+               }
+               prt_newline(out);
+
+               prt_printf(out, "Data allowed:");
+               prt_tab(out);
+               if (BCH_MEMBER_DATA_ALLOWED(m))
+                       prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
+               else
+                       prt_printf(out, "(none)");
+               prt_newline(out);
+
+               prt_printf(out, "Has data:");
+               prt_tab(out);
+               if (data_have)
+                       prt_bitflags(out, bch2_data_types, data_have);
+               else
+                       prt_printf(out, "(none)");
+               prt_newline(out);
+
+               prt_printf(out, "Discard:");
+               prt_tab(out);
+               prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
+               prt_newline(out);
+
+               prt_printf(out, "Freespace initialized:");
+               prt_tab(out);
+               prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
+               prt_newline(out);
+
+               printbuf_indent_sub(out, 2);
+       }
+}
+
 static const struct bch_sb_field_ops bch_sb_field_ops_members = {
-       .validate       = bch2_sb_validate_members,
+       .validate       = bch2_sb_members_validate,
+       .to_text        = bch2_sb_members_to_text,
 };
 
 /* BCH_SB_FIELD_crypt: */
 
-static int bch2_sb_validate_crypt(struct bch_sb *sb,
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
                                  struct bch_sb_field *f,
                                  struct printbuf *err)
 {
        struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
 
        if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
-               pr_buf(err, "wrong size (got %llu should be %zu)",
+               prt_printf(err, "wrong size (got %zu should be %zu)",
                       vstruct_bytes(&crypt->field), sizeof(*crypt));
                return -EINVAL;
        }
 
        if (BCH_CRYPT_KDF_TYPE(crypt)) {
-               pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+               prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
                return -EINVAL;
        }
 
        return 0;
 }
 
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+       prt_printf(out, "KDF:               %llu", BCH_CRYPT_KDF_TYPE(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt n:          %llu", BCH_KDF_SCRYPT_N(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt r:          %llu", BCH_KDF_SCRYPT_R(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt p:          %llu", BCH_KDF_SCRYPT_P(crypt));
+       prt_newline(out);
+}
+
 static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
-       .validate       = bch2_sb_validate_crypt,
+       .validate       = bch2_sb_crypt_validate,
+       .to_text        = bch2_sb_crypt_to_text,
 };
 
 /* BCH_SB_FIELD_clean: */
 
-int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
 {
        struct jset_entry *entry;
        int ret;
@@ -1070,7 +1152,7 @@ int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, i
        for (entry = clean->start;
             entry < (struct jset_entry *) vstruct_end(&clean->field);
             entry = vstruct_next(entry)) {
-               ret = bch2_journal_entry_validate(c, "superblock", entry,
+               ret = bch2_journal_entry_validate(c, NULL, entry,
                                                  le16_to_cpu(c->disk_sb.sb->version),
                                                  BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
                                                  write);
@@ -1185,7 +1267,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
                u->entry.type = BCH_JSET_ENTRY_dev_usage;
                u->dev = cpu_to_le32(dev);
                u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
-               u->buckets_unavailable  = cpu_to_le64(ca->usage_base->buckets_unavailable);
 
                for (i = 0; i < BCH_DATA_NR; i++) {
                        u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
@@ -1234,7 +1315,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        }
 
        sb_clean->flags         = 0;
-       sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+       sb_clean->journal_seq   = cpu_to_le64(atomic64_read(&c->journal.seq));
 
        /* Trying to catch outstanding bug: */
        BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
@@ -1251,7 +1332,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
         * this should be in the write path, and we should be validating every
         * superblock section:
         */
-       ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+       ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
        if (ret) {
                bch_err(c, "error writing marking filesystem clean: validate error");
                goto out;
@@ -1262,14 +1343,14 @@ out:
        mutex_unlock(&c->sb_lock);
 }
 
-static int bch2_sb_validate_clean(struct bch_sb *sb,
+static int bch2_sb_clean_validate(struct bch_sb *sb,
                                  struct bch_sb_field *f,
                                  struct printbuf *err)
 {
        struct bch_sb_field_clean *clean = field_to_type(f, clean);
 
        if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
-               pr_buf(err, "wrong size (got %llu should be %zu)",
+               prt_printf(err, "wrong size (got %zu should be %zu)",
                       vstruct_bytes(&clean->field), sizeof(*clean));
                return -EINVAL;
        }
@@ -1277,8 +1358,32 @@ static int bch2_sb_validate_clean(struct bch_sb *sb,
        return 0;
 }
 
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+       struct jset_entry *entry;
+
+       prt_printf(out, "flags:          %x",   le32_to_cpu(clean->flags));
+       prt_newline(out);
+       prt_printf(out, "journal_seq:    %llu", le64_to_cpu(clean->journal_seq));
+       prt_newline(out);
+
+       for (entry = clean->start;
+            entry != vstruct_end(&clean->field);
+            entry = vstruct_next(entry)) {
+               if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+                   !entry->u64s)
+                       continue;
+
+               bch2_journal_entry_to_text(out, NULL, entry);
+               prt_newline(out);
+       }
+}
+
 static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
-       .validate       = bch2_sb_validate_clean,
+       .validate       = bch2_sb_clean_validate,
+       .to_text        = bch2_sb_clean_to_text,
 };
 
 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
@@ -1289,24 +1394,25 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 };
 
 static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                 struct printbuf *orig_err)
+                                 struct printbuf *err)
 {
        unsigned type = le32_to_cpu(f->type);
-       struct printbuf err = *orig_err;
+       struct printbuf field_err = PRINTBUF;
        int ret;
 
        if (type >= BCH_SB_FIELD_NR)
                return 0;
 
-       pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]);
-
-       ret = bch2_sb_field_ops[type]->validate(sb, f, &err);
+       ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err);
        if (ret) {
-               pr_buf(&err, "\n");
-               bch2_sb_field_to_text(&err, sb, f);
-               *orig_err = err;
+               prt_printf(err, "Invalid superblock section %s: %s",
+                      bch2_sb_fields[type],
+                      field_err.buf);
+               prt_newline(err);
+               bch2_sb_field_to_text(err, sb, f);
        }
 
+       printbuf_exit(&field_err);
        return ret;
 }
 
@@ -1317,13 +1423,179 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
        const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR
                ? bch2_sb_field_ops[type] : NULL;
 
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 32);
+
        if (ops)
-               pr_buf(out, "%s", bch2_sb_fields[type]);
+               prt_printf(out, "%s", bch2_sb_fields[type]);
        else
-               pr_buf(out, "(unknown field %u)", type);
+               prt_printf(out, "(unknown field %u)", type);
 
-       pr_buf(out, " (size %llu):", vstruct_bytes(f));
+       prt_printf(out, " (size %zu):", vstruct_bytes(f));
+       prt_newline(out);
 
-       if (ops && ops->to_text)
+       if (ops && ops->to_text) {
+               printbuf_indent_add(out, 2);
                bch2_sb_field_ops[type]->to_text(out, sb, f);
+               printbuf_indent_sub(out, 2);
+       }
+}
+
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+       unsigned i;
+
+       prt_printf(out, "Type:                    %u", l->layout_type);
+       prt_newline(out);
+
+       prt_str(out, "Superblock max size:     ");
+       prt_units_u64(out, 512 << l->sb_max_size_bits);
+       prt_newline(out);
+
+       prt_printf(out, "Nr superblocks:          %u", l->nr_superblocks);
+       prt_newline(out);
+
+       prt_str(out, "Offsets:                 ");
+       for (i = 0; i < l->nr_superblocks; i++) {
+               if (i)
+                       prt_str(out, ", ");
+               prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+       }
+       prt_newline(out);
+}
+
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+                    bool print_layout, unsigned fields)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_sb_field *f;
+       u64 fields_have = 0;
+       unsigned nr_devices = 0;
+
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 44);
+
+       mi = bch2_sb_get_members(sb);
+       if (mi) {
+               struct bch_member *m;
+
+               for (m = mi->members;
+                    m < mi->members + sb->nr_devices;
+                    m++)
+                       nr_devices += bch2_member_exists(m);
+       }
+
+       prt_printf(out, "External UUID:");
+       prt_tab(out);
+       pr_uuid(out, sb->user_uuid.b);
+       prt_newline(out);
+
+       prt_printf(out, "Internal UUID:");
+       prt_tab(out);
+       pr_uuid(out, sb->uuid.b);
+       prt_newline(out);
+
+       prt_str(out, "Device index:");
+       prt_tab(out);
+       prt_printf(out, "%u", sb->dev_idx);
+       prt_newline(out);
+
+       prt_str(out, "Label:");
+       prt_tab(out);
+       prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+       prt_newline(out);
+
+       prt_str(out, "Version:");
+       prt_tab(out);
+       prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]);
+       prt_newline(out);
+
+       prt_printf(out, "Oldest version on disk:");
+       prt_tab(out);
+       prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]);
+       prt_newline(out);
+
+       prt_printf(out, "Created:");
+       prt_tab(out);
+       if (sb->time_base_lo)
+               pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+       else
+               prt_printf(out, "(not set)");
+       prt_newline(out);
+
+       prt_printf(out, "Sequence number:");
+       prt_tab(out);
+       prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+       prt_newline(out);
+
+       prt_printf(out, "Superblock size:");
+       prt_tab(out);
+       prt_printf(out, "%zu", vstruct_bytes(sb));
+       prt_newline(out);
+
+       prt_printf(out, "Clean:");
+       prt_tab(out);
+       prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
+       prt_newline(out);
+
+       prt_printf(out, "Devices:");
+       prt_tab(out);
+       prt_printf(out, "%u", nr_devices);
+       prt_newline(out);
+
+       prt_printf(out, "Sections:");
+       vstruct_for_each(sb, f)
+               fields_have |= 1 << le32_to_cpu(f->type);
+       prt_tab(out);
+       prt_bitflags(out, bch2_sb_fields, fields_have);
+       prt_newline(out);
+
+       prt_printf(out, "Features:");
+       prt_tab(out);
+       prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+       prt_newline(out);
+
+       prt_printf(out, "Compat features:");
+       prt_tab(out);
+       prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+       prt_newline(out);
+
+       prt_newline(out);
+       prt_printf(out, "Options:");
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+       {
+               enum bch_opt_id id;
+
+               for (id = 0; id < bch2_opts_nr; id++) {
+                       const struct bch_option *opt = bch2_opt_table + id;
+
+                       if (opt->get_sb != BCH2_NO_SB_OPT) {
+                               u64 v = bch2_opt_from_sb(sb, id);
+
+                               prt_printf(out, "%s:", opt->attr.name);
+                               prt_tab(out);
+                               bch2_opt_to_text(out, NULL, sb, opt, v,
+                                                OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+                               prt_newline(out);
+                       }
+               }
+       }
+
+       printbuf_indent_sub(out, 2);
+
+       if (print_layout) {
+               prt_newline(out);
+               prt_printf(out, "layout:");
+               prt_newline(out);
+               printbuf_indent_add(out, 2);
+               bch2_sb_layout_to_text(out, &sb->layout);
+               printbuf_indent_sub(out, 2);
+       }
+
+       vstruct_for_each(sb, f)
+               if (fields & (1 << le32_to_cpu(f->type))) {
+                       prt_newline(out);
+                       bch2_sb_field_to_text(out, sb, f);
+               }
 }
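
The new to_text helpers above all share one printbuf pattern introduced in this release: push a tabstop so label/value pairs line up, emit rows with prt_printf()/prt_tab()/prt_newline(), and bracket nested sections with printbuf_indent_add()/printbuf_indent_sub(); callers own the buffer via PRINTBUF/printbuf_exit(). A minimal sketch of that pattern follows — it is illustrative only, not part of the patch, and the function names (example_to_text, example_caller) are hypothetical:

static void example_to_text(struct printbuf *out)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);	/* align values at column 32 */

	prt_printf(out, "Label:");
	prt_tab(out);
	prt_printf(out, "%u", 42);
	prt_newline(out);

	printbuf_indent_add(out, 2);		/* nested sub-section */
	prt_printf(out, "nested:");
	prt_tab(out);
	prt_printf(out, "%s", "value");
	prt_newline(out);
	printbuf_indent_sub(out, 2);
}

static void example_caller(void)
{
	struct printbuf buf = PRINTBUF;		/* grows on demand */

	example_to_text(&buf);
	if (!buf.allocation_failure)
		pr_info("%s", buf.buf);
	printbuf_exit(&buf);			/* caller frees the buffer */
}
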
index 3b425bed17c48c51a552ae0e2b8e73bde01a91c3..14a25f6fe29a5756bd6dd218e113564afc3ac32f 100644 (file)
@@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
                __bch2_check_set_feature(c, feat);
 }
 
-/* BCH_SB_FIELD_journal: */
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
-       return j
-               ? (__le64 *) vstruct_end(&j->field) - j->buckets
-               : 0;
-}
-
 /* BCH_SB_FIELD_members: */
 
 static inline bool bch2_member_exists(struct bch_member *m)
@@ -112,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
                .durability     = BCH_MEMBER_DURABILITY(mi)
                        ? BCH_MEMBER_DURABILITY(mi) - 1
                        : 1,
+               .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
                .valid          = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
        };
 }
@@ -121,12 +113,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 void bch2_journal_super_entries_add_common(struct bch_fs *,
                                           struct jset_entry **, u64);
 
-int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
 
 int bch2_fs_mark_dirty(struct bch_fs *);
 void bch2_fs_mark_clean(struct bch_fs *);
 
 void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
                           struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
 
 #endif /* _BCACHEFS_SUPER_IO_H */
index b36e6216a8a10a2ec12b0de39ba954424d81d45d..5be4c40afa47500725e6ea95bb8ab0b0d3acebed 100644 (file)
@@ -24,6 +24,7 @@
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "errcode.h"
 #include "error.h"
 #include "fs.h"
 #include "fs-io.h"
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
+#include "counters.h"
 
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/debugfs.h>
 #include <linux/device.h>
-#include <linux/genhd.h>
 #include <linux/idr.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
+#include <linux/pretty-printers.h>
 #include <linux/random.h>
 #include <linux/sysfs.h>
 #include <crypto/hash.h>
@@ -63,14 +65,26 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
 
 #define KTYPE(type)                                                    \
-struct kobj_type type ## _ktype = {                                    \
+static const struct attribute_group type ## _group = {                 \
+       .attrs = type ## _files                                         \
+};                                                                     \
+                                                                       \
+static const struct attribute_group *type ## _groups[] = {             \
+       &type ## _group,                                                \
+       NULL                                                            \
+};                                                                     \
+                                                                       \
+static const struct kobj_type type ## _ktype = {                       \
        .release        = type ## _release,                             \
        .sysfs_ops      = &type ## _sysfs_ops,                          \
-       .default_attrs  = type ## _files                                \
+       .default_groups = type ## _groups                               \
 }
 
 static void bch2_fs_release(struct kobject *);
 static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
 
 static void bch2_fs_internal_release(struct kobject *k)
 {
@@ -84,11 +98,12 @@ static void bch2_fs_time_stats_release(struct kobject *k)
 {
 }
 
-static KTYPE(bch2_fs);
-static KTYPE(bch2_fs_internal);
-static KTYPE(bch2_fs_opts_dir);
-static KTYPE(bch2_fs_time_stats);
-static KTYPE(bch2_dev);
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
 
 static struct kset *bcachefs_kset;
 static LIST_HEAD(bch_fs_list);
@@ -188,71 +203,33 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned i, clean_passes = 0;
+       u64 seq = 0;
 
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       /*
-        * Flush journal before stopping allocators, because flushing journal
-        * blacklist entries involves allocating new btree nodes:
-        */
-       bch2_journal_flush_all_pins(&c->journal);
-
-       /*
-        * If the allocator threads didn't all start up, the btree updates to
-        * write out alloc info aren't going to work:
-        */
-       if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
-               goto nowrote_alloc;
-
        bch_verbose(c, "flushing journal and stopping allocators");
 
-       bch2_journal_flush_all_pins(&c->journal);
-       set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
        do {
                clean_passes++;
 
-               if (bch2_journal_flush_all_pins(&c->journal))
-                       clean_passes = 0;
-
-               /*
-                * In flight interior btree updates will generate more journal
-                * updates and btree updates (alloc btree):
-                */
-               if (bch2_btree_interior_updates_nr_pending(c)) {
-                       closure_wait_event(&c->btree_interior_update_wait,
-                                          !bch2_btree_interior_updates_nr_pending(c));
+               if (bch2_btree_interior_updates_flush(c) ||
+                   bch2_journal_flush_all_pins(&c->journal) ||
+                   bch2_btree_flush_all_writes(c) ||
+                   seq != atomic64_read(&c->journal.seq)) {
+                       seq = atomic64_read(&c->journal.seq);
                        clean_passes = 0;
                }
-               flush_work(&c->btree_interior_update_work);
-
-               if (bch2_journal_flush_all_pins(&c->journal))
-                       clean_passes = 0;
        } while (clean_passes < 2);
-       bch_verbose(c, "flushing journal and stopping allocators complete");
-
-       set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
-       flush_work(&c->btree_interior_update_work);
-
-       for_each_member_device(ca, c, i)
-               bch2_dev_allocator_stop(ca);
 
-       clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-       clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
+       bch_verbose(c, "flushing journal and stopping allocators complete");
 
+       if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
        bch2_fs_journal_stop(&c->journal);
 
-       /*
-        * the journal kicks off btree writes via reclaim - wait for in flight
-        * writes after stopping journal:
-        */
-       bch2_btree_flush_all_writes(c);
-
        /*
         * After stopping journal:
         */
@@ -280,10 +257,6 @@ void bch2_fs_read_only(struct bch_fs *c)
        /*
         * Block new foreground-end write operations from starting - any new
         * writes will return -EROFS:
-        *
-        * (This is really blocking new _allocations_, writes to previously
-        * allocated space can still happen until stopping the allocator in
-        * bch2_dev_allocator_stop()).
         */
        percpu_ref_kill(&c->writes);
 
@@ -315,7 +288,7 @@ void bch2_fs_read_only(struct bch_fs *c)
            !test_bit(BCH_FS_ERROR, &c->flags) &&
            !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
            test_bit(BCH_FS_STARTED, &c->flags) &&
-           test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+           test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
            !c->opts.norecovery) {
                bch_verbose(c, "marking filesystem clean");
                bch2_fs_mark_clean(c);
@@ -354,26 +327,12 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
 {
        int ret;
 
-       ret = bch2_gc_thread_start(c);
-       if (ret) {
-               bch_err(c, "error starting gc thread");
-               return ret;
-       }
-
-       ret = bch2_copygc_start(c);
-       if (ret) {
-               bch_err(c, "error starting copygc thread");
-               return ret;
-       }
-
        ret = bch2_rebalance_start(c);
        if (ret) {
                bch_err(c, "error starting rebalance thread");
                return ret;
        }
 
-       schedule_work(&c->ec_stripe_delete_work);
-
        return 0;
 }
 
@@ -406,25 +365,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
        if (ret)
                goto err;
 
-       clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+       clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
 
        for_each_rw_member(ca, c, i)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       for_each_rw_member(ca, c, i) {
-               ret = bch2_dev_allocator_start(ca);
-               if (ret) {
-                       bch_err(c, "error starting allocator threads");
-                       percpu_ref_put(&ca->io_ref);
-                       goto err;
-               }
+       ret = bch2_gc_thread_start(c);
+       if (ret) {
+               bch_err(c, "error starting gc thread");
+               return ret;
        }
 
-       set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+       ret = bch2_copygc_start(c);
+       if (ret) {
+               bch_err(c, "error starting copygc thread");
+               return ret;
+       }
 
-       for_each_rw_member(ca, c, i)
-               bch2_wake_allocator(ca);
+       schedule_work(&c->ec_stripe_delete_work);
+
+       bch2_do_discards(c);
+       bch2_do_invalidates(c);
 
        if (!early) {
                ret = bch2_fs_read_write_late(c);
@@ -463,6 +425,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
 
+       bch2_fs_counters_exit(c);
        bch2_fs_snapshots_exit(c);
        bch2_fs_quota_exit(c);
        bch2_fs_fsio_exit(c);
@@ -480,7 +443,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        bch2_journal_keys_free(&c->journal_keys);
-       bch2_journal_entries_free(&c->journal_entries);
+       bch2_journal_entries_free(c);
        percpu_free_rwsem(&c->mark_lock);
 
        if (c->btree_paths_bufs)
@@ -500,8 +463,8 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(c->unused_inode_hints);
        free_heap(&c->copygc_heap);
 
-       if (c->io_complete_wq )
-               destroy_workqueue(c->io_complete_wq );
+       if (c->io_complete_wq)
+               destroy_workqueue(c->io_complete_wq);
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
        if (c->btree_io_complete_wq)
@@ -547,6 +510,7 @@ void __bch2_fs_stop(struct bch_fs *c)
        bch2_fs_debug_exit(c);
        bch2_fs_chardev_exit(c);
 
+       kobject_put(&c->counters_kobj);
        kobject_put(&c->time_stats);
        kobject_put(&c->opts_dir);
        kobject_put(&c->internal);
@@ -615,6 +579,7 @@ static int bch2_fs_online(struct bch_fs *c)
            kobject_add(&c->internal, &c->kobj, "internal") ?:
            kobject_add(&c->opts_dir, &c->kobj, "options") ?:
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+           kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
            bch2_opts_create_sysfs_files(&c->opts_dir);
        if (ret) {
                bch_err(c, "error creating sysfs objects");
@@ -643,6 +608,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
        struct bch_sb_field_members *mi;
        struct bch_fs *c;
+       struct printbuf name = PRINTBUF;
        unsigned i, iter_size;
        int ret = 0;
 
@@ -663,6 +629,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        kobject_init(&c->internal, &bch2_fs_internal_ktype);
        kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
        kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+       kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
 
        c->minor                = -1;
        c->disk_sb.fs_sb        = true;
@@ -685,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        bch2_fs_allocator_foreground_init(c);
        bch2_fs_rebalance_init(c);
        bch2_fs_quota_init(c);
+       bch2_fs_ec_init_early(c);
 
        INIT_LIST_HEAD(&c->list);
 
@@ -698,7 +666,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_WORK(&c->journal_seq_blacklist_gc_work,
                  bch2_blacklist_entries_gc);
 
-       INIT_LIST_HEAD(&c->journal_entries);
        INIT_LIST_HEAD(&c->journal_iters);
 
        INIT_LIST_HEAD(&c->fsck_errors);
@@ -719,8 +686,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        seqcount_init(&c->usage_lock);
 
-       sema_init(&c->io_in_flight, 64);
-
        c->copy_gc_enabled              = 1;
        c->rebalance.enabled            = 1;
        c->promote_whole_extents        = true;
@@ -745,7 +710,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        if (ret)
                goto err;
 
-       uuid_unparse_lower(c->sb.user_uuid.b, c->name);
+       pr_uuid(&name, c->sb.user_uuid.b);
+       strscpy(c->name, name.buf, sizeof(c->name));
+       printbuf_exit(&name);
+
+       ret = name.allocation_failure ? -ENOMEM : 0;
+       if (ret)
+               goto err;
 
        /* Compat: */
        if (sb->version <= bcachefs_metadata_version_inode_v2 &&
@@ -812,7 +783,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                goto err;
        }
 
-       ret = bch2_io_clock_init(&c->io_clock[READ]) ?:
+       ret = bch2_fs_counters_init(c) ?:
+           bch2_io_clock_init(&c->io_clock[READ]) ?:
            bch2_io_clock_init(&c->io_clock[WRITE]) ?:
            bch2_fs_journal_init(&c->journal) ?:
            bch2_fs_replicas_init(c) ?:
@@ -820,7 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
            bch2_fs_btree_iter_init(c) ?:
            bch2_fs_btree_interior_update_init(c) ?:
-           bch2_fs_buckets_waiting_for_journal_init(c);
+           bch2_fs_buckets_waiting_for_journal_init(c) ?:
            bch2_fs_subvolumes_init(c) ?:
            bch2_fs_io_init(c) ?:
            bch2_fs_encryption_init(c) ?:
@@ -830,9 +802,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        if (ret)
                goto err;
 
-       if (c->opts.nochanges)
-               set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
-
        mi = bch2_sb_get_members(c->disk_sb.sb);
        for (i = 0; i < c->sb.nr_devices; i++)
                if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -868,14 +837,11 @@ noinline_for_stack
 static void print_mount_opts(struct bch_fs *c)
 {
        enum bch_opt_id i;
-       char buf[512];
-       struct printbuf p = PBUF(buf);
+       struct printbuf p = PRINTBUF;
        bool first = true;
 
-       strcpy(buf, "(null)");
-
        if (c->opts.read_only) {
-               pr_buf(&p, "ro");
+               prt_printf(&p, "ro");
                first = false;
        }
 
@@ -890,12 +856,16 @@ static void print_mount_opts(struct bch_fs *c)
                        continue;
 
                if (!first)
-                       pr_buf(&p, ",");
+                       prt_printf(&p, ",");
                first = false;
-               bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+               bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
        }
 
-       bch_info(c, "mounted with opts: %s", buf);
+       if (!p.pos)
+               prt_printf(&p, "(null)");
+
+       bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf);
+       printbuf_exit(&p);
 }
 
 int bch2_fs_start(struct bch_fs *c)
@@ -925,6 +895,12 @@ int bch2_fs_start(struct bch_fs *c)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
+       for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
+               mutex_lock(&c->btree_transaction_stats[i].lock);
+               bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
+               mutex_unlock(&c->btree_transaction_stats[i].lock);
+       }
+
        ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
                ? bch2_fs_recovery(c)
                : bch2_fs_initialize(c);
@@ -943,20 +919,6 @@ int bch2_fs_start(struct bch_fs *c)
 
        set_bit(BCH_FS_STARTED, &c->flags);
 
-       /*
-        * Allocator threads don't start filling copygc reserve until after we
-        * set BCH_FS_STARTED - wake them now:
-        *
-        * XXX ugly hack:
-        * Need to set ca->allocator_state here instead of relying on the
-        * allocator threads to do it to avoid racing with the copygc threads
-        * checking it and thinking they have no alloc reserve:
-        */
-       for_each_online_member(ca, c, i) {
-               ca->allocator_state = ALLOCATOR_running;
-               bch2_wake_allocator(ca);
-       }
-
        if (c->opts.read_only || c->opts.nochanges) {
                bch2_fs_read_only(c);
        } else {
@@ -973,31 +935,10 @@ out:
        up_write(&c->state_lock);
        return ret;
 err:
-       switch (ret) {
-       case BCH_FSCK_ERRORS_NOT_FIXED:
-               bch_err(c, "filesystem contains errors: please report this to the developers");
-               pr_cont("mount with -o fix_errors to repair\n");
-               break;
-       case BCH_FSCK_REPAIR_UNIMPLEMENTED:
-               bch_err(c, "filesystem contains errors: please report this to the developers");
-               pr_cont("repair unimplemented: inform the developers so that it can be added\n");
-               break;
-       case BCH_FSCK_REPAIR_IMPOSSIBLE:
-               bch_err(c, "filesystem contains errors, but repair impossible");
-               break;
-       case BCH_FSCK_UNKNOWN_VERSION:
-               bch_err(c, "unknown metadata version");
-               break;
-       case -ENOMEM:
-               bch_err(c, "cannot allocate memory");
-               break;
-       case -EIO:
-               bch_err(c, "IO error");
-               break;
-       }
+       bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
 
-       if (ret >= 0)
-               ret = -EIO;
+       if (ret < -BCH_ERR_START)
+               ret = -EINVAL;
        goto out;
 }
 
@@ -1048,8 +989,6 @@ static void bch2_dev_release(struct kobject *kobj)
 
 static void bch2_dev_free(struct bch_dev *ca)
 {
-       bch2_dev_allocator_stop(ca);
-
        cancel_work_sync(&ca->io_error_work);
 
        if (ca->kobj.state_in_sysfs &&
@@ -1164,8 +1103,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
        ca->mi = bch2_mi_to_cpu(member);
        ca->uuid = member->uuid;
 
-       if (opt_defined(c->opts, discard))
-               ca->mi.discard = opt_get(c->opts, discard);
+       ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+                            ca->mi.bucket_size / btree_sectors(c));
 
        if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
                            0, GFP_KERNEL) ||
@@ -1216,12 +1155,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 
        ca->fs = c;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_rw &&
-           bch2_dev_allocator_start(ca)) {
-               bch2_dev_free(ca);
-               goto err;
-       }
-
        bch2_dev_attach(c, ca, dev_idx);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
@@ -1297,8 +1230,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
        bch2_dev_sysfs_online(c, ca);
 
        if (c->sb.nr_devices == 1)
-               bdevname(ca->disk_sb.bdev, c->name);
-       bdevname(ca->disk_sb.bdev, ca->name);
+               snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
+       snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
 
        rebalance_wakeup(c);
        return 0;
@@ -1398,23 +1331,14 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 {
-       /*
-        * Device going read only means the copygc reserve get smaller, so we
-        * don't want that happening while copygc is in progress:
-        */
-       bch2_copygc_stop(c);
-
        /*
         * The allocator thread itself allocates btree nodes, so stop it first:
         */
-       bch2_dev_allocator_stop(ca);
        bch2_dev_allocator_remove(c, ca);
        bch2_dev_journal_stop(&c->journal, ca);
-
-       bch2_copygc_start(c);
 }
 
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 {
        lockdep_assert_held(&c->state_lock);
 
@@ -1422,8 +1346,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 
        bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
-
-       return bch2_dev_allocator_start(ca);
 }
 
 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1450,7 +1372,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        mutex_unlock(&c->sb_lock);
 
        if (new_state == BCH_MEMBER_STATE_rw)
-               ret = __bch2_dev_read_write(c, ca);
+               __bch2_dev_read_write(c, ca);
 
        rebalance_wakeup(c);
 
@@ -1473,30 +1395,28 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 
 static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct btree_trans trans;
-       size_t i;
+       struct bpos start       = POS(ca->dev_idx, 0);
+       struct bpos end         = POS(ca->dev_idx, U64_MAX);
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for (i = 0; i < ca->mi.nbuckets; i++) {
-               ret = lockrestart_do(&trans,
-                       bch2_btree_key_cache_flush(&trans,
-                               BTREE_ID_alloc, POS(ca->dev_idx, i)));
-               if (ret)
-                       break;
-       }
-       bch2_trans_exit(&trans);
-
-       if (ret) {
-               bch_err(c, "error %i removing dev alloc info", ret);
-               return ret;
-       }
+       /*
+        * We clear the LRU and need_discard btrees first so that we don't race
+        * with bch2_do_invalidates() and bch2_do_discards()
+        */
+       ret =   bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL);
+       if (ret)
+               bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
 
-       return bch2_btree_delete_range(c, BTREE_ID_alloc,
-                                      POS(ca->dev_idx, 0),
-                                      POS(ca->dev_idx + 1, 0),
-                                      0, NULL);
+       return ret;
 }
 
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
@@ -1522,32 +1442,23 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
        ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
        if (ret) {
-               bch_err(ca, "Remove failed: error %i dropping data", ret);
+               bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
                goto err;
        }
 
-       ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+       ret = bch2_dev_remove_alloc(c, ca);
        if (ret) {
-               bch_err(ca, "Remove failed: error %i flushing journal", ret);
+               bch_err(ca, "Remove failed, error deleting alloc info");
                goto err;
        }
 
-       ret = bch2_dev_remove_alloc(c, ca);
+       ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
        if (ret) {
-               bch_err(ca, "Remove failed, error deleting alloc info");
+               bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
                goto err;
        }
 
-       /*
-        * must flush all existing journal entries, they might have
-        * (overwritten) keys that point to the device we're removing:
-        */
-       bch2_journal_flush_all_pins(&c->journal);
-       /*
-        * hack to ensure bch2_replicas_gc2() clears out entries to this device
-        */
-       bch2_journal_meta(&c->journal);
-       ret = bch2_journal_error(&c->journal);
+       ret = bch2_journal_flush(&c->journal);
        if (ret) {
                bch_err(ca, "Remove failed, journal error");
                goto err;
@@ -1555,17 +1466,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
        ret = bch2_replicas_gc2(c);
        if (ret) {
-               bch_err(ca, "Remove failed: error %i from replicas gc", ret);
+               bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
                goto err;
        }
 
        data = bch2_dev_has_data(c, ca);
        if (data) {
-               char data_has_str[100];
+               struct printbuf data_has = PRINTBUF;
 
-               bch2_flags_to_text(&PBUF(data_has_str),
-                                  bch2_data_types, data);
-               bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+               prt_bitflags(&data_has, bch2_data_types, data);
+               bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+               printbuf_exit(&data_has);
                ret = -EBUSY;
                goto err;
        }
@@ -1614,24 +1525,26 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        struct bch_sb_field_members *mi;
        struct bch_member dev_mi;
        unsigned dev_idx, nr_devices, u64s;
-       char *_errbuf;
-       struct printbuf errbuf;
+       struct printbuf errbuf = PRINTBUF;
+       struct printbuf label = PRINTBUF;
        int ret;
 
-       _errbuf = kmalloc(4096, GFP_KERNEL);
-       if (!_errbuf)
-               return -ENOMEM;
-
-       errbuf = _PBUF(_errbuf, 4096);
-
        ret = bch2_read_super(path, &opts, &sb);
        if (ret) {
-               bch_err(c, "device add error: error reading super: %i", ret);
+               bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
                goto err;
        }
 
        dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
 
+       if (BCH_MEMBER_GROUP(&dev_mi)) {
+               bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+               if (label.allocation_failure) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+       }
+
        err = bch2_dev_may_add(sb.sb, c);
        if (err) {
                bch_err(c, "device add error: %s", err);
@@ -1646,6 +1559,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
                goto err;
        }
 
+       bch2_dev_usage_init(ca);
+
        ret = __bch2_dev_attach_bdev(ca, &sb);
        if (ret) {
                bch2_dev_free(ca);
@@ -1673,7 +1588,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
                                le32_to_cpu(mi->field.u64s) +
                                sizeof(dev_mi) / sizeof(u64))) {
                bch_err(c, "device add error: new device superblock too small");
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_sb_members;
                goto err_unlock;
        }
 
@@ -1686,7 +1601,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
                        goto have_slot;
 no_slot:
        bch_err(c, "device add error: already have maximum number of devices");
-       ret = -ENOSPC;
+       ret = -BCH_ERR_ENOSPC_sb_members;
        goto err_unlock;
 
 have_slot:
@@ -1697,7 +1612,7 @@ have_slot:
        mi = bch2_sb_resize_members(&c->disk_sb, u64s);
        if (!mi) {
                bch_err(c, "device add error: no room in superblock for member info");
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_sb_members;
                goto err_unlock;
        }
 
@@ -1710,6 +1625,14 @@ have_slot:
        ca->disk_sb.sb->dev_idx = dev_idx;
        bch2_dev_attach(c, ca, dev_idx);
 
+       if (BCH_MEMBER_GROUP(&dev_mi)) {
+               ret = __bch2_dev_group_set(c, ca, label.buf);
+               if (ret) {
+                       bch_err(c, "device add error: error setting label");
+                       goto err_unlock;
+               }
+       }
+
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
@@ -1717,19 +1640,20 @@ have_slot:
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err(c, "device add error: error marking new superblock: %i", ret);
+               bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+               goto err_late;
+       }
+
+       ret = bch2_fs_freespace_init(c);
+       if (ret) {
+               bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
                goto err_late;
        }
 
        ca->new_fs_bucket_idx = 0;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-               ret = __bch2_dev_read_write(c, ca);
-               if (ret) {
-                       bch_err(c, "device add error: error going RW on new device: %i", ret);
-                       goto err_late;
-               }
-       }
+       if (ca->mi.state == BCH_MEMBER_STATE_rw)
+               __bch2_dev_read_write(c, ca);
 
        up_write(&c->state_lock);
        return 0;
@@ -1741,7 +1665,8 @@ err:
        if (ca)
                bch2_dev_free(ca);
        bch2_free_super(&sb);
-       kfree(_errbuf);
+       printbuf_exit(&label);
+       printbuf_exit(&errbuf);
        return ret;
 err_late:
        up_write(&c->state_lock);
@@ -1784,16 +1709,13 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb",
-                       path, ret);
+               bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
+                       path, bch2_err_str(ret));
                goto err;
        }
 
-       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-               ret = __bch2_dev_read_write(c, ca);
-               if (ret)
-                       goto err;
-       }
+       if (ca->mi.state == BCH_MEMBER_STATE_rw)
+               __bch2_dev_read_write(c, ca);
 
        mutex_lock(&c->sb_lock);
        mi = bch2_sb_get_members(c->disk_sb.sb);
@@ -1857,14 +1779,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        ret = bch2_dev_buckets_resize(c, ca, nbuckets);
        if (ret) {
-               bch_err(ca, "Resize error: %i", ret);
+               bch_err(ca, "Resize error: %s", bch2_err_str(ret));
                goto err;
        }
 
        ret = bch2_trans_mark_dev_sb(c, ca);
-       if (ret) {
+       if (ret)
                goto err;
-       }
 
        mutex_lock(&c->sb_lock);
        mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
@@ -1906,8 +1827,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        struct bch_sb_field_members *mi;
        unsigned i, best_sb = 0;
        const char *err;
-       char *_errbuf = NULL;
-       struct printbuf errbuf;
+       struct printbuf errbuf = PRINTBUF;
        int ret = 0;
 
        if (!try_module_get(THIS_MODULE))
@@ -1920,14 +1840,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
                goto err;
        }
 
-       _errbuf = kmalloc(4096, GFP_KERNEL);
-       if (!_errbuf) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       errbuf = _PBUF(_errbuf, 4096);
-
        sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
        if (!sb) {
                ret = -ENOMEM;
@@ -1952,9 +1864,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        while (i < nr_devices) {
                if (i != best_sb &&
                    !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
-                       char buf[BDEVNAME_SIZE];
-                       pr_info("%s has been removed, skipping",
-                               bdevname(sb[i].bdev, buf));
+                       pr_info("%pg has been removed, skipping", sb[i].bdev);
                        bch2_free_super(&sb[i]);
                        array_remove_item(sb, nr_devices, i);
                        continue;
@@ -1993,7 +1903,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        }
 out:
        kfree(sb);
-       kfree(_errbuf);
+       printbuf_exit(&errbuf);
        module_put(THIS_MODULE);
        pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
        return c;
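
Several hunks above replace raw errno printing ("error %i") with the errcode.h convention added in this release: internal paths return specific private BCH_ERR_* codes, log messages format them with bch2_err_str(), and values crossing the user-visible boundary are mapped back to standard errnos with bch2_err_class(). A hedged sketch of that convention — the helper names are hypothetical and not taken from the patch:

static int example_resize_members(struct bch_fs *c, unsigned u64s)
{
	/* return a specific private code instead of a bare -ENOSPC: */
	if (!bch2_sb_resize_members(&c->disk_sb, u64s))
		return -BCH_ERR_ENOSPC_sb_members;
	return 0;
}

static int example_user_facing_path(struct bch_fs *c, unsigned u64s)
{
	int ret = example_resize_members(c, u64s);

	if (ret)
		/* log the symbolic error name rather than a raw integer: */
		bch_err(c, "device add error: %s", bch2_err_str(ret));

	/* map private codes back to standard errnos at the boundary: */
	return bch2_err_class(ret);
}
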
index 3f24ca5a853d2144b25babf1b66352418a566231..8501adaff4c2f3286e1f42755732de93366e5817 100644 (file)
@@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
        return remainder;
 }
 
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+                                                u32 *offset)
+{
+       return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
 static inline bool bch2_dev_is_online(struct bch_dev *ca)
 {
        return !percpu_ref_is_zero(&ca->io_ref);
@@ -83,7 +89,7 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
                                         unsigned dev)
 {
        BUG_ON(bch2_dev_list_has_dev(*devs, dev));
-       BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+       BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
        devs->devs[devs->nr++] = dev;
 }
 
index d8b159a5b7f78ccacdd87236ffc32224bdc1baee..89419fc7930d004f5b68cc80a53630ac625003d3 100644 (file)
@@ -32,6 +32,7 @@ struct bch_member_cpu {
        u8                      discard;
        u8                      data_allowed;
        u8                      durability;
+       u8                      freespace_initialized;
        u8                      valid;
 };
 
index b727845dd64b73d4ea51fe8842c9f03ca899ad23..0f45aef78477326b1f10dd1aeae4fabe385097de 100644 (file)
 #include "tests.h"
 
 #include <linux/blkdev.h>
+#include <linux/pretty-printers.h>
 #include <linux/sort.h>
 #include <linux/sched/clock.h>
 
 #include "util.h"
 
 #define SYSFS_OPS(type)                                                        \
-struct sysfs_ops type ## _sysfs_ops = {                                        \
+const struct sysfs_ops type ## _sysfs_ops = {                          \
        .show   = type ## _show,                                        \
        .store  = type ## _store                                        \
 }
 
 #define SHOW(fn)                                                       \
+static ssize_t fn ## _to_text(struct printbuf *,                       \
+                             struct kobject *, struct attribute *);    \
+                                                                       \
 static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
                           char *buf)                                   \
+{                                                                      \
+       struct printbuf out = PRINTBUF;                                 \
+       ssize_t ret = fn ## _to_text(&out, kobj, attr);                 \
+                                                                       \
+       if (out.pos && out.buf[out.pos - 1] != '\n')                    \
+               prt_newline(&out);                                      \
+                                                                       \
+       if (!ret && out.allocation_failure)                             \
+               ret = -ENOMEM;                                          \
+                                                                       \
+       if (!ret) {                                                     \
+               ret = min_t(size_t, out.pos, PAGE_SIZE - 1);            \
+               memcpy(buf, out.buf, ret);                              \
+       }                                                               \
+       printbuf_exit(&out);                                            \
+       return bch2_err_class(ret);                                     \
+}                                                                      \
+                                                                       \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+                             struct attribute *attr)
 
 #define STORE(fn)                                                      \
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+                           const char *, size_t);                      \
+                                                                       \
 static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
                            const char *buf, size_t size)               \
+{                                                                      \
+       return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
+}                                                                      \
+                                                                       \
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+                                 const char *buf, size_t size)
 
 #define __sysfs_attribute(_name, _mode)                                        \
        static struct attribute sysfs_##_name =                         \
@@ -64,22 +97,19 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
 #define sysfs_printf(file, fmt, ...)                                   \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
-               return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+               prt_printf(out, fmt "\n", __VA_ARGS__);                 \
 } while (0)
 
 #define sysfs_print(file, var)                                         \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
-               return snprint(buf, PAGE_SIZE, var);                    \
+               snprint(out, var);                                      \
 } while (0)
 
 #define sysfs_hprint(file, val)                                                \
 do {                                                                   \
-       if (attr == &sysfs_ ## file) {                                  \
-               bch2_hprint(&out, val);                                 \
-               pr_buf(&out, "\n");                                     \
-               return out.pos - buf;                                   \
-       }                                                               \
+       if (attr == &sysfs_ ## file)                                    \
+               prt_human_readable_s64(out, val);                       \
 } while (0)
 
 #define var_printf(_var, fmt)  sysfs_printf(_var, fmt, var(_var))
@@ -133,7 +163,10 @@ do {                                                                       \
 } while (0)
 
 write_attribute(trigger_gc);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
 write_attribute(prune_cache);
+write_attribute(btree_wakeup);
 rw_attribute(btree_gc_periodic);
 rw_attribute(gc_gens_pos);
 
@@ -142,7 +175,7 @@ read_attribute(minor);
 read_attribute(bucket_size);
 read_attribute(first_bucket);
 read_attribute(nbuckets);
-read_attribute(durability);
+rw_attribute(durability);
 read_attribute(iodone);
 
 read_attribute(io_latency_read);
@@ -153,16 +186,12 @@ read_attribute(congested);
 
 read_attribute(btree_avg_write_size);
 
-read_attribute(reserve_stats);
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
 read_attribute(journal_debug);
-read_attribute(journal_pins);
 read_attribute(btree_updates);
-read_attribute(dirty_btree_nodes);
 read_attribute(btree_cache);
 read_attribute(btree_key_cache);
-read_attribute(btree_transactions);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
 
@@ -170,11 +199,10 @@ read_attribute(internal_uuid);
 
 read_attribute(has_data);
 read_attribute(alloc_debug);
-write_attribute(wake_allocator);
 
-read_attribute(read_realloc_races);
-read_attribute(extent_migrate_done);
-read_attribute(extent_migrate_raced);
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
 
 rw_attribute(discard);
 rw_attribute(label);
@@ -237,12 +265,12 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->data_progress_lock);
        list_for_each_entry(stats, &c->data_progress_list, list) {
-               pr_buf(out, "%s: data type %s btree_id %s position: ",
+               prt_printf(out, "%s: data type %s btree_id %s position: ",
                       stats->name,
                       bch2_data_types[stats->data_type],
                       bch2_btree_ids[stats->btree_id]);
                bch2_bpos_to_text(out, stats->pos);
-               pr_buf(out, "%s", "\n");
+               prt_printf(out, "%s", "\n");
        }
 
        mutex_unlock(&c->data_progress_lock);
@@ -270,7 +298,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
        bch2_trans_init(&trans, c, 0, 0);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
-               if (!((1U << id) & BTREE_ID_HAS_PTRS))
+               if (!btree_type_has_ptrs(id))
                        continue;
 
                for_each_btree_key(&trans, iter, id, POS_MIN,
@@ -315,40 +343,54 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
        if (ret)
                return ret;
 
-       pr_buf(out, "uncompressed:\n");
-       pr_buf(out, "   nr extents:             %llu\n", nr_uncompressed_extents);
-       pr_buf(out, "   size:                   ");
-       bch2_hprint(out, uncompressed_sectors << 9);
-       pr_buf(out, "\n");
-
-       pr_buf(out, "compressed:\n");
-       pr_buf(out, "   nr extents:             %llu\n", nr_compressed_extents);
-       pr_buf(out, "   compressed size:        ");
-       bch2_hprint(out, compressed_sectors_compressed << 9);
-       pr_buf(out, "\n");
-       pr_buf(out, "   uncompressed size:      ");
-       bch2_hprint(out, compressed_sectors_uncompressed << 9);
-       pr_buf(out, "\n");
-
-       pr_buf(out, "incompressible:\n");
-       pr_buf(out, "   nr extents:             %llu\n", nr_incompressible_extents);
-       pr_buf(out, "   size:                   ");
-       bch2_hprint(out, incompressible_sectors << 9);
-       pr_buf(out, "\n");
+       prt_printf(out, "uncompressed:\n");
+       prt_printf(out, "       nr extents:             %llu\n", nr_uncompressed_extents);
+       prt_printf(out, "       size:                   ");
+       prt_human_readable_u64(out, uncompressed_sectors << 9);
+       prt_printf(out, "\n");
+
+       prt_printf(out, "compressed:\n");
+       prt_printf(out, "       nr extents:             %llu\n", nr_compressed_extents);
+       prt_printf(out, "       compressed size:        ");
+       prt_human_readable_u64(out, compressed_sectors_compressed << 9);
+       prt_printf(out, "\n");
+       prt_printf(out, "       uncompressed size:      ");
+       prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
+       prt_printf(out, "\n");
+
+       prt_printf(out, "incompressible:\n");
+       prt_printf(out, "       nr extents:             %llu\n", nr_incompressible_extents);
+       prt_printf(out, "       size:                   ");
+       prt_human_readable_u64(out, incompressible_sectors << 9);
+       prt_printf(out, "\n");
        return 0;
 }
 
 static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
 {
-       pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+       prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
        bch2_bpos_to_text(out, c->gc_gens_pos);
-       pr_buf(out, "\n");
+       prt_printf(out, "\n");
+}
+
+static void bch2_btree_wakeup_all(struct bch_fs *c)
+{
+       struct btree_trans *trans;
+
+       mutex_lock(&c->btree_trans_lock);
+       list_for_each_entry(trans, &c->btree_trans_list, list) {
+               struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
+
+               if (b)
+                       six_lock_wakeup_all(&b->lock);
+
+       }
+       mutex_unlock(&c->btree_trans_lock);
 }
 
 SHOW(bch2_fs)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-       struct printbuf out = _PBUF(buf, PAGE_SIZE);
 
        sysfs_print(minor,                      c->minor);
        sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
@@ -356,19 +398,10 @@ SHOW(bch2_fs)
        sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
        sysfs_hprint(btree_avg_write_size,      bch2_btree_avg_write_size(c));
 
-       sysfs_print(read_realloc_races,
-                   atomic_long_read(&c->read_realloc_races));
-       sysfs_print(extent_migrate_done,
-                   atomic_long_read(&c->extent_migrate_done));
-       sysfs_print(extent_migrate_raced,
-                   atomic_long_read(&c->extent_migrate_raced));
-
        sysfs_printf(btree_gc_periodic, "%u",   (int) c->btree_gc_periodic);
 
-       if (attr == &sysfs_gc_gens_pos) {
-               bch2_gc_gens_pos_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_gc_gens_pos)
+               bch2_gc_gens_pos_to_text(out, c);
 
        sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
 
@@ -378,83 +411,45 @@ SHOW(bch2_fs)
                     max(0LL, c->copygc_wait -
                         atomic64_read(&c->io_clock[WRITE].now)) << 9);
 
-       if (attr == &sysfs_rebalance_work) {
-               bch2_rebalance_work_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_rebalance_work)
+               bch2_rebalance_work_to_text(out, c);
 
        sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 
        /* Debugging: */
 
-       if (attr == &sysfs_journal_debug) {
-               bch2_journal_debug_to_text(&out, &c->journal);
-               return out.pos - buf;
-       }
-
-       if (attr == &sysfs_journal_pins) {
-               bch2_journal_pins_to_text(&out, &c->journal);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_journal_debug)
+               bch2_journal_debug_to_text(out, &c->journal);
 
-       if (attr == &sysfs_btree_updates) {
-               bch2_btree_updates_to_text(&out, c);
-               return out.pos - buf;
-       }
-
-       if (attr == &sysfs_dirty_btree_nodes) {
-               bch2_dirty_btree_nodes_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_btree_updates)
+               bch2_btree_updates_to_text(out, c);
 
-       if (attr == &sysfs_btree_cache) {
-               bch2_btree_cache_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_btree_cache)
+               bch2_btree_cache_to_text(out, &c->btree_cache);
 
-       if (attr == &sysfs_btree_key_cache) {
-               bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_btree_key_cache)
+               bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
 
-       if (attr == &sysfs_btree_transactions) {
-               bch2_btree_trans_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_stripes_heap)
+               bch2_stripes_heap_to_text(out, c);
 
-       if (attr == &sysfs_stripes_heap) {
-               bch2_stripes_heap_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_open_buckets)
+               bch2_open_buckets_to_text(out, c);
 
-       if (attr == &sysfs_open_buckets) {
-               bch2_open_buckets_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_compression_stats)
+               bch2_compression_stats_to_text(out, c);
 
-       if (attr == &sysfs_compression_stats) {
-               bch2_compression_stats_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_new_stripes)
+               bch2_new_stripes_to_text(out, c);
 
-       if (attr == &sysfs_new_stripes) {
-               bch2_new_stripes_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_io_timers_read)
+               bch2_io_timers_to_text(out, &c->io_clock[READ]);
 
-       if (attr == &sysfs_io_timers_read) {
-               bch2_io_timers_to_text(&out, &c->io_clock[READ]);
-               return out.pos - buf;
-       }
-       if (attr == &sysfs_io_timers_write) {
-               bch2_io_timers_to_text(&out, &c->io_clock[WRITE]);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_io_timers_write)
+               bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
 
-       if (attr == &sysfs_data_jobs) {
-               data_progress_to_text(&out, c);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_data_jobs)
+               data_progress_to_text(out, c);
 
        return 0;
 }
@@ -510,6 +505,9 @@ STORE(bch2_fs)
                c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
        }
 
+       if (attr == &sysfs_btree_wakeup)
+               bch2_btree_wakeup_all(c);
+
        if (attr == &sysfs_trigger_gc) {
                /*
                 * Full gc is currently incompatible with btree key cache:
@@ -523,6 +521,12 @@ STORE(bch2_fs)
 #endif
        }
 
+       if (attr == &sysfs_trigger_discards)
+               bch2_do_discards(c);
+
+       if (attr == &sysfs_trigger_invalidates)
+               bch2_do_invalidates(c);
+
 #ifdef CONFIG_BCACHEFS_TESTS
        if (attr == &sysfs_perf_test) {
                char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -562,12 +566,54 @@ struct attribute *bch2_fs_files[] = {
        NULL
 };
 
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+       u64 counter = 0;
+       u64 counter_since_mount = 0;
+
+       printbuf_tabstop_push(out, 32);
+
+       #define x(t, ...) \
+               if (attr == &sysfs_##t) {                                       \
+                       counter             = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+                       counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+                       prt_printf(out, "since mount:");                                \
+                       prt_tab(out);                                           \
+                       prt_human_readable_u64(out, counter_since_mount << 9);  \
+                       prt_newline(out);                                       \
+                                                                               \
+                       prt_printf(out, "since filesystem creation:");          \
+                       prt_tab(out);                                           \
+                       prt_human_readable_u64(out, counter << 9);              \
+                       prt_newline(out);                                       \
+               }
+       BCH_PERSISTENT_COUNTERS()
+       #undef x
+       return 0;
+}
+
+STORE(bch2_fs_counters) {
+       return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+       &sysfs_##t,
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       NULL
+};
 /* internal dir - just a wrapper */
 
 SHOW(bch2_fs_internal)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-       return bch2_fs_show(&c->kobj, attr, buf);
+       return bch2_fs_to_text(out, &c->kobj, attr);
 }
 
 STORE(bch2_fs_internal)
@@ -579,12 +625,9 @@ SYSFS_OPS(bch2_fs_internal);
 
 struct attribute *bch2_fs_internal_files[] = {
        &sysfs_journal_debug,
-       &sysfs_journal_pins,
        &sysfs_btree_updates,
-       &sysfs_dirty_btree_nodes,
        &sysfs_btree_cache,
        &sysfs_btree_key_cache,
-       &sysfs_btree_transactions,
        &sysfs_new_stripes,
        &sysfs_stripes_heap,
        &sysfs_open_buckets,
@@ -592,11 +635,10 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_io_timers_write,
 
        &sysfs_trigger_gc,
+       &sysfs_trigger_discards,
+       &sysfs_trigger_invalidates,
        &sysfs_prune_cache,
-
-       &sysfs_read_realloc_races,
-       &sysfs_extent_migrate_done,
-       &sysfs_extent_migrate_raced,
+       &sysfs_btree_wakeup,
 
        &sysfs_gc_gens_pos,
 
@@ -617,16 +659,15 @@ struct attribute *bch2_fs_internal_files[] = {
 
 SHOW(bch2_fs_opts_dir)
 {
-       struct printbuf out = _PBUF(buf, PAGE_SIZE);
        struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
        const struct bch_option *opt = container_of(attr, struct bch_option, attr);
        int id = opt - bch2_opt_table;
        u64 v = bch2_opt_get_by_id(&c->opts, id);
 
-       bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST);
-       pr_buf(&out, "\n");
+       bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+       prt_char(out, '\n');
 
-       return out.pos - buf;
+       return 0;
 }
 
 STORE(bch2_fs_opts_dir)
@@ -637,19 +678,28 @@ STORE(bch2_fs_opts_dir)
        char *tmp;
        u64 v;
 
+       /*
+        * We don't need to take c->writes for correctness, but it eliminates an
+        * unsightly error message in the dmesg log when we're RO:
+        */
+       if (unlikely(!percpu_ref_tryget_live(&c->writes)))
+               return -EROFS;
+
        tmp = kstrdup(buf, GFP_KERNEL);
-       if (!tmp)
-               return -ENOMEM;
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto err;
+       }
 
-       ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
+       ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
        kfree(tmp);
 
        if (ret < 0)
-               return ret;
+               goto err;
 
        ret = bch2_opt_check_may_set(c, id, v);
        if (ret < 0)
-               return ret;
+               goto err;
 
        bch2_opt_set_sb(c, opt, v);
        bch2_opt_set_by_id(&c->opts, id, v);
@@ -660,7 +710,10 @@ STORE(bch2_fs_opts_dir)
                rebalance_wakeup(c);
        }
 
-       return size;
+       ret = size;
+err:
+       percpu_ref_put(&c->writes);
+       return ret;
 }
 SYSFS_OPS(bch2_fs_opts_dir);
 
@@ -690,13 +743,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
 SHOW(bch2_fs_time_stats)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-       struct printbuf out = _PBUF(buf, PAGE_SIZE);
 
 #define x(name)                                                                \
-       if (attr == &sysfs_time_stat_##name) {                          \
-               bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
-               return out.pos - buf;                                   \
-       }
+       if (attr == &sysfs_time_stat_##name)                            \
+               bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
        BCH_TIME_STATS()
 #undef x
 
@@ -717,24 +767,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
        NULL
 };
 
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
-       enum alloc_reserve i;
-
-       spin_lock(&ca->fs->freelist_lock);
-
-       pr_buf(out, "free_inc:\t%zu\t%zu\n",
-              fifo_used(&ca->free_inc),
-              ca->free_inc.size);
-
-       for (i = 0; i < RESERVE_NR; i++)
-               pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
-                      fifo_used(&ca->free[i]),
-                      ca->free[i].size);
-
-       spin_unlock(&ca->fs->freelist_lock);
-}
-
 static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 {
        struct bch_fs *c = ca->fs;
@@ -746,23 +778,19 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
        for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
                nr[c->open_buckets[i].data_type]++;
 
-       pr_buf(out,
-              "\t\t buckets\t sectors      fragmented\n"
-              "capacity%16llu\n",
+       prt_printf(out,
+              "\t\t\t buckets\t sectors      fragmented\n"
+              "capacity\t%16llu\n",
               ca->mi.nbuckets - ca->mi.first_bucket);
 
-       for (i = 1; i < BCH_DATA_NR; i++)
-               pr_buf(out, "%-8s%16llu%16llu%16llu\n",
+       for (i = 0; i < BCH_DATA_NR; i++)
+               prt_printf(out, "%-16s%16llu%16llu%16llu\n",
                       bch2_data_types[i], stats.d[i].buckets,
                       stats.d[i].sectors, stats.d[i].fragmented);
 
-       pr_buf(out,
-              "ec\t%16llu\n"
-              "available%15llu\n"
+       prt_printf(out,
+              "ec\t\t%16llu\n"
               "\n"
-              "free_inc\t\t%zu/%zu\n"
-              "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
-              "free[RESERVE_NONE]\t%zu/%zu\n"
               "freelist_wait\t\t%s\n"
               "open buckets allocated\t%u\n"
               "open buckets this dev\t%u\n"
@@ -770,13 +798,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
               "open_buckets_wait\t%s\n"
               "open_buckets_btree\t%u\n"
               "open_buckets_user\t%u\n"
-              "btree reserve cache\t%u\n"
-              "thread state:\t\t%s\n",
+              "buckets_to_invalidate\t%llu\n"
+              "btree reserve cache\t%u\n",
               stats.buckets_ec,
-              __dev_buckets_available(ca, stats),
-              fifo_used(&ca->free_inc),                ca->free_inc.size,
-              fifo_used(&ca->free[RESERVE_MOVINGGC]),  ca->free[RESERVE_MOVINGGC].size,
-              fifo_used(&ca->free[RESERVE_NONE]),      ca->free[RESERVE_NONE].size,
               c->freelist_wait.list.first              ? "waiting" : "empty",
               OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
               ca->nr_open_buckets,
@@ -784,8 +808,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
               c->open_buckets_wait.list.first          ? "waiting" : "empty",
               nr[BCH_DATA_btree],
               nr[BCH_DATA_user],
-              c->btree_reserve_cache_nr,
-              bch2_allocator_states[ca->allocator_state]);
+              should_invalidate_buckets(ca, stats),
+              c->btree_reserve_cache_nr);
 }
 
 static const char * const bch2_rw[] = {
@@ -799,10 +823,10 @@ static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
        int rw, i;
 
        for (rw = 0; rw < 2; rw++) {
-               pr_buf(out, "%s:\n", bch2_rw[rw]);
+               prt_printf(out, "%s:\n", bch2_rw[rw]);
 
                for (i = 1; i < BCH_DATA_NR; i++)
-                       pr_buf(out, "%-12s:%12llu\n",
+                       prt_printf(out, "%-12s:%12llu\n",
                               bch2_data_types[i],
                               percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
        }
@@ -812,7 +836,6 @@ SHOW(bch2_dev)
 {
        struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
        struct bch_fs *c = ca->fs;
-       struct printbuf out = _PBUF(buf, PAGE_SIZE);
 
        sysfs_printf(uuid,              "%pU\n", ca->uuid.b);
 
@@ -825,58 +848,42 @@ SHOW(bch2_dev)
        if (attr == &sysfs_label) {
                if (ca->mi.group) {
                        mutex_lock(&c->sb_lock);
-                       bch2_disk_path_to_text(&out, &c->disk_sb,
+                       bch2_disk_path_to_text(out, c->disk_sb.sb,
                                               ca->mi.group - 1);
                        mutex_unlock(&c->sb_lock);
                }
 
-               pr_buf(&out, "\n");
-               return out.pos - buf;
+               prt_char(out, '\n');
        }
 
        if (attr == &sysfs_has_data) {
-               bch2_flags_to_text(&out, bch2_data_types,
-                                  bch2_dev_has_data(c, ca));
-               pr_buf(&out, "\n");
-               return out.pos - buf;
+               prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+               prt_char(out, '\n');
        }
 
        if (attr == &sysfs_state_rw) {
-               bch2_string_opt_to_text(&out, bch2_member_states,
-                                       ca->mi.state);
-               pr_buf(&out, "\n");
-               return out.pos - buf;
+               prt_string_option(out, bch2_member_states, ca->mi.state);
+               prt_char(out, '\n');
        }
 
-       if (attr == &sysfs_iodone) {
-               dev_iodone_to_text(&out, ca);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_iodone)
+               dev_iodone_to_text(out, ca);
 
        sysfs_print(io_latency_read,            atomic64_read(&ca->cur_latency[READ]));
        sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
 
-       if (attr == &sysfs_io_latency_stats_read) {
-               bch2_time_stats_to_text(&out, &ca->io_latency[READ]);
-               return out.pos - buf;
-       }
-       if (attr == &sysfs_io_latency_stats_write) {
-               bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_io_latency_stats_read)
+               bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+       if (attr == &sysfs_io_latency_stats_write)
+               bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
 
        sysfs_printf(congested,                 "%u%%",
                     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
                     * 100 / CONGESTED_MAX);
 
-       if (attr == &sysfs_reserve_stats) {
-               reserve_stats_to_text(&out, ca);
-               return out.pos - buf;
-       }
-       if (attr == &sysfs_alloc_debug) {
-               dev_alloc_debug_to_text(&out, ca);
-               return out.pos - buf;
-       }
+       if (attr == &sysfs_alloc_debug)
+               dev_alloc_debug_to_text(out, ca);
 
        return 0;
 }
@@ -900,6 +907,19 @@ STORE(bch2_dev)
                mutex_unlock(&c->sb_lock);
        }
 
+       if (attr == &sysfs_durability) {
+               u64 v = strtoul_or_return(buf);
+
+               mutex_lock(&c->sb_lock);
+               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+               if (v != BCH_MEMBER_DURABILITY(mi)) {
+                       SET_BCH_MEMBER_DURABILITY(mi, v + 1);
+                       bch2_write_super(c);
+               }
+               mutex_unlock(&c->sb_lock);
+       }
+
        if (attr == &sysfs_label) {
                char *tmp;
                int ret;
@@ -914,9 +934,6 @@ STORE(bch2_dev)
                        return ret;
        }
 
-       if (attr == &sysfs_wake_allocator)
-               bch2_wake_allocator(ca);
-
        return size;
 }
 SYSFS_OPS(bch2_dev);
@@ -942,11 +959,8 @@ struct attribute *bch2_dev_files[] = {
        &sysfs_io_latency_stats_write,
        &sysfs_congested,
 
-       &sysfs_reserve_stats,
-
        /* debug: */
        &sysfs_alloc_debug,
-       &sysfs_wake_allocator,
        NULL
 };
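
The hunks above convert every SHOW() handler in sysfs.c from formatting into a local _PBUF(buf, PAGE_SIZE) and returning out.pos - buf, to appending to a printbuf passed in as out and returning 0. Below is a minimal sketch of a handler after the conversion, assuming the SHOW() macro now supplies kobj, attr and out, and that the SYSFS_OPS wrapper copies the printbuf into the sysfs page; bch2_example, sysfs_example and example_to_text() are hypothetical names, not from the patch:

/*
 * Illustrative sketch only (not part of the patch): shape of a show
 * handler after the printbuf conversion above.
 */
SHOW(bch2_example)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

	if (attr == &sysfs_example)
		example_to_text(out, c);	/* append to *out... */

	return 0;	/* ...the sysfs op wrapper is assumed to emit *out */
}
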
 
diff --git a/libbcachefs/sysfs.h b/libbcachefs/sysfs.h
index 525fd05d91f7d003519e17a82e876e83157db9b3..222cd5062702cdd6a54335dcd058b95320867f8f 100644 (file)
@@ -10,28 +10,32 @@ struct attribute;
 struct sysfs_ops;
 
 extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
 extern struct attribute *bch2_fs_internal_files[];
 extern struct attribute *bch2_fs_opts_dir_files[];
 extern struct attribute *bch2_fs_time_stats_files[];
 extern struct attribute *bch2_dev_files[];
 
-extern struct sysfs_ops bch2_fs_sysfs_ops;
-extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern struct sysfs_ops bch2_dev_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
 
 int bch2_opts_create_sysfs_files(struct kobject *);
 
 #else
 
 static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
 static struct attribute *bch2_fs_internal_files[] = {};
 static struct attribute *bch2_fs_opts_dir_files[] = {};
 static struct attribute *bch2_fs_time_stats_files[] = {};
 static struct attribute *bch2_dev_files[] = {};
 
 static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
 static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
 static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
 static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index de84ce83497598a867cdaa9cb737ef5743f8ca59..d058861811189433886cb16b482e6589604f3025 100644 (file)
@@ -15,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c)
        int ret;
 
        ret = bch2_btree_delete_range(c, BTREE_ID_extents,
-                                     POS_MIN, SPOS_MAX,
-                                     BTREE_ITER_ALL_SNAPSHOTS,
+                                     SPOS(0, 0, U32_MAX), SPOS_MAX,
+                                     0,
                                      NULL);
        BUG_ON(ret);
 
        ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-                                     POS_MIN, SPOS_MAX,
-                                     BTREE_ITER_ALL_SNAPSHOTS,
-                                     NULL);
+                                     SPOS(0, 0, U32_MAX), SPOS_MAX,
+                                     0, NULL);
        BUG_ON(ret);
 }
 
@@ -43,29 +42,29 @@ static int test_delete(struct bch_fs *c, u64 nr)
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
                             BTREE_ITER_INTENT);
 
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
                bch2_trans_update(&trans, &iter, &k.k_i, 0));
        if (ret) {
-               bch_err(c, "update error in test_delete: %i", ret);
+               bch_err(c, "update error in test_delete: %s", bch2_err_str(ret));
                goto err;
        }
 
        pr_info("deleting once");
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
                bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
-               bch_err(c, "delete error (first) in test_delete: %i", ret);
+               bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret));
                goto err;
        }
 
        pr_info("deleting twice");
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
                bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
-               bch_err(c, "delete error (second) in test_delete: %i", ret);
+               bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret));
                goto err;
        }
 err:
@@ -89,22 +88,22 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
                             BTREE_ITER_INTENT);
 
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
                bch2_trans_update(&trans, &iter, &k.k_i, 0));
        if (ret) {
-               bch_err(c, "update error in test_delete_written: %i", ret);
+               bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret));
                goto err;
        }
 
        bch2_trans_unlock(&trans);
        bch2_journal_flush_all_pins(&c->journal);
 
-       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
                bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
-               bch_err(c, "delete error in test_delete_written: %i", ret);
+               bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret));
                goto err;
        }
 err:
@@ -137,7 +136,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
                ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
                                        NULL, NULL, 0);
                if (ret) {
-                       bch_err(c, "insert error in test_iterate: %i", ret);
+                       bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret));
                        goto err;
                }
        }
@@ -146,20 +145,30 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          SPOS(0, 0, U32_MAX), 0, k, ret) {
-               if (k.k->p.inode)
-                       break;
-
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+                                 SPOS(0, 0, U32_MAX), 0, k, ({
                BUG_ON(k.k->p.offset != i++);
+               0;
+       }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
 
        BUG_ON(i != nr);
 
        pr_info("iterating backwards");
 
-       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
-               BUG_ON(k.k->p.offset != --i);
+       ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+                                        SPOS(0, U64_MAX, U32_MAX), 0, k,
+               ({
+                       BUG_ON(k.k->p.offset != --i);
+                       0;
+               }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret));
+               goto err;
+       }
 
        BUG_ON(i);
 err:
@@ -193,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
                ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
                                        NULL, NULL, 0);
                if (ret) {
-                       bch_err(c, "insert error in test_iterate_extents: %i", ret);
+                       bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret));
                        goto err;
                }
        }
@@ -202,19 +211,31 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          SPOS(0, 0, U32_MAX), 0, k, ret) {
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents,
+                                 SPOS(0, 0, U32_MAX), 0, k, ({
                BUG_ON(bkey_start_offset(k.k) != i);
                i = k.k->p.offset;
+               0;
+       }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
 
        BUG_ON(i != nr);
 
        pr_info("iterating backwards");
 
-       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
-               BUG_ON(k.k->p.offset != i);
-               i = bkey_start_offset(k.k);
+       ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+                                        SPOS(0, U64_MAX, U32_MAX), 0, k,
+               ({
+                       BUG_ON(k.k->p.offset != i);
+                       i = bkey_start_offset(k.k);
+                       0;
+               }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
 
        BUG_ON(i);
@@ -248,7 +269,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
                ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
                                        NULL, NULL, 0);
                if (ret) {
-                       bch_err(c, "insert error in test_iterate_slots: %i", ret);
+                       bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret));
                        goto err;
                }
        }
@@ -257,15 +278,16 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          SPOS(0, 0, U32_MAX), 0, k, ret) {
-               if (k.k->p.inode)
-                       break;
-
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+                                 SPOS(0, 0, U32_MAX), 0, k, ({
                BUG_ON(k.k->p.offset != i);
                i += 2;
+               0;
+       }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(i != nr * 2);
 
@@ -273,17 +295,23 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          SPOS(0, 0, U32_MAX),
-                          BTREE_ITER_SLOTS, k, ret) {
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+                                 SPOS(0, 0, U32_MAX),
+                                 BTREE_ITER_SLOTS, k, ({
+               if (i >= nr * 2)
+                       break;
+
                BUG_ON(k.k->p.offset != i);
                BUG_ON(bkey_deleted(k.k) != (i & 1));
 
                i++;
-               if (i == nr * 2)
-                       break;
+               0;
+       }));
+       if (ret < 0) {
+               bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = 0;
 err:
        bch2_trans_exit(&trans);
        return ret;
@@ -314,7 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
                                        NULL, NULL, 0);
                if (ret) {
-                       bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+                       bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret));
                        goto err;
                }
        }
@@ -323,13 +351,17 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          SPOS(0, 0, U32_MAX), 0, k, ret) {
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents,
+                                 SPOS(0, 0, U32_MAX), 0, k, ({
                BUG_ON(bkey_start_offset(k.k) != i + 8);
                BUG_ON(k.k->size != 8);
                i += 16;
+               0;
+       }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(i != nr);
 
@@ -337,19 +369,23 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          SPOS(0, 0, U32_MAX),
-                          BTREE_ITER_SLOTS, k, ret) {
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents,
+                                SPOS(0, 0, U32_MAX),
+                                BTREE_ITER_SLOTS, k, ({
+               if (i == nr)
+                       break;
                BUG_ON(bkey_deleted(k.k) != !(i % 16));
 
                BUG_ON(bkey_start_offset(k.k) != i);
                BUG_ON(k.k->size != 8);
                i = k.k->p.offset;
-
-               if (i == nr)
-                       break;
+               0;
+       }));
+       if (ret) {
+               bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret));
+               goto err;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = 0;
 err:
        bch2_trans_exit(&trans);
        return 0;
@@ -369,10 +405,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
-       k = bch2_btree_iter_peek(&iter);
+       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
        BUG_ON(k.k);
 
-       k = bch2_btree_iter_peek(&iter);
+       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
        BUG_ON(k.k);
 
        bch2_trans_iter_exit(&trans, &iter);
@@ -390,10 +426,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                             SPOS(0, 0, U32_MAX), 0);
 
-       k = bch2_btree_iter_peek(&iter);
+       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
        BUG_ON(k.k);
 
-       k = bch2_btree_iter_peek(&iter);
+       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
        BUG_ON(k.k);
 
        bch2_trans_iter_exit(&trans, &iter);
@@ -420,7 +456,7 @@ static int insert_test_extent(struct bch_fs *c,
        ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
                                NULL, NULL, 0);
        if (ret)
-               bch_err(c, "insert error in insert_test_extent: %i", ret);
+               bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -483,7 +519,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
        bch2_trans_init(&trans, c, 0, 0);
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, snapid_lo), 0);
-       k = bch2_btree_iter_peek(&iter);
+       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
 
        BUG_ON(k.k->p.snapshot != U32_MAX);
 
@@ -519,7 +555,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
 
        ret = test_snapshot_filter(c, snapids[0], snapids[1]);
        if (ret) {
-               bch_err(c, "err %i from test_snapshot_filter", ret);
+               bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret));
                return ret;
        }
 
@@ -553,10 +589,10 @@ static int rand_insert(struct bch_fs *c, u64 nr)
                k.k.p.offset = test_rand();
                k.k.p.snapshot = U32_MAX;
 
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+               ret = commit_do(&trans, NULL, NULL, 0,
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
                if (ret) {
-                       bch_err(c, "error in rand_insert: %i", ret);
+                       bch_err(c, "error in rand_insert: %s", bch2_err_str(ret));
                        break;
                }
        }
@@ -582,7 +618,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
                        k[j].k.p.snapshot = U32_MAX;
                }
 
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+               ret = commit_do(&trans, NULL, NULL, 0,
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?:
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?:
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?:
@@ -592,7 +628,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?:
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i));
                if (ret) {
-                       bch_err(c, "error in rand_insert_multi: %i", ret);
+                       bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret));
                        break;
                }
        }
@@ -616,10 +652,10 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
        for (i = 0; i < nr; i++) {
                bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
 
-               k = bch2_btree_iter_peek(&iter);
+               lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
                ret = bkey_err(k);
                if (ret) {
-                       bch_err(c, "error in rand_lookup: %i", ret);
+                       bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret));
                        break;
                }
        }
@@ -641,8 +677,8 @@ static int rand_mixed_trans(struct btree_trans *trans,
 
        k = bch2_btree_iter_peek(iter);
        ret = bkey_err(k);
-       if (ret && ret != -EINTR)
-               bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret));
        if (ret)
                return ret;
 
@@ -669,10 +705,10 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
 
        for (i = 0; i < nr; i++) {
                rand = test_rand();
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+               ret = commit_do(&trans, NULL, NULL, 0,
                        rand_mixed_trans(&trans, &iter, &cookie, i, rand));
                if (ret) {
-                       bch_err(c, "update error in rand_mixed: %i", ret);
+                       bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret));
                        break;
                }
        }
@@ -690,7 +726,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
                             BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek(&iter);
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -715,10 +751,10 @@ static int rand_delete(struct bch_fs *c, u64 nr)
        for (i = 0; i < nr; i++) {
                struct bpos pos = SPOS(0, test_rand(), U32_MAX);
 
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+               ret = commit_do(&trans, NULL, NULL, 0,
                        __do_delete(&trans, pos));
                if (ret) {
-                       bch_err(c, "error in rand_delete: %i", ret);
+                       bch_err(c, "error in rand_delete: %s", bch2_err_str(ret));
                        break;
                }
        }
@@ -734,28 +770,23 @@ static int seq_insert(struct bch_fs *c, u64 nr)
        struct bkey_s_c k;
        struct bkey_i_cookie insert;
        int ret = 0;
-       u64 i = 0;
 
        bkey_cookie_init(&insert.k_i);
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX),
-                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               insert.k.p = iter.pos;
-
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_btree_iter_traverse(&iter) ?:
-                       bch2_trans_update(&trans, &iter, &insert.k_i, 0));
-               if (ret) {
-                       bch_err(c, "error in seq_insert: %i", ret);
-                       break;
-               }
-
-               if (++i == nr)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+                                       SPOS(0, 0, U32_MAX),
+                                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+                                       NULL, NULL, 0,
+               ({
+                       if (iter.pos.offset >= nr)
+                               break;
+                       insert.k.p = iter.pos;
+                       bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+               }));
+       if (ret)
+               bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
 
        bch2_trans_exit(&trans);
        return ret;
@@ -770,10 +801,11 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          SPOS(0, 0, U32_MAX), 0, k, ret)
-               ;
-       bch2_trans_iter_exit(&trans, &iter);
+       ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+                                 SPOS(0, 0, U32_MAX), 0, k,
+               0);
+       if (ret)
+               bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
 
        bch2_trans_exit(&trans);
        return ret;
@@ -788,22 +820,18 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          SPOS(0, 0, U32_MAX),
-                          BTREE_ITER_INTENT, k, ret) {
-               struct bkey_i_cookie u;
-
-               bkey_reassemble(&u.k_i, k);
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+                                       SPOS(0, 0, U32_MAX),
+                                       BTREE_ITER_INTENT, k,
+                                       NULL, NULL, 0,
+               ({
+                       struct bkey_i_cookie u;
 
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_btree_iter_traverse(&iter) ?:
-                       bch2_trans_update(&trans, &iter, &u.k_i, 0));
-               if (ret) {
-                       bch_err(c, "error in seq_overwrite: %i", ret);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+                       bkey_reassemble(&u.k_i, k);
+                       bch2_trans_update(&trans, &iter, &u.k_i, 0);
+               }));
+       if (ret)
+               bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
 
        bch2_trans_exit(&trans);
        return ret;
@@ -814,11 +842,10 @@ static int seq_delete(struct bch_fs *c, u64 nr)
        int ret;
 
        ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-                                     POS_MIN, SPOS_MAX,
-                                     BTREE_ITER_ALL_SNAPSHOTS,
-                                     NULL);
+                                     SPOS(0, 0, U32_MAX), SPOS_MAX,
+                                     0, NULL);
        if (ret)
-               bch_err(c, "error in seq_delete: %i", ret);
+               bch_err(c, "error in seq_delete: %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -855,7 +882,7 @@ static int btree_perf_test_thread(void *data)
 
        ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
        if (ret) {
-               bch_err(j->c, "%ps: error %i", j->fn, ret);
+               bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
                j->ret = ret;
        }
 
@@ -871,7 +898,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
                         u64 nr, unsigned nr_threads)
 {
        struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
-       char name_buf[20], nr_buf[20], per_sec_buf[20];
+       char name_buf[20];
+       struct printbuf nr_buf = PRINTBUF;
+       struct printbuf per_sec_buf = PRINTBUF;
        unsigned i;
        u64 time;
 
@@ -932,13 +961,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
        time = j.finish - j.start;
 
        scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
-       bch2_hprint(&PBUF(nr_buf), nr);
-       bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
+       prt_human_readable_u64(&nr_buf, nr);
+       prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
        printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
-               name_buf, nr_buf, nr_threads,
+               name_buf, nr_buf.buf, nr_threads,
                div_u64(time, NSEC_PER_SEC),
                div_u64(time * nr_threads, nr),
-               per_sec_buf);
+               per_sec_buf.buf);
+       printbuf_exit(&per_sec_buf);
+       printbuf_exit(&nr_buf);
        return j.ret;
 }
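
The tests.c changes above replace __bch2_trans_do() with commit_do(), turn open-coded for_each_btree_key() loops into for_each_btree_key2()/for_each_btree_key_commit() whose body is an expression that evaluates to 0 to continue, and report errors through bch2_err_str(). A minimal sketch of the commit_do() pattern as it is used above; example_update() and the cookie key are hypothetical, while the helpers are the ones appearing in the hunks:

/*
 * Illustrative fragment only (not part of the patch): the commit_do() +
 * bch2_err_str() error-reporting pattern from the converted tests.
 */
static int example_update(struct btree_trans *trans, struct btree_iter *iter,
			  struct bkey_i_cookie *k)
{
	int ret = commit_do(trans, NULL, NULL, 0,
		bch2_btree_iter_traverse(iter) ?:
		bch2_trans_update(trans, iter, &k->k_i, 0));
	if (ret)
		bch_err(trans->c, "error in %s(): %s", __func__, bch2_err_str(ret));
	return ret;
}
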
 
diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c
index 59e8dfa3d24520a2f57e5744efd3c4c456dab3a8..70573981b87dde70cd1267f2d0f539e5874c196a 100644 (file)
@@ -2,11 +2,13 @@
 #include "bcachefs.h"
 #include "alloc_types.h"
 #include "buckets.h"
-#include "btree_types.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
 #include "keylist.h"
+#include "opts.h"
 
 #include <linux/blktrace_api.h>
-#include "keylist.h"
+#include <linux/six.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bcachefs.h>
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 0bbea332fcaaf7cecc6ce62695fe0367c011bab2..62fa662019ad9dc73eb4985b06f6f47e37d0fae3 100644 (file)
@@ -8,6 +8,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/console.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/freezer.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>
 
 #include "eytzinger.h"
 #include "util.h"
 
 static const char si_units[] = "?kMGTPEZY";
 
-static int __bch2_strtoh(const char *cp, u64 *res,
-                        u64 t_max, bool t_signed)
+/* string_get_size units: */
+static const char *const units_2[] = {
+       "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+       "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+static int parse_u64(const char *cp, u64 *res)
 {
-       bool positive = *cp != '-';
-       unsigned u;
+       const char *start = cp;
        u64 v = 0;
 
-       if (*cp == '+' || *cp == '-')
-               cp++;
-
        if (!isdigit(*cp))
                return -EINVAL;
 
@@ -50,22 +55,122 @@ static int __bch2_strtoh(const char *cp, u64 *res,
                cp++;
        } while (isdigit(*cp));
 
+       *res = v;
+       return cp - start;
+}
+
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+       *res = 1;
+
+       while (p--) {
+               if (*res > div_u64(U64_MAX, n))
+                       return -ERANGE;
+               *res *= n;
+       }
+       return 0;
+}
+
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+       const char *start = cp;
+       u64 base = 1024;
+       unsigned u;
+       int ret;
+
+       if (*cp == ' ')
+               cp++;
+
        for (u = 1; u < strlen(si_units); u++)
                if (*cp == si_units[u]) {
                        cp++;
                        goto got_unit;
                }
-       u = 0;
+
+       for (u = 0; u < ARRAY_SIZE(units_2); u++)
+               if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+                       cp += strlen(units_2[u]);
+                       goto got_unit;
+               }
+
+       for (u = 0; u < ARRAY_SIZE(units_10); u++)
+               if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+                       cp += strlen(units_10[u]);
+                       base = 1000;
+                       goto got_unit;
+               }
+
+       *res = 1;
+       return 0;
 got_unit:
-       if (*cp == '\n')
+       ret = bch2_pow(base, u, res);
+       if (ret)
+               return ret;
+
+       return cp - start;
+}
+
+#define parse_or_ret(cp, _f)                   \
+do {                                           \
+       int ret = _f;                           \
+       if (ret < 0)                            \
+               return ret;                     \
+       cp += ret;                              \
+} while (0)
+
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+       const char *start = cp;
+       u64 v = 0, b, f_n = 0, f_d = 1;
+       int ret;
+
+       parse_or_ret(cp, parse_u64(cp, &v));
+
+       if (*cp == '.') {
                cp++;
-       if (*cp)
-               return -EINVAL;
+               ret = parse_u64(cp, &f_n);
+               if (ret < 0)
+                       return ret;
+               cp += ret;
+
+               ret = bch2_pow(10, ret, &f_d);
+               if (ret)
+                       return ret;
+       }
+
+       parse_or_ret(cp, parse_unit_suffix(cp, &b));
+
+       if (v > div_u64(U64_MAX, b))
+               return -ERANGE;
+       v *= b;
+
+       if (f_n > div_u64(U64_MAX, b))
+               return -ERANGE;
 
-       if (fls64(v) + u * 10 > 64)
+       f_n = div_u64(f_n * b, f_d);
+       if (v + f_n < v)
                return -ERANGE;
+       v += f_n;
 
-       v <<= u * 10;
+       *res = v;
+       return cp - start;
+}
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+                        u64 t_max, bool t_signed)
+{
+       bool positive = *cp != '-';
+       u64 v = 0;
+
+       if (*cp == '+' || *cp == '-')
+               cp++;
+
+       parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+       if (*cp == '\n')
+               cp++;
+       if (*cp)
+               return -EINVAL;
 
        if (positive) {
                if (v > t_max)
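
As a worked illustration of the parsing added in the hunk above (the input string and numbers are illustrative, not from the patch, and this assumes the unchanged tail of __bch2_strtoh() still stores the result and returns 0 on success):

	/*
	 * Worked example (illustrative): parsing "1.5G"
	 *   parse_u64()          -> v   = 1
	 *   fraction ".5"        -> f_n = 5, f_d = 10
	 *   parse_unit_suffix()  -> b   = 1024^3 = 1073741824  (SI 'G', base 1024)
	 *   v * b                       = 1073741824
	 *   + div_u64(f_n * b, f_d)     = 1073741824 + 536870912 = 1610612736
	 */
	u64 v = 0;
	int ret = bch2_strtou64_h("1.5G", &v);	/* expected: ret == 0, v == 1610612736 */
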
@@ -86,7 +191,7 @@ got_unit:
 #define STRTO_H(name, type)                                    \
 int bch2_ ## name ## _h(const char *cp, type *res)             \
 {                                                              \
-       u64 v;                                                  \
+       u64 v = 0;                                              \
        int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),      \
                        ANYSINT_MAX(type) != ((type) ~0ULL));   \
        *res = v;                                               \
@@ -99,58 +204,6 @@ STRTO_H(strtoll, long long)
 STRTO_H(strtoull, unsigned long long)
 STRTO_H(strtou64, u64)
 
-void bch2_hprint(struct printbuf *buf, s64 v)
-{
-       int u, t = 0;
-
-       for (u = 0; v >= 1024 || v <= -1024; u++) {
-               t = v & ~(~0U << 10);
-               v >>= 10;
-       }
-
-       pr_buf(buf, "%lli", v);
-
-       /*
-        * 103 is magic: t is in the range [-1023, 1023] and we want
-        * to turn it into [-9, 9]
-        */
-       if (u && t && v < 100 && v > -100)
-               pr_buf(buf, ".%i", t / 103);
-       if (u)
-               pr_buf(buf, "%c", si_units[u]);
-}
-
-void bch2_string_opt_to_text(struct printbuf *out,
-                            const char * const list[],
-                            size_t selected)
-{
-       size_t i;
-
-       for (i = 0; list[i]; i++)
-               pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_flags_to_text(struct printbuf *out,
-                       const char * const list[], u64 flags)
-{
-       unsigned bit, nr = 0;
-       bool first = true;
-
-       if (out->pos != out->end)
-               *out->pos = '\0';
-
-       while (list[nr])
-               nr++;
-
-       while (flags && (bit = __ffs(flags)) < nr) {
-               if (!first)
-                       pr_buf(out, ",");
-               first = false;
-               pr_buf(out, "%s", list[bit]);
-               flags ^= 1 << bit;
-       }
-}
-
 u64 bch2_read_flag_list(char *opt, const char * const list[])
 {
        u64 ret = 0;
@@ -217,45 +270,98 @@ static void bch2_quantiles_update(struct quantiles *q, u64 v)
        }
 }
 
-/* time stats: */
+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+{
+       while (nr_bits)
+               prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
 
-static void bch2_time_stats_update_one(struct time_stats *stats,
-                                      u64 start, u64 end)
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
 {
-       u64 duration, freq;
+       const char *p;
 
-       duration        = time_after64(end, start)
-               ? end - start : 0;
-       freq            = time_after64(end, stats->last_event)
-               ? end - stats->last_event : 0;
+       if (!lines) {
+               printk("%s (null)\n", prefix);
+               return;
+       }
 
-       stats->count++;
+       console_lock();
+       while (1) {
+               p = strchrnul(lines, '\n');
+               printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+               if (!*p)
+                       break;
+               lines = p + 1;
+               prefix = KERN_CONT;
+       }
+       console_unlock();
+}
 
-       stats->average_duration = stats->average_duration
-               ? ewma_add(stats->average_duration, duration, 6)
-               : duration;
+int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task)
+{
+       unsigned long entries[32];
+       unsigned i, nr_entries;
+       int ret;
+
+       ret = down_read_killable(&task->signal->exec_update_lock);
+       if (ret)
+               return ret;
+
+       nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
+       for (i = 0; i < nr_entries; i++) {
+               prt_printf(out, "[<0>] %pB", (void *)entries[i]);
+               prt_newline(out);
+       }
+
+       up_read(&task->signal->exec_update_lock);
+       return 0;
+}
 
-       stats->average_frequency = stats->average_frequency
-               ? ewma_add(stats->average_frequency, freq, 6)
-               : freq;
+/* time stats: */
 
-       stats->max_duration = max(stats->max_duration, duration);
+static void bch2_time_stats_update_one(struct time_stats *stats,
+                                      u64 start, u64 end)
+{
+       u64 duration, freq;
 
-       stats->last_event = end;
+       if (time_after64(end, start)) {
+               duration = end - start;
+               stats->duration_stats = mean_and_variance_update(stats->duration_stats,
+                                                                duration);
+               stats->duration_stats_weighted = mean_and_variance_weighted_update(
+                       stats->duration_stats_weighted,
+                       duration);
+               stats->max_duration = max(stats->max_duration, duration);
+               stats->min_duration = min(stats->min_duration, duration);
+               bch2_quantiles_update(&stats->quantiles, duration);
+       }
 
-       bch2_quantiles_update(&stats->quantiles, duration);
+       if (time_after64(end, stats->last_event)) {
+               freq = end - stats->last_event;
+               stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq);
+               stats->freq_stats_weighted = mean_and_variance_weighted_update(
+                       stats->freq_stats_weighted,
+                       freq);
+               stats->max_freq = max(stats->max_freq, freq);
+               stats->min_freq = min(stats->min_freq, freq);
+               stats->last_event = end;
+       }
 }
 
 void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
 {
        unsigned long flags;
 
+       WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+                      "time_stats: min_duration = %llu, min_freq = %llu",
+                      stats->min_duration, stats->min_freq);
+
        if (!stats->buffer) {
                spin_lock_irqsave(&stats->lock, flags);
                bch2_time_stats_update_one(stats, start, end);
 
-               if (stats->average_frequency < 32 &&
-                   stats->count > 1024)
+               if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+                   stats->duration_stats.n > 1024)
                        stats->buffer =
                                alloc_percpu_gfp(struct time_stat_buffer,
                                                 GFP_ATOMIC);
@@ -290,12 +396,15 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
 
 static const struct time_unit {
        const char      *name;
-       u32             nsecs;
+       u64             nsecs;
 } time_units[] = {
-       { "ns",         1               },
-       { "us",         NSEC_PER_USEC   },
-       { "ms",         NSEC_PER_MSEC   },
-       { "sec",        NSEC_PER_SEC    },
+       { "ns",         1                },
+       { "us",         NSEC_PER_USEC    },
+       { "ms",         NSEC_PER_MSEC    },
+       { "s",          NSEC_PER_SEC     },
+       { "m",          NSEC_PER_SEC * 60},
+       { "h",          NSEC_PER_SEC * 3600},
+       { "eon",        U64_MAX          },
 };
 
 static const struct time_unit *pick_time_units(u64 ns)
@@ -315,41 +424,126 @@ static void pr_time_units(struct printbuf *out, u64 ns)
 {
        const struct time_unit *u = pick_time_units(ns);
 
-       pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+       prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+       prt_tab_rjust(out);
+       prt_printf(out, "%s", u->name);
+}
+
+#define TABSTOP_SIZE 12
+
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+       prt_str(out, name);
+       prt_tab(out);
+       pr_time_units(out, ns);
+       prt_newline(out);
 }
 
 void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 {
        const struct time_unit *u;
-       u64 freq = READ_ONCE(stats->average_frequency);
-       u64 q, last_q = 0;
+       s64 f_mean = 0, d_mean = 0;
+       u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
        int i;
+       /*
+        * avoid divide by zero
+        */
+       if (stats->freq_stats.n) {
+               f_mean = mean_and_variance_get_mean(stats->freq_stats);
+               f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+               d_mean = mean_and_variance_get_mean(stats->duration_stats);
+               d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+       }
 
-       pr_buf(out, "count:\t\t%llu\n",
-                        stats->count);
-       pr_buf(out, "rate:\t\t%llu/sec\n",
-              freq ?  div64_u64(NSEC_PER_SEC, freq) : 0);
-
-       pr_buf(out, "frequency:\t");
-       pr_time_units(out, freq);
-
-       pr_buf(out, "\navg duration:\t");
-       pr_time_units(out, stats->average_duration);
-
-       pr_buf(out, "\nmax duration:\t");
-       pr_time_units(out, stats->max_duration);
+       printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+       prt_printf(out, "count:");
+       prt_tab(out);
+       prt_printf(out, "%llu ",
+                        stats->duration_stats.n);
+       printbuf_tabstop_pop(out);
+       prt_newline(out);
+
+       printbuf_tabstops_reset(out);
+
+       printbuf_tabstop_push(out, out->indent + 20);
+       printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+       printbuf_tabstop_push(out, 0);
+       printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+       prt_tab(out);
+       prt_printf(out, "since mount");
+       prt_tab_rjust(out);
+       prt_tab(out);
+       prt_printf(out, "recent");
+       prt_tab_rjust(out);
+       prt_newline(out);
+
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, out->indent + 20);
+       printbuf_tabstop_push(out, TABSTOP_SIZE);
+       printbuf_tabstop_push(out, 2);
+       printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+       prt_printf(out, "duration of events");
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       pr_name_and_units(out, "min:", stats->min_duration);
+       pr_name_and_units(out, "max:", stats->max_duration);
+
+       prt_printf(out, "mean:");
+       prt_tab(out);
+       pr_time_units(out, d_mean);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+       prt_newline(out);
+
+       prt_printf(out, "stddev:");
+       prt_tab(out);
+       pr_time_units(out, d_stddev);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+       printbuf_indent_sub(out, 2);
+       prt_newline(out);
+
+       prt_printf(out, "time between events");
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       pr_name_and_units(out, "min:", stats->min_freq);
+       pr_name_and_units(out, "max:", stats->max_freq);
+
+       prt_printf(out, "mean:");
+       prt_tab(out);
+       pr_time_units(out, f_mean);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+       prt_newline(out);
+
+       prt_printf(out, "stddev:");
+       prt_tab(out);
+       pr_time_units(out, f_stddev);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+       printbuf_indent_sub(out, 2);
+       prt_newline(out);
+
+       printbuf_tabstops_reset(out);
 
        i = eytzinger0_first(NR_QUANTILES);
        u = pick_time_units(stats->quantiles.entries[i].m);
 
-       pr_buf(out, "\nquantiles (%s):\t", u->name);
+       prt_printf(out, "quantiles (%s):\t", u->name);
        eytzinger0_for_each(i, NR_QUANTILES) {
                bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
 
                q = max(stats->quantiles.entries[i].m, last_q);
-               pr_buf(out, "%llu%s",
-                      div_u64(q, u->nsecs),
-                      is_last ? "\n" : " ");
+               prt_printf(out, "%llu ",
+                      div_u64(q, u->nsecs));
+               if (is_last)
+                       prt_newline(out);
                last_q = q;
        }
 }
@@ -362,6 +556,10 @@ void bch2_time_stats_exit(struct time_stats *stats)
 void bch2_time_stats_init(struct time_stats *stats)
 {
        memset(stats, 0, sizeof(*stats));
+       stats->duration_stats_weighted.w = 8;
+       stats->freq_stats_weighted.w = 8;
+       stats->min_duration = U64_MAX;
+       stats->min_freq = U64_MAX;
        spin_lock_init(&stats->lock);
 }
 
@@ -467,36 +665,45 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd)
        pd->backpressure        = 1;
 }
 
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
 {
-       /* 2^64 - 1 is 20 digits, plus null byte */
-       char rate[21];
-       char actual[21];
-       char target[21];
-       char proportional[21];
-       char derivative[21];
-       char change[21];
-       s64 next_io;
-
-       bch2_hprint(&PBUF(rate),        pd->rate.rate);
-       bch2_hprint(&PBUF(actual),      pd->last_actual);
-       bch2_hprint(&PBUF(target),      pd->last_target);
-       bch2_hprint(&PBUF(proportional), pd->last_proportional);
-       bch2_hprint(&PBUF(derivative),  pd->last_derivative);
-       bch2_hprint(&PBUF(change),      pd->last_change);
-
-       next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
-
-       return sprintf(buf,
-                      "rate:\t\t%s/sec\n"
-                      "target:\t\t%s\n"
-                      "actual:\t\t%s\n"
-                      "proportional:\t%s\n"
-                      "derivative:\t%s\n"
-                      "change:\t\t%s/sec\n"
-                      "next io:\t%llims\n",
-                      rate, target, actual, proportional,
-                      derivative, change, next_io);
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 20);
+
+       prt_printf(out, "rate:");
+       prt_tab(out);
+       prt_human_readable_s64(out, pd->rate.rate);
+       prt_newline(out);
+
+       prt_printf(out, "target:");
+       prt_tab(out);
+       prt_human_readable_u64(out, pd->last_target);
+       prt_newline(out);
+
+       prt_printf(out, "actual:");
+       prt_tab(out);
+       prt_human_readable_u64(out, pd->last_actual);
+       prt_newline(out);
+
+       prt_printf(out, "proportional:");
+       prt_tab(out);
+       prt_human_readable_s64(out, pd->last_proportional);
+       prt_newline(out);
+
+       prt_printf(out, "derivative:");
+       prt_tab(out);
+       prt_human_readable_s64(out, pd->last_derivative);
+       prt_newline(out);
+
+       prt_printf(out, "change:");
+       prt_tab(out);
+       prt_human_readable_s64(out, pd->last_change);
+       prt_newline(out);
+
+       prt_printf(out, "next io:");
+       prt_tab(out);
+       prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+       prt_newline(out);
 }
 
 /* misc: */
@@ -579,21 +786,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
        }
 }
 
-void bch_scnmemcpy(struct printbuf *out,
-                  const char *src, size_t len)
-{
-       size_t n = printbuf_remaining(out);
-
-       if (n) {
-               n = min(n - 1, len);
-               memcpy(out->pos, src, n);
-               out->pos += n;
-               *out->pos = '\0';
-       }
-}
-
-#include "eytzinger.h"
-
 static int alignment_ok(const void *base, size_t align)
 {
        return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
index e55407dc53249d1a93106cd9e4b37c9df74b523e..846e6024a80b377c2eb81b92e2209d2166941123 100644 (file)
 #include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
+#include <linux/printbuf.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
+#include <linux/mean_and_variance.h>
 
 struct closure;
 
@@ -210,9 +212,11 @@ do {                                                                       \
                                                                        \
        BUG_ON(_i >= (h)->used);                                        \
        (h)->used--;                                                    \
-       heap_swap(h, _i, (h)->used, set_backpointer);                   \
-       heap_sift_up(h, _i, cmp, set_backpointer);                      \
-       heap_sift_down(h, _i, cmp, set_backpointer);                    \
+       if ((_i) < (h)->used) {                                         \
+               heap_swap(h, _i, (h)->used, set_backpointer);           \
+               heap_sift_up(h, _i, cmp, set_backpointer);              \
+               heap_sift_down(h, _i, cmp, set_backpointer);            \
+       }                                                               \
 } while (0)
 
 #define heap_pop(h, d, cmp, set_backpointer)                           \
@@ -235,54 +239,44 @@ do {                                                                      \
 #define ANYSINT_MAX(t)                                                 \
        ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
 
-struct printbuf {
-       char            *pos;
-       char            *end;
-       unsigned        indent;
-};
 
-static inline size_t printbuf_remaining(struct printbuf *buf)
+#ifdef __KERNEL__
+static inline void pr_time(struct printbuf *out, u64 time)
 {
-       return buf->end - buf->pos;
+       prt_printf(out, "%llu", time);
 }
-
-#define _PBUF(_buf, _len)                                              \
-       ((struct printbuf) {                                            \
-               .pos    = _buf,                                         \
-               .end    = _buf + _len,                                  \
-       })
-
-#define PBUF(_buf) _PBUF(_buf, sizeof(_buf))
-
-#define pr_buf(_out, ...)                                              \
-do {                                                                   \
-       (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \
-                                __VA_ARGS__);                          \
-} while (0)
-
-static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces)
-{
-       buf->indent += spaces;
-       while (spaces--)
-               pr_buf(buf, " ");
+#else
+#include <time.h>
+static inline void pr_time(struct printbuf *out, u64 _time)
+{
+       char time_str[64];
+       time_t time = _time;
+       struct tm *tm = localtime(&time);
+       size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
+       if (!err)
+               prt_printf(out, "(formatting error)");
+       else
+               prt_printf(out, "%s", time_str);
 }
+#endif
 
-static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces)
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
 {
-       buf->indent -= spaces;
+       sprintf(out, "%pUb", uuid);
 }
+#else
+#include <uuid/uuid.h>
+#endif
 
-static inline void printbuf_newline(struct printbuf *buf)
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
 {
-       unsigned i;
+       char uuid_str[40];
 
-       pr_buf(buf, "\n");
-       for (i = 0; i < buf->indent; i++)
-               pr_buf(buf, " ");
+       uuid_unparse_lower(uuid, uuid_str);
+       prt_printf(out, "%s", uuid_str);
 }
 
-void bch_scnmemcpy(struct printbuf *, const char *, size_t);
-
 int bch2_strtoint_h(const char *, int *);
 int bch2_strtouint_h(const char *, unsigned int *);
 int bch2_strtoll_h(const char *, long long *);
@@ -345,8 +339,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
        _r;                                                             \
 })
 
-#define snprint(buf, size, var)                                                \
-       snprintf(buf, size,                                             \
+#define snprint(out, var)                                              \
+       prt_printf(out,                                                 \
                   type_is(var, int)            ? "%i\n"                \
                 : type_is(var, unsigned)       ? "%u\n"                \
                 : type_is(var, long)           ? "%li\n"               \
@@ -356,16 +350,15 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
                 : type_is(var, char *)         ? "%s\n"                \
                 : "%i\n", var)
 
-void bch2_hprint(struct printbuf *, s64);
-
 bool bch2_is_zero(const void *, size_t);
 
-void bch2_string_opt_to_text(struct printbuf *,
-                            const char * const [], size_t);
-
-void bch2_flags_to_text(struct printbuf *, const char * const[], u64);
 u64 bch2_read_flag_list(char *, const char * const[]);
 
+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+int bch2_prt_backtrace(struct printbuf *, struct task_struct *);
+
 #define NR_QUANTILES   15
 #define QUANTILE_IDX(i)        inorder_to_eytzinger0(i, NR_QUANTILES)
 #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
@@ -388,14 +381,18 @@ struct time_stat_buffer {
 
 struct time_stats {
        spinlock_t      lock;
-       u64             count;
        /* all fields are in nanoseconds */
-       u64             average_duration;
-       u64             average_frequency;
        u64             max_duration;
+       u64             min_duration;
+       u64             max_freq;
+       u64             min_freq;
        u64             last_event;
        struct quantiles quantiles;
 
+       struct mean_and_variance          duration_stats;
+       struct mean_and_variance_weighted duration_stats_weighted;
+       struct mean_and_variance          freq_stats;
+       struct mean_and_variance_weighted freq_stats_weighted;
        struct time_stat_buffer __percpu *buffer;
 };
 
@@ -463,7 +460,7 @@ struct bch_pd_controller {
 
 void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
 void bch2_pd_controller_init(struct bch_pd_controller *);
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
 
 #define sysfs_pd_controller_attribute(name)                            \
        rw_attribute(name##_rate);                                      \
@@ -487,7 +484,7 @@ do {                                                                        \
        sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
                                                                        \
        if (attr == &sysfs_##name##_rate_debug)                         \
-               return bch2_pd_controller_print_debug(var, buf);                \
+               bch2_pd_controller_debug_to_text(out, var);             \
 } while (0)
 
 #define sysfs_pd_controller_store(name, var)                           \
@@ -700,6 +697,31 @@ do {                                                                       \
 #define array_remove_item(_array, _nr, _pos)                           \
        array_remove_items(_array, _nr, _pos, 1)
 
+static inline void __move_gap(void *array, size_t element_size,
+                             size_t nr, size_t size,
+                             size_t old_gap, size_t new_gap)
+{
+       size_t gap_end = old_gap + size - nr;
+
+       if (new_gap < old_gap) {
+               size_t move = old_gap - new_gap;
+
+               memmove(array + element_size * (gap_end - move),
+                       array + element_size * (old_gap - move),
+                               element_size * move);
+       } else if (new_gap > old_gap) {
+               size_t move = new_gap - old_gap;
+
+               memmove(array + element_size * old_gap,
+                       array + element_size * gap_end,
+                               element_size * move);
+       }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap)       \
+       __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+
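To make the gap-buffer semantics concrete: the array has _size slots, _nr of them live, and the _size - _nr unused slots form a contiguous gap starting at index _old_gap. A minimal sketch of moving the gap (values are illustrative; assumes this header is included):

    /* 8 slots, 5 live elements, gap of 3 starting at index 2:
     *   [ A B _ _ _ C D E ]
     */
    int buf[8] = { 'A', 'B', 0, 0, 0, 'C', 'D', 'E' };

    move_gap(buf, 5, 8, 2, 4);

    /* The two elements that followed the old gap (C, D) were copied
     * down, so the gap now starts at index 4:
     *   [ A B C D _ _ _ E ]
     */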
 #define bubble_sort(_base, _nr, _cmp)                                  \
 do {                                                                   \
        ssize_t _i, _end;                                               \
@@ -768,13 +790,4 @@ static inline int u8_cmp(u8 l, u8 r)
        return cmp_int(l, r);
 }
 
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
-       sprintf(out, "%plU", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
 #endif /* _BCACHEFS_UTIL_H */
index a2d6bb7136c7d412d95469ac09b2c063fd0dd86d..5143b603bf67ff397181e70a9705cfa100237fbf 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/bitops.h>
+#include <linux/math.h>
 #include <linux/string.h>
 #include <asm/unaligned.h>
 
index c099cdc0605f960c7d2ebae01304d0722194ee05..53a694d71967196ad2784f89da5ea5c3966644a1 100644 (file)
@@ -20,7 +20,7 @@
 ({                                                                     \
        BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64));             \
                                                                        \
-       (offsetof(_type, _data) + (_u64s) * sizeof(u64));               \
+       (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64));      \
 })
 
 #define vstruct_bytes(_s)                                              \
index 4d7db64e3ef3085602c7017618b411f281a06b6b..4fc1c3afab691065f4d6e3cc83394acdb6ec7a71 100644 (file)
@@ -69,32 +69,51 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
        .cmp_bkey       = xattr_cmp_bkey,
 };
 
-const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                      int rw, struct printbuf *err)
 {
        const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
 
-       if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
-               return "value too small";
+       if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) {
+               prt_printf(err, "incorrect value size (%zu < %zu)",
+                      bkey_val_bytes(k.k), sizeof(*xattr.v));
+               return -EINVAL;
+       }
 
        if (bkey_val_u64s(k.k) <
            xattr_val_u64s(xattr.v->x_name_len,
-                          le16_to_cpu(xattr.v->x_val_len)))
-               return "value too small";
+                          le16_to_cpu(xattr.v->x_val_len))) {
+               prt_printf(err, "value too small (%zu < %u)",
+                      bkey_val_u64s(k.k),
+                      xattr_val_u64s(xattr.v->x_name_len,
+                                     le16_to_cpu(xattr.v->x_val_len)));
+               return -EINVAL;
+       }
 
+       /* XXX why +4 ? */
        if (bkey_val_u64s(k.k) >
            xattr_val_u64s(xattr.v->x_name_len,
-                          le16_to_cpu(xattr.v->x_val_len) + 4))
-               return "value too big";
+                          le16_to_cpu(xattr.v->x_val_len) + 4)) {
+               prt_printf(err, "value too big (%zu > %u)",
+                      bkey_val_u64s(k.k),
+                      xattr_val_u64s(xattr.v->x_name_len,
+                                     le16_to_cpu(xattr.v->x_val_len) + 4));
+               return -EINVAL;
+       }
 
        handler = bch2_xattr_type_to_handler(xattr.v->x_type);
-       if (!handler)
-               return "invalid type";
+       if (!handler) {
+               prt_printf(err, "invalid type (%u)", xattr.v->x_type);
+               return -EINVAL;
+       }
 
-       if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
-               return "xattr name has invalid characters";
+       if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
+               prt_printf(err, "xattr name has invalid characters");
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
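With this change the validity check reports its reason through a caller-supplied printbuf instead of returning a static string. A hedged sketch of the new calling convention (the surrounding variables and the error-reporting call are illustrative, not taken from this commit):

    struct printbuf err = PRINTBUF;
    int ret = bch2_xattr_invalid(c, k, READ, &err);

    if (ret)
            bch_err(c, "invalid xattr key: %s", printbuf_str(&err));
    printbuf_exit(&err);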
 
 void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -105,17 +124,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
 
        handler = bch2_xattr_type_to_handler(xattr.v->x_type);
        if (handler && handler->prefix)
-               pr_buf(out, "%s", handler->prefix);
+               prt_printf(out, "%s", handler->prefix);
        else if (handler)
-               pr_buf(out, "(type %u)", xattr.v->x_type);
+               prt_printf(out, "(type %u)", xattr.v->x_type);
        else
-               pr_buf(out, "(unknown type %u)", xattr.v->x_type);
+               prt_printf(out, "(unknown type %u)", xattr.v->x_type);
 
-       bch_scnmemcpy(out, xattr.v->x_name,
-                     xattr.v->x_name_len);
-       pr_buf(out, ":");
-       bch_scnmemcpy(out, xattr_val(xattr.v),
-                     le16_to_cpu(xattr.v->x_val_len));
+       prt_printf(out, "%.*s:%.*s",
+              xattr.v->x_name_len,
+              xattr.v->x_name,
+              le16_to_cpu(xattr.v->x_val_len),
+              (char *) xattr_val(xattr.v));
 }
 
 static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
@@ -311,13 +330,9 @@ retry:
        if (ret)
                goto err;
 
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
-                          SPOS(inum, offset, snapshot), 0, k, ret) {
-               BUG_ON(k.k->p.inode < inum);
-
-               if (k.k->p.inode > inum)
-                       break;
-
+       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+                          SPOS(inum, offset, snapshot),
+                          POS(inum, U64_MAX), 0, k, ret) {
                if (k.k->type != KEY_TYPE_xattr)
                        continue;
 
@@ -329,23 +344,25 @@ retry:
        offset = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
 
        if (ret)
-               return ret;
+               goto out;
 
        ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
        if (ret)
-               return ret;
+               goto out;
 
        ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
        if (ret)
-               return ret;
+               goto out;
 
        return buf.used;
+out:
+       return bch2_err_class(ret);
 }
 
 static int bch2_xattr_get_handler(const struct xattr_handler *handler,
@@ -354,8 +371,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       int ret;
 
-       return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+       ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+       return bch2_err_class(ret);
 }
 
 static int bch2_xattr_set_handler(const struct xattr_handler *handler,
@@ -367,11 +386,13 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+       int ret;
 
-       return bch2_trans_do(c, NULL, NULL, 0,
+       ret = bch2_trans_do(c, NULL, NULL, 0,
                        bch2_xattr_set(&trans, inode_inum(inode), &hash,
                                       name, value, size,
                                       handler->flags, flags));
+       return bch2_err_class(ret);
 }
 
 static const struct xattr_handler bch_xattr_user_handler = {
@@ -426,9 +447,8 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
                bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
        const struct bch_option *opt;
        int id, inode_opt_id;
-       char buf[512];
-       struct printbuf out = PBUF(buf);
-       unsigned val_len;
+       struct printbuf out = PRINTBUF;
+       int ret;
        u64 v;
 
        id = bch2_opt_lookup(name);
@@ -449,16 +469,21 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
                return -ENODATA;
 
        v = bch2_opt_get_by_id(&opts, id);
-       bch2_opt_to_text(&out, c, opt, v, 0);
+       bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
 
-       val_len = out.pos - buf;
+       ret = out.pos;
 
-       if (buffer && val_len > size)
-               return -ERANGE;
+       if (out.allocation_failure) {
+               ret = -ENOMEM;
+       } else if (buffer) {
+               if (out.pos > size)
+                       ret = -ERANGE;
+               else
+                       memcpy(buffer, out.buf, out.pos);
+       }
 
-       if (buffer)
-               memcpy(buffer, buf, val_len);
-       return val_len;
+       printbuf_exit(&out);
+       return ret;
 }
 
 static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
@@ -525,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                memcpy(buf, value, size);
                buf[size] = '\0';
 
-               ret = bch2_opt_parse(c, NULL, opt, buf, &v);
+               ret = bch2_opt_parse(c, opt, buf, &v, NULL);
                kfree(buf);
 
                if (ret < 0)
index f4f896545e1c29f0ff35018263bf6b227250567b..66d7a1e30350e30875d3e1597675ddf264a9c8e6 100644 (file)
@@ -6,7 +6,7 @@
 
 extern const struct bch_hash_desc bch2_xattr_hash_desc;
 
-const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
 void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_xattr (struct bkey_ops) {                \
index 8422c2625dec5a82e0dc4cce1afd72386fec896d..93a791c4bb98a205b149ff888d6883e7eaa3eeb0 100644 (file)
@@ -120,29 +120,30 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
        }
 }
 
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
 {
-       /*
-        * most users will be overriding ->bi_bdev with a new target,
-        * so we don't set nor calculate new physical/hw segment counts here
-        */
-       bio->bi_bdev = bio_src->bi_bdev;
        bio_set_flag(bio, BIO_CLONED);
-       bio->bi_opf = bio_src->bi_opf;
+       bio->bi_ioprio = bio_src->bi_ioprio;
        bio->bi_iter = bio_src->bi_iter;
-       bio->bi_io_vec = bio_src->bi_io_vec;
+       return 0;
 }
 
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+               gfp_t gfp, struct bio_set *bs)
 {
-       struct bio *b;
+       struct bio *bio;
+
+       bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+       if (!bio)
+               return NULL;
 
-       b = bio_alloc_bioset(gfp_mask, 0, bs);
-       if (!b)
+       if (__bio_clone(bio, bio_src, gfp) < 0) {
+               bio_put(bio);
                return NULL;
+       }
+       bio->bi_io_vec = bio_src->bi_io_vec;
 
-       __bio_clone_fast(b, bio);
-       return b;
+       return bio;
 }
 
 struct bio *bio_split(struct bio *bio, int sectors,
@@ -153,15 +154,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
        BUG_ON(sectors <= 0);
        BUG_ON(sectors >= bio_sectors(bio));
 
-       /*
-        * Discards need a mutable bio_vec to accommodate the payload
-        * required by the DSM TRIM and UNMAP commands.
-        */
-       if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
-               split = bio_clone_bioset(bio, gfp, bs);
-       else
-               split = bio_clone_fast(bio, gfp, bs);
-
+       split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
        if (!split)
                return NULL;
 
@@ -188,9 +181,16 @@ void bio_advance(struct bio *bio, unsigned bytes)
 
 static void bio_free(struct bio *bio)
 {
-       unsigned front_pad = bio->bi_pool ? bio->bi_pool->front_pad : 0;
+       struct bio_set *bs = bio->bi_pool;
+
+       if (bs) {
+               if (bio->bi_max_vecs > BIO_INLINE_VECS)
+                       mempool_free(bio->bi_io_vec, &bs->bvec_pool);
 
-       kfree((void *) bio - front_pad);
+               mempool_free((void *) bio - bs->front_pad, &bs->bio_pool);
+       } else {
+               kfree(bio);
+       }
 }
 
 void bio_put(struct bio *bio)
@@ -282,64 +282,114 @@ again:
                bio->bi_end_io(bio);
 }
 
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
 {
        unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
 
        memset(bio, 0, BIO_RESET_BYTES);
-       bio->bi_flags = flags;
+       bio->bi_bdev    = bdev;
+       bio->bi_opf     = opf;
+       bio->bi_flags   = flags;
        atomic_set(&bio->__bi_remaining, 1);
 }
 
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_kmalloc(unsigned int nr_iovecs, gfp_t gfp_mask)
 {
-       unsigned front_pad = bs ? bs->front_pad : 0;
        struct bio *bio;
-       void *p;
-
-       p = kmalloc(front_pad +
-                   sizeof(struct bio) +
-                   nr_iovecs * sizeof(struct bio_vec),
-                   gfp_mask);
 
-       if (unlikely(!p))
+       bio = kmalloc(sizeof(struct bio) +
+                     sizeof(struct bio_vec) * nr_iovecs, gfp_mask);
+       if (unlikely(!bio))
                return NULL;
+       bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, 0);
+       bio->bi_pool = NULL;
+       return bio;
+}
 
-       bio = p + front_pad;
-       bio_init(bio, bio->bi_inline_vecs, nr_iovecs);
-       bio->bi_pool = bs;
+static struct bio_vec *bvec_alloc(mempool_t *pool, int *nr_vecs,
+               gfp_t gfp_mask)
+{
+       *nr_vecs = roundup_pow_of_two(*nr_vecs);
+       /*
+        * Try a slab allocation first for all smaller allocations.  If that
+        * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
+        * The mempool is sized to handle up to BIO_MAX_VECS entries.
+        */
+       if (*nr_vecs < BIO_MAX_VECS) {
+               struct bio_vec *bvl;
 
-       return bio;
+               bvl = kmalloc(sizeof(*bvl) * *nr_vecs, gfp_mask);
+               if (likely(bvl))
+                       return bvl;
+               *nr_vecs = BIO_MAX_VECS;
+       }
+
+       return mempool_alloc(pool, gfp_mask);
 }
 
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+struct bio *bio_alloc_bioset(struct block_device *bdev,
+                            unsigned nr_iovecs,
+                            unsigned opf,
+                            gfp_t gfp_mask,
                             struct bio_set *bs)
 {
-       struct bvec_iter iter;
-       struct bio_vec bv;
        struct bio *bio;
+       void *p;
 
-       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-       if (!bio)
+       if (nr_iovecs > BIO_MAX_VECS)
+               return NULL;
+
+       p = mempool_alloc(&bs->bio_pool, gfp_mask);
+       if (unlikely(!p))
                return NULL;
 
-       bio->bi_bdev            = bio_src->bi_bdev;
-       bio->bi_opf             = bio_src->bi_opf;
-       bio->bi_iter.bi_sector  = bio_src->bi_iter.bi_sector;
-       bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
-
-       switch (bio_op(bio)) {
-       case REQ_OP_DISCARD:
-       case REQ_OP_SECURE_ERASE:
-               break;
-       case REQ_OP_WRITE_SAME:
-               bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-               break;
-       default:
-               bio_for_each_segment(bv, bio_src, iter)
-                       bio->bi_io_vec[bio->bi_vcnt++] = bv;
-               break;
+       bio = p + bs->front_pad;
+       if (nr_iovecs > BIO_INLINE_VECS) {
+               struct bio_vec *bvl = NULL;
+
+               bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+               if (unlikely(!bvl))
+                       goto err_free;
+
+               bio_init(bio, bdev, bvl, nr_iovecs, opf);
+       } else if (nr_iovecs) {
+               bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
+       } else {
+               bio_init(bio, bdev, NULL, 0, opf);
        }
 
+       bio->bi_pool = bs;
        return bio;
+
+err_free:
+       mempool_free(p, &bs->bio_pool);
+       return NULL;
+}
+
+void bioset_exit(struct bio_set *bs)
+{
+       mempool_exit(&bs->bio_pool);
+       mempool_exit(&bs->bvec_pool);
+}
+
+int bioset_init(struct bio_set *bs,
+               unsigned int pool_size,
+               unsigned int front_pad,
+               int flags)
+{
+       int ret;
+
+       bs->front_pad = front_pad;
+       if (flags & BIOSET_NEED_BVECS)
+               bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+       else
+               bs->back_pad = 0;
+
+       ret   = mempool_init_kmalloc_pool(&bs->bio_pool, pool_size, bs->front_pad +
+                                         sizeof(struct bio) + bs->back_pad) ?:
+               mempool_init_kmalloc_pool(&bs->bvec_pool, pool_size,
+                                         sizeof(struct bio_vec) * BIO_MAX_VECS);
+       if (ret)
+               bioset_exit(bs);
+       return ret;
 }
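A minimal sketch of how these shims fit together, now that bio_alloc_bioset() takes the target block device and the opf up front and draws bvecs from the bioset's mempools (variable names and sizes are illustrative):

    struct bio_set bs;
    struct bio *bio;

    if (bioset_init(&bs, 4, 0, BIOSET_NEED_BVECS))
            return -ENOMEM;

    bio = bio_alloc_bioset(bdev, 2, REQ_OP_READ, GFP_KERNEL, &bs);
    if (bio) {
            /* add pages, submit, wait for completion ... */
            bio_put(bio);
    }
    bioset_exit(&bs);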
index 762e5aa02a530e9a8b63e3da8da5227d87198f53..9b3ea93f8c19aecdaa0a6c9bb3e12eda2def00bf 100644 (file)
@@ -113,7 +113,7 @@ int submit_bio_wait(struct bio *bio)
 
 int blkdev_issue_discard(struct block_device *bdev,
                         sector_t sector, sector_t nr_sects,
-                        gfp_t gfp_mask, unsigned long flags)
+                        gfp_t gfp_mask)
 {
        return 0;
 }
@@ -128,12 +128,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev)
        BUG_ON(ret);
 
        if (!S_ISBLK(statbuf.st_mode))
-               return statbuf.st_blksize >> 9;
+               return statbuf.st_blksize;
 
-       ret = ioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
-       BUG_ON(ret);
-
-       return blksize >> 9;
+       xioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
+       return blksize;
 }
 
 sector_t get_capacity(struct gendisk *disk)
@@ -168,7 +166,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                                        void *holder)
 {
        struct block_device *bdev;
-       int fd, sync_fd, flags = O_DIRECT;
+       int fd, sync_fd, buffered_fd, flags = 0;
 
        if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
                flags = O_RDWR;
@@ -183,16 +181,12 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                flags |= O_EXCL;
 #endif
 
-       fd = open(path, flags);
+       fd = open(path, flags|O_DIRECT);
        if (fd < 0)
                return ERR_PTR(-errno);
 
-       sync_fd = open(path, flags|O_SYNC);
-       if (sync_fd < 0) {
-               assert(0);
-               close(fd);
-               return ERR_PTR(-errno);
-       }
+       sync_fd = xopen(path, flags|O_DIRECT|O_SYNC);
+       buffered_fd = xopen(path, flags);
 
        bdev = malloc(sizeof(*bdev));
        memset(bdev, 0, sizeof(*bdev));
@@ -203,6 +197,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
        bdev->bd_dev            = xfstat(fd).st_rdev;
        bdev->bd_fd             = fd;
        bdev->bd_sync_fd        = sync_fd;
+       bdev->bd_buffered_fd    = buffered_fd;
        bdev->bd_holder         = holder;
        bdev->bd_disk           = &bdev->__bd_disk;
        bdev->bd_disk->bdi      = &bdev->bd_disk->__bdi;
index 7857017c1d48a11c6ceab7e4069af4908b46a415..41f1bcdc44886c1cef8f25c080c006d22a45339d 100644 (file)
@@ -3,6 +3,7 @@
 #include <linux/export.h>
 #include <linux/generic-radix-tree.h>
 #include <linux/gfp.h>
+#include <linux/kmemleak.h>
 
 #define GENRADIX_ARY           (PAGE_SIZE / sizeof(struct genradix_node *))
 #define GENRADIX_ARY_SHIFT     ilog2(GENRADIX_ARY)
@@ -37,12 +38,12 @@ static inline size_t genradix_depth_size(unsigned depth)
 #define GENRADIX_DEPTH_MASK                            \
        ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
 
-unsigned genradix_root_to_depth(struct genradix_root *r)
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
 {
        return (unsigned long) r & GENRADIX_DEPTH_MASK;
 }
 
-struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
 {
        return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
 }
@@ -76,6 +77,27 @@ void *__genradix_ptr(struct __genradix *radix, size_t offset)
 }
 EXPORT_SYMBOL(__genradix_ptr);
 
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+       struct genradix_node *node;
+
+       node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO);
+
+       /*
+        * We're using pages (not slab allocations) directly for kernel data
+        * structures, so we need to explicitly inform kmemleak of them in order
+        * to avoid false positive memory leak reports.
+        */
+       kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask);
+       return node;
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+       kmemleak_free(node);
+       free_page((unsigned long)node);
+}
+
 /*
  * Returns pointer to the specified byte @offset within @radix, allocating it if
  * necessary - newly allocated slots are always zeroed out:
@@ -98,8 +120,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
                        break;
 
                if (!new_node) {
-                       new_node = (void *)
-                               __get_free_page(gfp_mask|__GFP_ZERO);
+                       new_node = genradix_alloc_node(gfp_mask);
                        if (!new_node)
                                return NULL;
                }
@@ -122,8 +143,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
                n = READ_ONCE(*p);
                if (!n) {
                        if (!new_node) {
-                               new_node = (void *)
-                                       __get_free_page(gfp_mask|__GFP_ZERO);
+                               new_node = genradix_alloc_node(gfp_mask);
                                if (!new_node)
                                        return NULL;
                        }
@@ -134,7 +154,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
        }
 
        if (new_node)
-               free_page((unsigned long) new_node);
+               genradix_free_node(new_node);
 
        return &n->data[offset];
 }
@@ -193,6 +213,64 @@ restart:
 }
 EXPORT_SYMBOL(__genradix_iter_peek);
 
+void *__genradix_iter_peek_prev(struct genradix_iter *iter,
+                               struct __genradix *radix,
+                               size_t objs_per_page,
+                               size_t obj_size_plus_page_remainder)
+{
+       struct genradix_root *r;
+       struct genradix_node *n;
+       unsigned level, i;
+
+       if (iter->offset == SIZE_MAX)
+               return NULL;
+
+restart:
+       r = READ_ONCE(radix->root);
+       if (!r)
+               return NULL;
+
+       n       = genradix_root_to_node(r);
+       level   = genradix_root_to_depth(r);
+
+       if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
+               iter->offset = genradix_depth_size(level);
+               iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+               iter->offset -= obj_size_plus_page_remainder;
+               iter->pos--;
+       }
+
+       while (level) {
+               level--;
+
+               i = (iter->offset >> genradix_depth_shift(level)) &
+                       (GENRADIX_ARY - 1);
+
+               while (!n->children[i]) {
+                       size_t objs_per_ptr = genradix_depth_size(level);
+
+                       iter->offset = round_down(iter->offset, objs_per_ptr);
+                       iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+                       if (!iter->offset)
+                               return NULL;
+
+                       iter->offset -= obj_size_plus_page_remainder;
+                       iter->pos--;
+
+                       if (!i)
+                               goto restart;
+                       --i;
+               }
+
+               n = n->children[i];
+       }
+
+       return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek_prev);
+
 static void genradix_free_recurse(struct genradix_node *n, unsigned level)
 {
        if (level) {
@@ -203,7 +281,7 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
                                genradix_free_recurse(n->children[i], level - 1);
        }
 
-       free_page((unsigned long) n);
+       genradix_free_node(n);
 }
 
 int __genradix_prealloc(struct __genradix *radix, size_t size,
diff --git a/linux/int_sqrt.c b/linux/int_sqrt.c
new file mode 100644 (file)
index 0000000..a8170bb
--- /dev/null
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
+ *
+ *  Based on the shift-and-subtract algorithm for computing integer
+ *  square root from Guy L. Steele.
+ */
+
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+
+/**
+ * int_sqrt - computes the integer square root
+ * @x: integer of which to calculate the sqrt
+ *
+ * Computes: floor(sqrt(x))
+ */
+unsigned long int_sqrt(unsigned long x)
+{
+       unsigned long b, m, y = 0;
+
+       if (x <= 1)
+               return x;
+
+       m = 1UL << (__fls(x) & ~1UL);
+       while (m != 0) {
+               b = y + m;
+               y >>= 1;
+
+               if (x >= b) {
+                       x -= b;
+                       y += m;
+               }
+               m >>= 2;
+       }
+
+       return y;
+}
+EXPORT_SYMBOL(int_sqrt);
+
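A few spot checks of the shift-and-subtract result, purely illustrative and not part of the commit:

    BUG_ON(int_sqrt(0)  != 0);
    BUG_ON(int_sqrt(10) != 3);      /* floor(3.162...) */
    BUG_ON(int_sqrt(16) != 4);
    BUG_ON(int_sqrt(17) != 4);      /* always rounds down */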
+#if BITS_PER_LONG < 64
+/**
+ * int_sqrt64 - strongly typed int_sqrt function, for use when a 64-bit input
+ * is expected.
+ * @x: 64bit integer of which to calculate the sqrt
+ */
+u32 int_sqrt64(u64 x)
+{
+       u64 b, m, y = 0;
+
+       if (x <= ULONG_MAX)
+               return int_sqrt((unsigned long) x);
+
+       m = 1ULL << ((fls64(x) - 1) & ~1ULL);
+       while (m != 0) {
+               b = y + m;
+               y >>= 1;
+
+               if (x >= b) {
+                       x -= b;
+                       y += m;
+               }
+               m >>= 2;
+       }
+
+       return y;
+}
+EXPORT_SYMBOL(int_sqrt64);
+#endif
index 41bfca2f8d522b111fb967f27d14cb1f24d2385f..3c7bdb81dff57981a44ad8b1f42347892ef1b8c1 100644 (file)
@@ -71,8 +71,10 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
        p->thread_fn    = thread_fn;
        p->thread_data  = thread_data;
        p->state        = TASK_UNINTERRUPTIBLE;
+       p->signal       = &p->_signal;
        atomic_set(&p->usage, 1);
        init_completion(&p->exited);
+       init_rwsem(&p->_signal.exec_update_lock);
 
        pthread_attr_t attr;
        pthread_attr_init(&attr);
diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c
new file mode 100644 (file)
index 0000000..643e311
--- /dev/null
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct and if it's the weighted variant set the w field (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance; some computation
+ * is deferred to these functions for performance reasons.
+ *
+ * see lib/math/mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
+ */
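A minimal usage sketch of the API described in the comment above (sample values are illustrative; the weighted variant needs w set before the first update):

    struct mean_and_variance          mv  = {};
    struct mean_and_variance_weighted mvw = { .w = 8 };
    s64 samples[] = { 10, 20, 30 };
    unsigned i;

    for (i = 0; i < ARRAY_SIZE(samples); i++) {
            mv  = mean_and_variance_update(mv, samples[i]);
            mvw = mean_and_variance_weighted_update(mvw, samples[i]);
    }

    /* mean_and_variance_get_mean(mv) == 20 for this input */
    pr_info("mean %lli stddev %u\n",
            mean_and_variance_get_mean(mv),
            mean_and_variance_get_stddev(mv));
    pr_info("weighted mean %lli\n",
            mean_and_variance_weighted_get_mean(mvw));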
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/mean_and_variance.h>
+#include <linux/module.h>
+#include <linux/printbuf.h>
+
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+inline s64 fast_divpow2(s64 n, u8 d)
+{
+       return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
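The correction term only matters for negative numerators; it is what makes the result truncate toward zero instead of toward negative infinity. Illustrative checks (not part of the commit):

    BUG_ON(fast_divpow2(-7, 1) != -3);      /* a plain -7 >> 1 would give -4 */
    BUG_ON(fast_divpow2( 7, 1) !=  3);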
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
+ * and return it.
+ * @s1: the mean_and_variance to update.
+ * @v1: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1)
+{
+       struct mean_and_variance s2;
+       u64 v2 = abs(v1);
+
+       s2.n           = s1.n + 1;
+       s2.sum         = s1.sum + v1;
+       s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2));
+       return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_update);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+       return div64_u64(s.sum, s.n);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() -  get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+       u128 s2 = u128_div(s1.sum_squares, s1.n);
+       u64  s3 = abs(mean_and_variance_get_mean(s1));
+
+       return u128_to_u64(u128_sub(s2, u128_square(s3)));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+       return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s1: the mean_and_variance_weighted to update.
+ * @x: the new sample.
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1,
+                                                                   s64 x)
+{
+       struct mean_and_variance_weighted s2;
+       // previous weighted variance.
+       u64 var_w0 = s1.variance;
+       u8 w = s2.w = s1.w;
+       // new value weighted.
+       s64 x_w = x << w;
+       s64 diff_w = x_w - s1.mean;
+       s64 diff = fast_divpow2(diff_w, w);
+       // new mean weighted.
+       s64 u_w1     = s1.mean + diff;
+
+       BUG_ON(w % 2 != 0);
+
+       if (!s1.init) {
+               s2.mean = x_w;
+               s2.variance = 0;
+       } else {
+               s2.mean = u_w1;
+               s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+       }
+       s2.init = true;
+
+       return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+       return fast_divpow2(s.mean, s.w);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() -- get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+       // always positive don't need fast divpow2
+       return s.variance >> s.w;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+       return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/linux/pretty-printers.c b/linux/pretty-printers.c
new file mode 100644 (file)
index 0000000..addbac9
--- /dev/null
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/printbuf.h>
+#include <linux/pretty-printers.h>
+
+/**
+ * prt_string_option - Given a list of strings, print out the list and indicate
+ * which option is selected, with square brackets (sysfs style)
+ *
+ * @out: The printbuf to output to
+ * @list: List of strings to choose from
+ * @selected: The option to highlight, with square brackets
+ */
+void prt_string_option(struct printbuf *out,
+                      const char * const list[],
+                      size_t selected)
+{
+       size_t i;
+
+       for (i = 0; list[i]; i++) {
+               if (i)
+                       prt_char(out, ' ');
+               if (i == selected)
+                       prt_char(out, '[');
+               prt_str(out, list[i]);
+               if (i == selected)
+                       prt_char(out, ']');
+       }
+}
+EXPORT_SYMBOL(prt_string_option);
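For example, with a NULL-terminated list and selected == 1, the output is the familiar sysfs bracket style (sketch, not part of the commit):

    static const char * const modes[] = { "none", "lz4", "zstd", NULL };
    struct printbuf out = PRINTBUF;

    prt_string_option(&out, modes, 1);
    /* out.buf is now "none [lz4] zstd" */
    printbuf_exit(&out);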
+
+/**
+ * prt_bitflags: Given a bitmap and a list of names for each bit, print out which
+ * bits are on, comma separated
+ *
+ * @out: The printbuf to output to
+ * @list: List of names for each bit
+ * @flags: Bits to print
+ */
+void prt_bitflags(struct printbuf *out,
+                 const char * const list[], u64 flags)
+{
+       unsigned bit, nr = 0;
+       bool first = true;
+
+       while (list[nr])
+               nr++;
+
+       while (flags && (bit = __ffs(flags)) < nr) {
+               if (!first)
+                       prt_char(out, ',');
+               first = false;
+               prt_str(out, list[bit]);
+               flags ^= 1ULL << bit;
+       }
+}
+EXPORT_SYMBOL(prt_bitflags);
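Similarly, for a set of flag names (sketch, not part of the commit; assumes every set bit has a corresponding name):

    static const char * const flag_names[] = { "dirty", "error", "ro", NULL };
    struct printbuf out = PRINTBUF;

    prt_bitflags(&out, flag_names, BIT(0)|BIT(2));
    /* out.buf is now "dirty,ro" */
    printbuf_exit(&out);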
diff --git a/linux/printbuf.c b/linux/printbuf.c
new file mode 100644 (file)
index 0000000..5cf79d4
--- /dev/null
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/printbuf.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+       return buf->pos - buf->last_newline;
+}
+
+int printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+       unsigned new_size;
+       char *buf;
+
+       if (!out->heap_allocated)
+               return 0;
+
+       /* Reserved space for terminating nul: */
+       extra += 1;
+
+       if (out->pos + extra < out->size)
+               return 0;
+
+       new_size = roundup_pow_of_two(out->size + extra);
+
+       /*
+        * Note: output buffer must be freeable with kfree(), it's not required
+        * that the user use printbuf_exit().
+        */
+       buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+       if (!buf) {
+               out->allocation_failure = true;
+               return -ENOMEM;
+       }
+
+       out->buf        = buf;
+       out->size       = new_size;
+       return 0;
+}
+EXPORT_SYMBOL(printbuf_make_room);
+
+/**
+ * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
+ * terminated
+ */
+const char *printbuf_str(const struct printbuf *buf)
+{
+       /*
+        * If we've written to a printbuf then it's guaranteed to be a null
+        * terminated string - but if we haven't, then we might not have
+        * allocated a buffer at all:
+        */
+       return buf->pos
+               ? buf->buf
+               : "";
+}
+EXPORT_SYMBOL(printbuf_str);
+
+/**
+ * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ */
+void printbuf_exit(struct printbuf *buf)
+{
+       if (buf->heap_allocated) {
+               kfree(buf->buf);
+               buf->buf = ERR_PTR(-EINTR); /* poison value */
+       }
+}
+EXPORT_SYMBOL(printbuf_exit);
+
+void printbuf_tabstops_reset(struct printbuf *buf)
+{
+       buf->nr_tabstops = 0;
+}
+EXPORT_SYMBOL(printbuf_tabstops_reset);
+
+void printbuf_tabstop_pop(struct printbuf *buf)
+{
+       if (buf->nr_tabstops)
+               --buf->nr_tabstops;
+}
+EXPORT_SYMBOL(printbuf_tabstop_pop);
+
+/**
+ * printbuf_tabstop_push - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
+ */
+int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+       unsigned prev_tabstop = buf->nr_tabstops
+               ? buf->_tabstops[buf->nr_tabstops - 1]
+               : 0;
+
+       if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+               return -EINVAL;
+
+       buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+       buf->has_indent_or_tabstops = true;
+       return 0;
+}
+EXPORT_SYMBOL(printbuf_tabstop_push);
+
+/**
+ * printbuf_indent_add - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+       if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+               spaces = 0;
+
+       buf->indent += spaces;
+       prt_chars(buf, ' ', spaces);
+
+       buf->has_indent_or_tabstops = true;
+}
+EXPORT_SYMBOL(printbuf_indent_add);
+
+/**
+ * printbuf_indent_sub - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces less spaces.
+ */
+void printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+       if (WARN_ON_ONCE(spaces > buf->indent))
+               spaces = buf->indent;
+
+       if (buf->last_newline + buf->indent == buf->pos) {
+               buf->pos -= spaces;
+               printbuf_nul_terminate(buf);
+       }
+       buf->indent -= spaces;
+
+       if (!buf->indent && !buf->nr_tabstops)
+               buf->has_indent_or_tabstops = false;
+}
+EXPORT_SYMBOL(printbuf_indent_sub);
+
+void prt_newline(struct printbuf *buf)
+{
+       unsigned i;
+
+       printbuf_make_room(buf, 1 + buf->indent);
+
+       __prt_char(buf, '\n');
+
+       buf->last_newline       = buf->pos;
+
+       for (i = 0; i < buf->indent; i++)
+               __prt_char(buf, ' ');
+
+       printbuf_nul_terminate(buf);
+
+       buf->last_field         = buf->pos;
+       buf->cur_tabstop        = 0;
+}
+EXPORT_SYMBOL(prt_newline);
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+       return buf->cur_tabstop < buf->nr_tabstops
+               ? buf->_tabstops[buf->cur_tabstop]
+               : 0;
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+       int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
+
+       prt_chars(out, ' ', spaces);
+
+       out->last_field = out->pos;
+       out->cur_tabstop++;
+}
+
+/**
+ * prt_tab - Advance printbuf to the next tabstop
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void prt_tab(struct printbuf *out)
+{
+       if (WARN_ON(!cur_tabstop(out)))
+               return;
+
+       __prt_tab(out);
+}
+EXPORT_SYMBOL(prt_tab);
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+       unsigned move = buf->pos - buf->last_field;
+       int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
+
+       if (pad > 0) {
+               printbuf_make_room(buf, pad);
+
+               if (buf->last_field + pad < buf->size)
+                       memmove(buf->buf + buf->last_field + pad,
+                               buf->buf + buf->last_field,
+                               min(move, buf->size - 1 - buf->last_field - pad));
+
+               if (buf->last_field < buf->size)
+                       memset(buf->buf + buf->last_field, ' ',
+                              min((unsigned) pad, buf->size - buf->last_field));
+
+               buf->pos += pad;
+               printbuf_nul_terminate(buf);
+       }
+
+       buf->last_field = buf->pos;
+       buf->cur_tabstop++;
+}
+
+/**
+ * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously outputted text.
+ */
+void prt_tab_rjust(struct printbuf *buf)
+{
+       if (WARN_ON(!cur_tabstop(buf)))
+               return;
+
+       __prt_tab_rjust(buf);
+}
+EXPORT_SYMBOL(prt_tab_rjust);
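/*
 * Editor's sketch, not part of this commit: left- and right-justified columns
 * using prt_tab() and prt_tab_rjust().  The tabstop fields are set directly,
 * touching only the members this file itself uses (_tabstops, nr_tabstops,
 * has_indent_or_tabstops).
 */
static void tabstop_demo(struct printbuf *out)
{
        out->_tabstops[0] = 16;                 /* absolute columns */
        out->_tabstops[1] = 24;
        out->nr_tabstops = 2;
        out->has_indent_or_tabstops = true;

        prt_printf(out, "free:");
        prt_tab(out);                           /* pad with spaces to column 16 */
        prt_printf(out, "42");
        prt_tab_rjust(out);                     /* shift "42" right so it ends at column 24 */
        prt_newline(out);
}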
+
+/**
+ * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ *
+ * @out: printbuf to output to
+ * @str: string to print
+ * @count: number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ *   \n: prt_newline   newline that obeys current indent level
+ *   \t: prt_tab       advance to next tabstop
+ *   \r: prt_tab_rjust advance to next tabstop, with right justification
+ */
+void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+       const char *unprinted_start = str;
+       const char *end = str + count;
+
+       if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
+               prt_bytes(out, str, count);
+               return;
+       }
+
+       while (str != end) {
+               switch (*str) {
+               case '\n':
+                       prt_bytes(out, unprinted_start, str - unprinted_start);
+                       unprinted_start = str + 1;
+                       prt_newline(out);
+                       break;
+               case '\t':
+                       if (likely(cur_tabstop(out))) {
+                               prt_bytes(out, unprinted_start, str - unprinted_start);
+                               unprinted_start = str + 1;
+                               __prt_tab(out);
+                       }
+                       break;
+               case '\r':
+                       if (likely(cur_tabstop(out))) {
+                               prt_bytes(out, unprinted_start, str - unprinted_start);
+                               unprinted_start = str + 1;
+                               __prt_tab_rjust(out);
+                       }
+                       break;
+               }
+
+               str++;
+       }
+
+       prt_bytes(out, unprinted_start, str - unprinted_start);
+}
+EXPORT_SYMBOL(prt_bytes_indented);
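/*
 * Editor's sketch, not part of this commit: feeding a string with embedded
 * '\n' and '\t' through prt_bytes_indented() so the control characters are
 * expanded against the printbuf's indent level and tabstops.
 */
#include <string.h>

static void usage_demo(struct printbuf *out)
{
        static const char usage[] =
                "subcommands:\n"
                "\tformat\tcreate a new filesystem\n"
                "\tfsck\tcheck an existing filesystem\n";

        out->_tabstops[0] = 8;
        out->_tabstops[1] = 24;
        out->nr_tabstops = 2;
        out->has_indent_or_tabstops = true;

        prt_bytes_indented(out, usage, strlen(usage));
}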
+
+/**
+ * prt_human_readable_u64 - Print out a u64 in human readable units
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ */
+void prt_human_readable_u64(struct printbuf *buf, u64 v)
+{
+       printbuf_make_room(buf, 10);
+       buf->pos += string_get_size(v, 1, !buf->si_units,
+                                   buf->buf + buf->pos,
+                                   printbuf_remaining_size(buf));
+}
+EXPORT_SYMBOL(prt_human_readable_u64);
+
+/**
+ * prt_human_readable_s64 - Print out a s64 in human readable units
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ */
+void prt_human_readable_s64(struct printbuf *buf, s64 v)
+{
+       if (v < 0)
+               prt_char(buf, '-');
+       prt_human_readable_u64(buf, abs(v));
+}
+EXPORT_SYMBOL(prt_human_readable_s64);
+
+/**
+ * prt_units_u64 - Print out a u64 according to printbuf unit options
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void prt_units_u64(struct printbuf *out, u64 v)
+{
+       if (out->human_readable_units)
+               prt_human_readable_u64(out, v);
+       else
+               prt_printf(out, "%llu", v);
+}
+EXPORT_SYMBOL(prt_units_u64);
+
+/**
+ * prt_units_s64 - Print out a s64 according to printbuf unit options
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void prt_units_s64(struct printbuf *out, s64 v)
+{
+       if (v < 0)
+               prt_char(out, '-');
+       prt_units_u64(out, abs(v));
+}
+EXPORT_SYMBOL(prt_units_s64);
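/*
 * Editor's sketch, not part of this commit: the two formatting knobs read by
 * the helpers above.  human_readable_units routes prt_units_*() through
 * prt_human_readable_*(); si_units switches from powers of 1024 to powers of
 * 1000.
 */
static void units_demo(struct printbuf *out)
{
        out->human_readable_units = true;
        prt_units_u64(out, 3 << 20);            /* "3.00 MiB" */

        prt_printf(out, " / ");

        out->si_units = true;
        prt_human_readable_u64(out, 3 << 20);   /* about "3.15 MB" */
}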
diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c
new file mode 100644 (file)
index 0000000..df9567c
--- /dev/null
@@ -0,0 +1,29 @@
+
+#include <stdio.h>
+#include <linux/printbuf.h>
+
+void prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+       int len;
+
+       do {
+               va_list args2;
+
+               va_copy(args2, args);
+               len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+       } while (len + 1 >= printbuf_remaining(out) &&
+                !printbuf_make_room(out, len + 1));
+
+       len = min_t(size_t, len,
+                 printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+       out->pos += len;
+}
+
+void prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       prt_vprintf(out, fmt, args);
+       va_end(args);
+}
diff --git a/linux/ratelimit.c b/linux/ratelimit.c
new file mode 100644 (file)
index 0000000..21a6d6c
--- /dev/null
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ratelimit.c - Do something with rate limit.
+ *
+ * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
+ *
+ * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
+ * parameter. Now every user can use their own standalone ratelimit_state.
+ */
+
+#include <linux/ratelimit.h>
+#include <linux/jiffies.h>
+#include <linux/export.h>
+
+/*
+ * __ratelimit - rate limiting
+ * @rs: ratelimit_state data
+ * @func: name of calling function
+ *
+ * This enforces a rate limit: not more than @rs->burst callbacks
+ * in every @rs->interval
+ *
+ * RETURNS:
+ * 0 means callbacks will be suppressed.
+ * 1 means go ahead and do it.
+ */
+int ___ratelimit(struct ratelimit_state *rs, const char *func)
+{
+       int ret;
+
+       if (!rs->interval)
+               return 1;
+
+       /*
+        * If we contend on this state's lock then almost
+        * by definition we are too busy to print a message,
+        * in addition to the one that will be printed by
+        * the entity that is holding the lock already:
+        */
+       if (!raw_spin_trylock(&rs->lock))
+               return 0;
+
+       if (!rs->begin)
+               rs->begin = jiffies;
+
+       if (time_is_before_jiffies(rs->begin + rs->interval)) {
+               if (rs->missed) {
+                       if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
+                               printk(KERN_WARNING
+                                      "%s: %d callbacks suppressed\n",
+                                      func, rs->missed);
+                               rs->missed = 0;
+                       }
+               }
+               rs->begin   = jiffies;
+               rs->printed = 0;
+       }
+       if (rs->burst && rs->burst > rs->printed) {
+               rs->printed++;
+               ret = 1;
+       } else {
+               rs->missed++;
+               ret = 0;
+       }
+       raw_spin_unlock(&rs->lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(___ratelimit);
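/*
 * Editor's sketch, not part of this commit: the usual calling convention,
 * assuming the DEFINE_RATELIMIT_STATE() and __ratelimit() helpers from
 * <linux/ratelimit.h>.  At most 10 messages per HZ-long interval; the rest
 * are counted and later reported as "callbacks suppressed".
 */
static DEFINE_RATELIMIT_STATE(demo_rs, HZ, 10);

static void warn_rate_limited(void)
{
        if (__ratelimit(&demo_rs))
                printk(KERN_WARNING "demo: something worth mentioning\n");
}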
index f6c979aa6ae1bf751139ad604c3e56ead7cc777f..23e288d845cb67da8030672e58038cbe5b49d2df 100644 (file)
@@ -2,6 +2,7 @@
 #include <stdio.h>
 
 #include <linux/list.h>
+#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/shrinker.h>
 
@@ -10,7 +11,7 @@
 static LIST_HEAD(shrinker_list);
 static DEFINE_MUTEX(shrinker_lock);
 
-int register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
 {
        mutex_lock(&shrinker_lock);
        list_add_tail(&shrinker->list, &shrinker_list);
@@ -39,46 +40,71 @@ static u64 parse_meminfo_line(const char *line)
        return v << 10;
 }
 
-static struct meminfo read_meminfo(void)
+void si_meminfo(struct sysinfo *val)
 {
-       struct meminfo ret = { 0 };
        size_t len, n = 0;
        char *line = NULL;
        const char *v;
        FILE *f;
 
+       memset(val, 0, sizeof(*val));
+       val->mem_unit = 1;
+
        f = fopen("/proc/meminfo", "r");
        if (!f)
-               return ret;
+               return;
 
        while ((len = getline(&line, &n, f)) != -1) {
                if ((v = strcmp_prefix(line, "MemTotal:")))
-                       ret.total = parse_meminfo_line(v);
+                       val->totalram = parse_meminfo_line(v);
 
                if ((v = strcmp_prefix(line, "MemAvailable:")))
-                       ret.available = parse_meminfo_line(v);
+                       val->freeram = parse_meminfo_line(v);
        }
 
        fclose(f);
        free(line);
+}
+
+static void run_shrinkers_allocation_failed(gfp_t gfp_mask)
+{
+       struct shrinker *shrinker;
+
+       mutex_lock(&shrinker_lock);
+       list_for_each_entry(shrinker, &shrinker_list, list) {
+               struct shrink_control sc = { .gfp_mask  = gfp_mask, };
+
+               unsigned long have = shrinker->count_objects(shrinker, &sc);
 
-       return ret;
+               sc.nr_to_scan = have / 8;
+
+               shrinker->scan_objects(shrinker, &sc);
+       }
+       mutex_unlock(&shrinker_lock);
 }
 
-void run_shrinkers(void)
+void run_shrinkers(gfp_t gfp_mask, bool allocation_failed)
 {
        struct shrinker *shrinker;
-       struct meminfo info;
+       struct sysinfo info;
        s64 want_shrink;
 
+       if (!(gfp_mask & GFP_KERNEL))
+               return;
+
        /* Fast out if there are no shrinkers to run. */
        if (list_empty(&shrinker_list))
                return;
 
-       info = read_meminfo();
+       if (allocation_failed) {
+               run_shrinkers_allocation_failed(gfp_mask);
+               return;
+       }
+
+       si_meminfo(&info);
 
-       if (info.total && info.available) {
-               want_shrink = (info.total >> 2) - info.available;
+       if (info.totalram && info.freeram) {
+               want_shrink = (info.totalram >> 2) - info.freeram;
 
                if (want_shrink <= 0)
                        return;
@@ -92,7 +118,8 @@ void run_shrinkers(void)
        mutex_lock(&shrinker_lock);
        list_for_each_entry(shrinker, &shrinker_list, list) {
                struct shrink_control sc = {
-                       .nr_to_scan = want_shrink >> PAGE_SHIFT
+                       .gfp_mask       = gfp_mask,
+                       .nr_to_scan     = want_shrink >> PAGE_SHIFT
                };
 
                shrinker->scan_objects(shrinker, &sc);
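/*
 * Editor's sketch, not part of this commit: a caller adapting to the new
 * register_shrinker() signature, which now takes a printf-style name.  The
 * shrinker itself is hypothetical; count_objects()/scan_objects() are the
 * callbacks invoked by run_shrinkers() above.
 */
static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
        return 0;                               /* nothing cached yet */
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
        return 0;                               /* nothing freed */
}

static struct shrinker demo_shrinker = {
        .count_objects  = demo_count,
        .scan_objects   = demo_scan,
};

static void demo_register(void)
{
        register_shrinker(&demo_shrinker, "bcachefs-demo");
}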
index fca1208720b67dfd7e96915679572737dd626dba..39f7ea79fdb17d121b5dfa3f80dffe5bf3bb041b 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
 #include <linux/six.h>
 #include <linux/slab.h>
 #define EBUG_ON(cond)          do {} while (0)
 #endif
 
-#define six_acquire(l, t)      lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_acquire(l, t, r)   lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_)
 #define six_release(l)         lock_release(l, _RET_IP_)
 
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
 struct six_lock_vals {
        /* Value we add to the lock in order to take the lock: */
        u64                     lock_val;
@@ -65,14 +68,15 @@ struct six_lock_vals {
 }
 
 static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
-                                union six_lock_state old)
+                                union six_lock_state old,
+                                struct task_struct *owner)
 {
        if (type != SIX_LOCK_intent)
                return;
 
        if (!old.intent_lock) {
                EBUG_ON(lock->owner);
-               lock->owner = current;
+               lock->owner = owner;
        } else {
                EBUG_ON(lock->owner != current);
        }
@@ -88,64 +92,21 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
        return read_count;
 }
 
-struct six_lock_waiter {
-       struct list_head        list;
-       struct task_struct      *task;
-};
-
 /* This is probably up there with the more evil things I've done */
 #define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
 
-static inline void six_lock_wakeup(struct six_lock *lock,
-                                  union six_lock_state state,
-                                  unsigned waitlist_id)
-{
-       if (waitlist_id == SIX_LOCK_write) {
-               if (state.write_locking && !state.read_lock) {
-                       struct task_struct *p = READ_ONCE(lock->owner);
-                       if (p)
-                               wake_up_process(p);
-               }
-       } else {
-               struct list_head *wait_list = &lock->wait_list[waitlist_id];
-               struct six_lock_waiter *w, *next;
-
-               if (!(state.waiters & (1 << waitlist_id)))
-                       return;
-
-               clear_bit(waitlist_bitnr(waitlist_id),
-                         (unsigned long *) &lock->state.v);
-
-               raw_spin_lock(&lock->wait_lock);
-
-               list_for_each_entry_safe(w, next, wait_list, list) {
-                       list_del_init(&w->list);
-
-                       if (wake_up_process(w->task) &&
-                           waitlist_id != SIX_LOCK_read) {
-                               if (!list_empty(wait_list))
-                                       set_bit(waitlist_bitnr(waitlist_id),
-                                               (unsigned long *) &lock->state.v);
-                               break;
-                       }
-               }
-
-               raw_spin_unlock(&lock->wait_lock);
-       }
-}
-
-static __always_inline bool do_six_trylock_type(struct six_lock *lock,
-                                               enum six_lock_type type,
-                                               bool try)
+static int __do_six_trylock_type(struct six_lock *lock,
+                                enum six_lock_type type,
+                                struct task_struct *task,
+                                bool try)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old, new;
-       bool ret;
+       int ret;
        u64 v;
 
-       EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+       EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
        EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
-
        EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
 
        /*
@@ -164,7 +125,6 @@ static __always_inline bool do_six_trylock_type(struct six_lock *lock,
         */
 
        if (type == SIX_LOCK_read && lock->readers) {
-retry:
                preempt_disable();
                this_cpu_inc(*lock->readers); /* signal that we own lock */
 
@@ -181,38 +141,21 @@ retry:
                 * lock, issue a wakeup because we might have caused a
                 * spurious trylock failure:
                 */
-               if (old.write_locking) {
-                       struct task_struct *p = READ_ONCE(lock->owner);
-
-                       if (p)
-                               wake_up_process(p);
-               }
-
-               /*
-                * If we failed from the lock path and the waiting bit wasn't
-                * set, set it:
-                */
-               if (!try && !ret) {
-                       v = old.v;
-
-                       do {
-                               new.v = old.v = v;
-
-                               if (!(old.v & l[type].lock_fail))
-                                       goto retry;
-
-                               if (new.waiters & (1 << type))
-                                       break;
-
-                               new.waiters |= 1 << type;
-                       } while ((v = atomic64_cmpxchg(&lock->state.counter,
-                                                      old.v, new.v)) != old.v);
-               }
+               if (old.write_locking)
+                       ret = -1 - SIX_LOCK_write;
        } else if (type == SIX_LOCK_write && lock->readers) {
                if (try) {
                        atomic64_add(__SIX_VAL(write_locking, 1),
                                     &lock->state.counter);
                        smp_mb__after_atomic();
+               } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
+                       atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
+                                    &lock->state.counter);
+                       /*
+                        * pairs with barrier after unlock and before checking
+                        * for readers in unlock path
+                        */
+                       smp_mb__after_atomic();
                }
 
                ret = !pcpu_read_count(lock);
@@ -229,7 +172,8 @@ retry:
 
                if (try && !ret) {
                        old.v = atomic64_add_return(v, &lock->state.counter);
-                       six_lock_wakeup(lock, old, SIX_LOCK_read);
+                       if (old.waiters & (1 << SIX_LOCK_read))
+                               ret = -1 - SIX_LOCK_read;
                } else {
                        atomic64_add(v, &lock->state.counter);
                }
@@ -243,8 +187,7 @@ retry:
 
                                if (type == SIX_LOCK_write)
                                        new.write_locking = 0;
-                       } else if (!try && type != SIX_LOCK_write &&
-                                  !(new.waiters & (1 << type)))
+                       } else if (!try && !(new.waiters & (1 << type)))
                                new.waiters |= 1 << type;
                        else
                                break; /* waiting bit already set */
@@ -256,14 +199,84 @@ retry:
                EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
        }
 
-       if (ret)
-               six_set_owner(lock, type, old);
+       if (ret > 0)
+               six_set_owner(lock, type, old, task);
 
-       EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+       EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
 
        return ret;
 }
 
+static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+       struct six_lock_waiter *w, *next;
+       struct task_struct *task;
+       bool saw_one;
+       int ret;
+again:
+       ret = 0;
+       saw_one = false;
+       raw_spin_lock(&lock->wait_lock);
+
+       list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+               if (w->lock_want != lock_type)
+                       continue;
+
+               if (saw_one && lock_type != SIX_LOCK_read)
+                       goto unlock;
+               saw_one = true;
+
+               ret = __do_six_trylock_type(lock, lock_type, w->task, false);
+               if (ret <= 0)
+                       goto unlock;
+
+               __list_del(w->list.prev, w->list.next);
+               task = w->task;
+               /*
+                * Do no writes to @w besides setting lock_acquired - otherwise
+                * we would need a memory barrier:
+                */
+               barrier();
+               w->lock_acquired = true;
+               wake_up_process(task);
+       }
+
+       clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
+unlock:
+       raw_spin_unlock(&lock->wait_lock);
+
+       if (ret < 0) {
+               lock_type = -ret - 1;
+               goto again;
+       }
+}
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+                                  union six_lock_state state,
+                                  enum six_lock_type lock_type)
+{
+       if (lock_type == SIX_LOCK_write && state.read_lock)
+               return;
+
+       if (!(state.waiters & (1 << lock_type)))
+               return;
+
+       __six_lock_wakeup(lock, lock_type);
+}
+
+static bool do_six_trylock_type(struct six_lock *lock,
+                               enum six_lock_type type,
+                               bool try)
+{
+       int ret;
+
+       ret = __do_six_trylock_type(lock, type, current, try);
+       if (ret < 0)
+               __six_lock_wakeup(lock, -ret - 1);
+
+       return ret > 0;
+}
+
 __always_inline __flatten
 static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
 {
@@ -271,7 +284,7 @@ static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
                return false;
 
        if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1);
+               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
        return true;
 }
 
@@ -304,15 +317,11 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
                 * Similar to the lock path, we may have caused a spurious write
                 * lock fail and need to issue a wakeup:
                 */
-               if (old.write_locking) {
-                       struct task_struct *p = READ_ONCE(lock->owner);
-
-                       if (p)
-                               wake_up_process(p);
-               }
+               if (old.write_locking)
+                       six_lock_wakeup(lock, old, SIX_LOCK_write);
 
                if (ret)
-                       six_acquire(&lock->dep_map, 1);
+                       six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
 
                return ret;
        }
@@ -327,41 +336,34 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
                                old.v,
                                old.v + l[type].lock_val)) != old.v);
 
-       six_set_owner(lock, type, old);
+       six_set_owner(lock, type, old, current);
        if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1);
+               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
        return true;
 }
 
 #ifdef CONFIG_LOCK_SPIN_ON_OWNER
 
-static inline int six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+                                      struct six_lock_waiter *wait)
 {
-       struct task_struct *owner;
-       int retval = 1;
+       struct task_struct *owner, *task = current;
 
-       if (need_resched())
-               return 0;
+       switch (wait->lock_want) {
+       case SIX_LOCK_read:
+               break;
+       case SIX_LOCK_intent:
+               if (lock->wait_list.next != &wait->list)
+                       return false;
+               break;
+       case SIX_LOCK_write:
+               return false;
+       }
 
        rcu_read_lock();
        owner = READ_ONCE(lock->owner);
-       if (owner)
-               retval = owner->on_cpu;
-       rcu_read_unlock();
-       /*
-        * if lock->owner is not set, the mutex owner may have just acquired
-        * it and not set the owner yet or the mutex has been released.
-        */
-       return retval;
-}
-
-static inline bool six_spin_on_owner(struct six_lock *lock,
-                                    struct task_struct *owner)
-{
-       bool ret = true;
 
-       rcu_read_lock();
-       while (lock->owner == owner) {
+       while (owner && lock->owner == owner) {
                /*
                 * Ensure we emit the owner->on_cpu, dereference _after_
                 * checking lock->owner still matches owner. If that fails,
@@ -370,85 +372,27 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
                 */
                barrier();
 
-               if (!owner->on_cpu || need_resched()) {
-                       ret = false;
-                       break;
-               }
-
-               cpu_relax();
-       }
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
-       struct task_struct *task = current;
-
-       if (type == SIX_LOCK_write)
-               return false;
-
-       preempt_disable();
-       if (!six_can_spin_on_owner(lock))
-               goto fail;
-
-       if (!osq_lock(&lock->osq))
-               goto fail;
-
-       while (1) {
-               struct task_struct *owner;
-
                /*
-                * If there's an owner, wait for it to either
-                * release the lock or go to sleep.
-                */
-               owner = READ_ONCE(lock->owner);
-               if (owner && !six_spin_on_owner(lock, owner))
-                       break;
-
-               if (do_six_trylock_type(lock, type, false)) {
-                       osq_unlock(&lock->osq);
-                       preempt_enable();
-                       return true;
-               }
-
-               /*
-                * When there's no owner, we might have preempted between the
-                * owner acquiring the lock and setting the owner field. If
-                * we're an RT task that will live-lock because we won't let
+                * If we're an RT task that will live-lock because we won't let
                 * the owner complete.
                 */
-               if (!owner && (need_resched() || rt_task(task)))
+               if (wait->lock_acquired ||
+                   !owner->on_cpu ||
+                   rt_task(task) ||
+                   need_resched())
                        break;
 
-               /*
-                * The cpu_relax() call is a compiler barrier which forces
-                * everything in this loop to be re-loaded. We don't need
-                * memory barriers as we'll eventually observe the right
-                * values at the cost of a few extra spins.
-                */
                cpu_relax();
        }
+       rcu_read_unlock();
 
-       osq_unlock(&lock->osq);
-fail:
-       preempt_enable();
-
-       /*
-        * If we fell out of the spin path because of need_resched(),
-        * reschedule now, before we try-lock again. This avoids getting
-        * scheduled out right after we obtained the lock.
-        */
-       if (need_resched())
-               schedule();
-
-       return false;
+       return wait->lock_acquired;
 }
 
 #else /* CONFIG_LOCK_SPIN_ON_OWNER */
 
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+                                      struct six_lock_waiter *wait)
 {
        return false;
 }
@@ -457,10 +401,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
 
 noinline
 static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
+                                   struct six_lock_waiter *wait,
                                    six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
        union six_lock_state old;
-       struct six_lock_waiter wait;
        int ret = 0;
 
        if (type == SIX_LOCK_write) {
@@ -469,47 +413,73 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                smp_mb__after_atomic();
        }
 
-       ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
-       if (ret)
-               goto out_before_sleep;
+       lock_contended(&lock->dep_map, _RET_IP_);
 
-       if (six_optimistic_spin(lock, type))
-               goto out_before_sleep;
+       wait->task              = current;
+       wait->lock_want         = type;
+       wait->lock_acquired     = false;
 
-       lock_contended(&lock->dep_map, _RET_IP_);
+       raw_spin_lock(&lock->wait_lock);
+       if (!(lock->state.waiters & (1 << type)))
+               set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v);
+       /*
+        * Retry taking the lock after taking waitlist lock, have raced with an
+        * unlock:
+        */
+       ret = __do_six_trylock_type(lock, type, current, false);
+       if (ret <= 0) {
+               wait->start_time = local_clock();
 
-       INIT_LIST_HEAD(&wait.list);
-       wait.task = current;
+               if (!list_empty(&lock->wait_list)) {
+                       struct six_lock_waiter *last =
+                               list_last_entry(&lock->wait_list,
+                                       struct six_lock_waiter, list);
+
+                       if (time_before_eq64(wait->start_time, last->start_time))
+                               wait->start_time = last->start_time + 1;
+               }
+
+               list_add_tail(&wait->list, &lock->wait_list);
+       }
+       raw_spin_unlock(&lock->wait_lock);
+
+       if (unlikely(ret > 0)) {
+               ret = 0;
+               goto out;
+       }
+
+       if (unlikely(ret < 0)) {
+               __six_lock_wakeup(lock, -ret - 1);
+               ret = 0;
+       }
+
+       if (six_optimistic_spin(lock, wait))
+               goto out;
 
        while (1) {
                set_current_state(TASK_UNINTERRUPTIBLE);
-               if (type == SIX_LOCK_write)
-                       EBUG_ON(lock->owner != current);
-               else if (list_empty_careful(&wait.list)) {
-                       raw_spin_lock(&lock->wait_lock);
-                       list_add_tail(&wait.list, &lock->wait_list[type]);
-                       raw_spin_unlock(&lock->wait_lock);
-               }
 
-               if (do_six_trylock_type(lock, type, false))
+               if (wait->lock_acquired)
                        break;
 
                ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
-               if (ret)
+               if (unlikely(ret)) {
+                       raw_spin_lock(&lock->wait_lock);
+                       if (!wait->lock_acquired)
+                               list_del(&wait->list);
+                       raw_spin_unlock(&lock->wait_lock);
+
+                       if (wait->lock_acquired)
+                               do_six_unlock_type(lock, type);
                        break;
+               }
 
                schedule();
        }
 
        __set_current_state(TASK_RUNNING);
-
-       if (!list_empty_careful(&wait.list)) {
-               raw_spin_lock(&lock->wait_lock);
-               list_del_init(&wait.list);
-               raw_spin_unlock(&lock->wait_lock);
-       }
-out_before_sleep:
-       if (ret && type == SIX_LOCK_write) {
+out:
+       if (ret && type == SIX_LOCK_write && lock->state.write_locking) {
                old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
                                            &lock->state.counter);
                six_lock_wakeup(lock, old, SIX_LOCK_read);
@@ -518,17 +488,20 @@ out_before_sleep:
        return ret;
 }
 
-__always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
-                          six_lock_should_sleep_fn should_sleep_fn, void *p)
+__always_inline __flatten
+static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+                        struct six_lock_waiter *wait,
+                        six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
        int ret;
 
+       wait->start_time = 0;
+
        if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 0);
+               six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read);
 
        ret = do_six_trylock_type(lock, type, true) ? 0
-               : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
+               : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p);
 
        if (ret && type != SIX_LOCK_write)
                six_release(&lock->dep_map);
@@ -538,28 +511,23 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
        return ret;
 }
 
+__always_inline
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
+                          six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       struct six_lock_waiter wait;
+
+       return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p);
+}
+
 __always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state state;
 
-       EBUG_ON(type == SIX_LOCK_write &&
-               !(lock->state.v & __SIX_LOCK_HELD_intent));
-
-       if (type != SIX_LOCK_write)
-               six_release(&lock->dep_map);
-
-       if (type == SIX_LOCK_intent) {
-               EBUG_ON(lock->owner != current);
-
-               if (lock->intent_lock_recurse) {
-                       --lock->intent_lock_recurse;
-                       return;
-               }
-
+       if (type == SIX_LOCK_intent)
                lock->owner = NULL;
-       }
 
        if (type == SIX_LOCK_read &&
            lock->readers) {
@@ -576,6 +544,27 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
        six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
 
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       EBUG_ON(type == SIX_LOCK_write &&
+               !(lock->state.v & __SIX_LOCK_HELD_intent));
+       EBUG_ON((type == SIX_LOCK_write ||
+                type == SIX_LOCK_intent) &&
+               lock->owner != current);
+
+       if (type != SIX_LOCK_write)
+               six_release(&lock->dep_map);
+
+       if (type == SIX_LOCK_intent &&
+           lock->intent_lock_recurse) {
+               --lock->intent_lock_recurse;
+               return;
+       }
+
+       do_six_unlock_type(lock, type);
+}
+
 #define __SIX_LOCK(type)                                               \
 bool six_trylock_##type(struct six_lock *lock)                         \
 {                                                                      \
@@ -596,6 +585,14 @@ int six_lock_##type(struct six_lock *lock,                         \
 }                                                                      \
 EXPORT_SYMBOL_GPL(six_lock_##type);                                    \
                                                                        \
+int six_lock_waiter_##type(struct six_lock *lock,                      \
+                          struct six_lock_waiter *wait,                \
+                          six_lock_should_sleep_fn should_sleep_fn, void *p)\
+{                                                                      \
+       return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\
+}                                                                      \
+EXPORT_SYMBOL_GPL(six_lock_waiter_##type);                             \
+                                                                       \
 void six_unlock_##type(struct six_lock *lock)                          \
 {                                                                      \
        __six_unlock_type(lock, SIX_LOCK_##type);                       \
@@ -639,7 +636,7 @@ bool six_lock_tryupgrade(struct six_lock *lock)
        if (lock->readers)
                this_cpu_dec(*lock->readers);
 
-       six_set_owner(lock, SIX_LOCK_intent, old);
+       six_set_owner(lock, SIX_LOCK_intent, old, current);
 
        return true;
 }
@@ -671,7 +668,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
 
-       six_acquire(&lock->dep_map, 0);
+       six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read);
 
        /* XXX: assert already locked, and that we don't overflow: */
 
@@ -698,47 +695,20 @@ EXPORT_SYMBOL_GPL(six_lock_increment);
 
 void six_lock_wakeup_all(struct six_lock *lock)
 {
+       union six_lock_state state = lock->state;
        struct six_lock_waiter *w;
 
-       raw_spin_lock(&lock->wait_lock);
+       six_lock_wakeup(lock, state, SIX_LOCK_read);
+       six_lock_wakeup(lock, state, SIX_LOCK_intent);
+       six_lock_wakeup(lock, state, SIX_LOCK_write);
 
-       list_for_each_entry(w, &lock->wait_list[0], list)
-               wake_up_process(w->task);
-       list_for_each_entry(w, &lock->wait_list[1], list)
+       raw_spin_lock(&lock->wait_lock);
+       list_for_each_entry(w, &lock->wait_list, list)
                wake_up_process(w->task);
-
        raw_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
 
-struct free_pcpu_rcu {
-       struct rcu_head         rcu;
-       void __percpu           *p;
-};
-
-static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
-{
-       struct free_pcpu_rcu *rcu =
-               container_of(_rcu, struct free_pcpu_rcu, rcu);
-
-       free_percpu(rcu->p);
-       kfree(rcu);
-}
-
-void six_lock_pcpu_free_rcu(struct six_lock *lock)
-{
-       struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
-
-       if (!rcu)
-               return;
-
-       rcu->p = lock->readers;
-       lock->readers = NULL;
-
-       call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
-
 void six_lock_pcpu_free(struct six_lock *lock)
 {
        BUG_ON(lock->readers && pcpu_read_count(lock));
@@ -757,3 +727,27 @@ void six_lock_pcpu_alloc(struct six_lock *lock)
 #endif
 }
 EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
+
+/*
+ * Returns lock held counts, for both read and intent
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+       struct six_lock_count ret;
+
+       ret.n[SIX_LOCK_read]    = 0;
+       ret.n[SIX_LOCK_intent]  = lock->state.intent_lock + lock->intent_lock_recurse;
+       ret.n[SIX_LOCK_write]   = lock->state.seq & 1;
+
+       if (!lock->readers)
+               ret.n[SIX_LOCK_read] += lock->state.read_lock;
+       else {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
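/*
 * Editor's sketch, not part of this commit: the new waiter-style entry point
 * (six_lock_waiter_##type, generated by __SIX_LOCK above) together with
 * six_lock_counts().  six_lock_init() is assumed from <linux/six.h>.
 */
static void six_demo(struct six_lock *l)
{
        struct six_lock_waiter wait;
        struct six_lock_count c;

        six_lock_waiter_intent(l, &wait, NULL, NULL);   /* no should_sleep_fn */

        c = six_lock_counts(l);
        /* here c.n[SIX_LOCK_intent] == 1, c.n[SIX_LOCK_write] == 0 */

        six_unlock_intent(l);
}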
index fd2797eaf7739319543d1c2ac3c4be7a533d1513..a32a8995ddc46cb611490122bb43b60a007b0419 100644 (file)
 
 #include <ctype.h>
 #include <errno.h>
+#include <limits.h>
 #include <string.h>
 
+#include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/string.h>
 
@@ -62,6 +64,31 @@ size_t strlcpy(char *dest, const char *src, size_t size)
        return ret;
 }
 
+ssize_t strscpy(char *dest, const char *src, size_t count)
+{
+       long res = 0;
+
+       if (count == 0 || WARN_ON_ONCE(count > INT_MAX))
+               return -E2BIG;
+
+       while (count) {
+               char c;
+
+               c = src[res];
+               dest[res] = c;
+               if (!c)
+                       return res;
+               res++;
+               count--;
+       }
+
+       /* Hit buffer length without finding a NUL; force NUL-termination. */
+       if (res)
+               dest[res-1] = '\0';
+
+       return -E2BIG;
+}
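/*
 * Editor's sketch, not part of this commit: unlike strlcpy(), strscpy()
 * reports truncation with -E2BIG instead of returning the untruncated source
 * length, and always NUL-terminates a non-empty destination.
 */
static void strscpy_demo(void)
{
        char name[8];
        ssize_t n;

        n = strscpy(name, "bcachefs", sizeof(name));
        /* n == -E2BIG; name holds "bcachef" plus the forced NUL */

        n = strscpy(name, "meta", sizeof(name));
        /* n == 4; the string fit, including its NUL */
}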
+
 void memzero_explicit(void *s, size_t count)
 {
        memset(s, 0, count);
diff --git a/linux/string_helpers.c b/linux/string_helpers.c
new file mode 100644 (file)
index 0000000..29c498a
--- /dev/null
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Helpers for formatting and printing strings
+ *
+ * Copyright 31 August 2008 James Bottomley
+ * Copyright (C) 2013, Intel Corporation
+ */
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/export.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/printbuf.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/string_helpers.h>
+
+/**
+ * string_get_size - get the size in the specified units
+ * @size:      The size to be converted in blocks
+ * @blk_size:  Size of the block (use 1 for size in bytes)
+ * @units:     units to use (powers of 1000 or 1024)
+ * @buf:       buffer to format to
+ * @len:       length of buffer
+ *
+ * This function returns a string formatted to 3 significant figures
+ * giving the size in the required units.  @buf should have room for
+ * at least 9 bytes and will always be zero terminated.
+ *
+ */
+int string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
+                   char *buf, int len)
+{
+       static const char *const units_10[] = {
+               "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+       };
+       static const char *const units_2[] = {
+               "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+       };
+       static const char *const *const units_str[] = {
+               [STRING_UNITS_10] = units_10,
+               [STRING_UNITS_2] = units_2,
+       };
+       static const unsigned int divisor[] = {
+               [STRING_UNITS_10] = 1000,
+               [STRING_UNITS_2] = 1024,
+       };
+       static const unsigned int rounding[] = { 500, 50, 5 };
+       int i = 0, j;
+       u32 remainder = 0, sf_cap;
+       char tmp[12];
+       const char *unit;
+
+       tmp[0] = '\0';
+
+       if (blk_size == 0)
+               size = 0;
+       if (size == 0)
+               goto out;
+
+       /* This is Napier's algorithm.  Reduce the original block size to
+        *
+        * coefficient * divisor[units]^i
+        *
+        * we do the reduction so both coefficients are just under 32 bits so
+        * that multiplying them together won't overflow 64 bits and we keep
+        * as much precision as possible in the numbers.
+        *
+        * Note: it's safe to throw away the remainders here because all the
+        * precision is in the coefficients.
+        */
+       while (blk_size >> 32) {
+               do_div(blk_size, divisor[units]);
+               i++;
+       }
+
+       while (size >> 32) {
+               do_div(size, divisor[units]);
+               i++;
+       }
+
+       /* now perform the actual multiplication keeping i as the sum of the
+        * two logarithms */
+       size *= blk_size;
+
+       /* and logarithmically reduce it until it's just under the divisor */
+       while (size >= divisor[units]) {
+               remainder = do_div(size, divisor[units]);
+               i++;
+       }
+
+       /* work out in j how many digits of precision we need from the
+        * remainder */
+       sf_cap = size;
+       for (j = 0; sf_cap*10 < 1000; j++)
+               sf_cap *= 10;
+
+       if (units == STRING_UNITS_2) {
+               /* express the remainder as a decimal.  It's currently the
+                * numerator of a fraction whose denominator is
+                * divisor[units], which is 1 << 10 for STRING_UNITS_2 */
+               remainder *= 1000;
+               remainder >>= 10;
+       }
+
+       /* add a 5 to the digit below what will be printed to ensure
+        * an arithmetical round up and carry it through to size */
+       remainder += rounding[j];
+       if (remainder >= 1000) {
+               remainder -= 1000;
+               size += 1;
+       }
+
+       if (j) {
+               snprintf(tmp, sizeof(tmp), ".%03u", remainder);
+               tmp[j+1] = '\0';
+       }
+
+ out:
+       if (i >= ARRAY_SIZE(units_2))
+               unit = "UNK";
+       else
+               unit = units_str[units][i];
+
+       return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit);
+}
+EXPORT_SYMBOL(string_get_size);
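/*
 * Editor's sketch, not part of this commit: formatting the same quantity of
 * 512-byte sectors in binary and SI units (the outputs follow from the
 * rounding logic above).
 */
static void size_demo(void)
{
        char buf[16];

        string_get_size(3221225472ULL, 512, STRING_UNITS_2, buf, sizeof(buf));
        /* buf == "1.50 TiB" */

        string_get_size(3221225472ULL, 512, STRING_UNITS_10, buf, sizeof(buf));
        /* buf == "1.65 TB" */
}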
index eb93786364eedd6282fa113b3458ecdd8c5bf9b6..7d519a4d85830108b8b0166ada31222d5fe6e59a 100644 (file)
@@ -93,9 +93,11 @@ do {                                                                 \
                                                                        \
        BUG_ON(_i >= (h)->used);                                        \
        (h)->used--;                                                    \
-       heap_swap(h, _i, (h)->used);                                    \
-       heap_sift_down(h, _i, cmp);                                     \
-       heap_sift(h, _i, cmp);                                          \
+       if ((_i) < (h)->used) {                                         \
+               heap_swap(h, _i, (h)->used);                            \
+               heap_sift_down(h, _i, cmp);                             \
+               heap_sift(h, _i, cmp);                                  \
+       }                                                               \
 } while (0)
 
 #define heap_pop(h, d, cmp)                                            \
diff --git a/linux/zstd_compress_module.c b/linux/zstd_compress_module.c
new file mode 100644 (file)
index 0000000..35cc5cb
--- /dev/null
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/zstd.h>
+
+#define ZSTD_FORWARD_IF_ERR(ret)            \
+       do {                                \
+               size_t const __ret = (ret); \
+               if (ZSTD_isError(__ret))    \
+                       return __ret;       \
+       } while (0)
+
+static size_t zstd_cctx_init(zstd_cctx *cctx, const zstd_parameters *parameters,
+       unsigned long long pledged_src_size)
+{
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_reset(
+               cctx, ZSTD_reset_session_and_parameters));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setPledgedSrcSize(
+               cctx, pledged_src_size));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_windowLog, parameters->cParams.windowLog));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_hashLog, parameters->cParams.hashLog));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_chainLog, parameters->cParams.chainLog));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_searchLog, parameters->cParams.searchLog));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_minMatch, parameters->cParams.minMatch));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_targetLength, parameters->cParams.targetLength));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_strategy, parameters->cParams.strategy));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_contentSizeFlag, parameters->fParams.contentSizeFlag));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_checksumFlag, parameters->fParams.checksumFlag));
+       ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+               cctx, ZSTD_c_dictIDFlag, !parameters->fParams.noDictIDFlag));
+       return 0;
+}
+
+int zstd_min_clevel(void)
+{
+       return ZSTD_minCLevel();
+}
+EXPORT_SYMBOL(zstd_min_clevel);
+
+int zstd_max_clevel(void)
+{
+       return ZSTD_maxCLevel();
+}
+EXPORT_SYMBOL(zstd_max_clevel);
+
+size_t zstd_compress_bound(size_t src_size)
+{
+       return ZSTD_compressBound(src_size);
+}
+EXPORT_SYMBOL(zstd_compress_bound);
+
+zstd_parameters zstd_get_params(int level,
+       unsigned long long estimated_src_size)
+{
+       return ZSTD_getParams(level, estimated_src_size, 0);
+}
+EXPORT_SYMBOL(zstd_get_params);
+
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams)
+{
+       return ZSTD_estimateCCtxSize_usingCParams(*cparams);
+}
+EXPORT_SYMBOL(zstd_cctx_workspace_bound);
+
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size)
+{
+       if (workspace == NULL)
+               return NULL;
+       return ZSTD_initStaticCCtx(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_cctx);
+
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+       const void *src, size_t src_size, const zstd_parameters *parameters)
+{
+       ZSTD_FORWARD_IF_ERR(zstd_cctx_init(cctx, parameters, src_size));
+       return ZSTD_compress2(cctx, dst, dst_capacity, src, src_size);
+}
+EXPORT_SYMBOL(zstd_compress_cctx);
+
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams)
+{
+       return ZSTD_estimateCStreamSize_usingCParams(*cparams);
+}
+EXPORT_SYMBOL(zstd_cstream_workspace_bound);
+
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+       unsigned long long pledged_src_size, void *workspace, size_t workspace_size)
+{
+       zstd_cstream *cstream;
+
+       if (workspace == NULL)
+               return NULL;
+
+       cstream = ZSTD_initStaticCStream(workspace, workspace_size);
+       if (cstream == NULL)
+               return NULL;
+
+       /* 0 means unknown in linux zstd API but means 0 in new zstd API */
+       if (pledged_src_size == 0)
+               pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN;
+
+       if (ZSTD_isError(zstd_cctx_init(cstream, parameters, pledged_src_size)))
+               return NULL;
+
+       return cstream;
+}
+EXPORT_SYMBOL(zstd_init_cstream);
+
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+       unsigned long long pledged_src_size)
+{
+       return ZSTD_resetCStream(cstream, pledged_src_size);
+}
+EXPORT_SYMBOL(zstd_reset_cstream);
+
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+       zstd_in_buffer *input)
+{
+       return ZSTD_compressStream(cstream, output, input);
+}
+EXPORT_SYMBOL(zstd_compress_stream);
+
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output)
+{
+       return ZSTD_flushStream(cstream, output);
+}
+EXPORT_SYMBOL(zstd_flush_stream);
+
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output)
+{
+       return ZSTD_endStream(cstream, output);
+}
+EXPORT_SYMBOL(zstd_end_stream);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Compressor");
diff --git a/linux/zstd_decompress_module.c b/linux/zstd_decompress_module.c
new file mode 100644 (file)
index 0000000..7e8cd44
--- /dev/null
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/zstd.h>
+
+/* Common symbols. zstd_compress must depend on zstd_decompress. */
+
+unsigned int zstd_is_error(size_t code)
+{
+       return ZSTD_isError(code);
+}
+EXPORT_SYMBOL(zstd_is_error);
+
+zstd_error_code zstd_get_error_code(size_t code)
+{
+       return ZSTD_getErrorCode(code);
+}
+EXPORT_SYMBOL(zstd_get_error_code);
+
+const char *zstd_get_error_name(size_t code)
+{
+       return ZSTD_getErrorName(code);
+}
+EXPORT_SYMBOL(zstd_get_error_name);
+
+/* Decompression symbols. */
+
+size_t zstd_dctx_workspace_bound(void)
+{
+       return ZSTD_estimateDCtxSize();
+}
+EXPORT_SYMBOL(zstd_dctx_workspace_bound);
+
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size)
+{
+       if (workspace == NULL)
+               return NULL;
+       return ZSTD_initStaticDCtx(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_dctx);
+
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+       const void *src, size_t src_size)
+{
+       return ZSTD_decompressDCtx(dctx, dst, dst_capacity, src, src_size);
+}
+EXPORT_SYMBOL(zstd_decompress_dctx);
+
+size_t zstd_dstream_workspace_bound(size_t max_window_size)
+{
+       return ZSTD_estimateDStreamSize(max_window_size);
+}
+EXPORT_SYMBOL(zstd_dstream_workspace_bound);
+
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+       size_t workspace_size)
+{
+       if (workspace == NULL)
+               return NULL;
+       (void)max_window_size;
+       return ZSTD_initStaticDStream(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_dstream);
+
+size_t zstd_reset_dstream(zstd_dstream *dstream)
+{
+       return ZSTD_resetDStream(dstream);
+}
+EXPORT_SYMBOL(zstd_reset_dstream);
+
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+       zstd_in_buffer *input)
+{
+       return ZSTD_decompressStream(dstream, output, input);
+}
+EXPORT_SYMBOL(zstd_decompress_stream);
+
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size)
+{
+       return ZSTD_findFrameCompressedSize(src, src_size);
+}
+EXPORT_SYMBOL(zstd_find_frame_compressed_size);
+
+size_t zstd_get_frame_header(zstd_frame_header *header, const void *src,
+       size_t src_size)
+{
+       return ZSTD_getFrameHeader(header, src, src_size);
+}
+EXPORT_SYMBOL(zstd_get_frame_header);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Decompressor");
index 42d3fb25b0fb0c4c173b0352ca25dc5229152bcb..8138f203b2ae7b09bfdd3fce07c3359c15bdea06 100644 (file)
@@ -4,7 +4,6 @@ final: prev: {
                tools = final.callPackage ../default.nix {
                        testWithValgrind = false;
                        filter = filter.lib;
-                       lastModified = builtins.substring 0 8 self.lastModifiedDate;
                        versionString = self.version;
                };
                toolsValgrind = final.bcachefs.tools.override {
diff --git a/qcow2.c b/qcow2.c
index 7cf4992fbd9ffd43bd1d7478bf85d1e41dbfe5ee..d01fa9417088198fee5310a705abd0e42ac3b131 100644 (file)
--- a/qcow2.c
+++ b/qcow2.c
@@ -94,7 +94,7 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
        ranges_sort_merge(data);
 
        /* Write data: */
-       darray_foreach(r, *data)
+       darray_for_each(*data, r)
                for (src_offset = r->start;
                     src_offset < r->end;
                     src_offset += block_size) {
diff --git a/shell.nix b/shell.nix
new file mode 100644 (file)
index 0000000..fc7929d
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,18 @@
+{ kversion ? "linux_5_15"
+, pkgs ? import <nixpkgs> {} }:
+
+with pkgs;
+
+let
+  tools = pkgs.callPackage ./default.nix { doCheck = false ;} ;
+in
+mkShell {
+  buildInputs = [
+    linuxKernel.packages.${kversion}.perf
+    gdb
+    ccls # code completion in neovim/emacs
+  ];
+  inputsFrom = [
+    tools
+  ];
+}
index d83e0529e8a67025bae097596332da57d9484628..612a08eefb3801bc99049716b2aa8c5693d2b90d 100644 (file)
    ...
    fun:call_rcu_data_init
 }
+{
+   urcu_memb_call_rcu
+   Memcheck:Leak
+   match-leak-kinds: possible
+   ...
+   fun:pthread_create*
+   obj:/*/liburcu.so.*
+   ...
+   fun:urcu_memb_call_rcu
+}
+{
+   pthread_create
+   Memcheck:Leak
+   match-leak-kinds: possible
+   fun:calloc
+   ...
+   fun:allocate_stack
+   fun:pthread_create*
+   fun:kthread_create
+   fun:bch2_rebalance_start
+}
index 9491779baffb59e5c83f8d3e9fde10285b1bc8be..f29d202618db4623e73284eafa8ec47eadae5ced 100644 (file)
@@ -126,63 +126,19 @@ struct stat xstat(const char *path)
        return statbuf;
 }
 
-/* Formatting: */
-
-int printf_pad(unsigned pad, const char * fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = vprintf(fmt, args);
-       va_end(args);
-
-       while (ret++ < pad)
-              putchar(' ');
-
-       return ret;
-}
+/* File parsing (i.e. sysfs) */
 
-struct units_buf __pr_units(s64 _v, enum units units)
+void write_file_str(int dirfd, const char *path, const char *str)
 {
-       struct units_buf ret;
-       char *out = ret.b, *end = out + sizeof(ret.b);
-       u64 v = _v;
+       int fd = xopenat(dirfd, path, O_WRONLY);
+       ssize_t wrote, len = strlen(str);
 
-       if (_v < 0) {
-               out += scnprintf(out, end - out, "-");
-               v = -_v;
-       }
-
-       switch (units) {
-       case BYTES:
-               snprintf(out, end - out, "%llu", v << 9);
-               break;
-       case SECTORS:
-               snprintf(out, end - out, "%llu", v);
-               break;
-       case HUMAN_READABLE:
-               v <<= 9;
-
-               if (v >= 1024) {
-                       int exp = log(v) / log(1024);
-                       snprintf(out, end - out, "%.1f%c",
-                                v / pow(1024, exp),
-                                "KMGTPE"[exp-1]);
-               } else {
-                       snprintf(out, end - out, "%llu", v);
-               }
-
-               break;
-       }
-
-       return ret;
+       wrote = write(fd, str, len);
+       if (wrote != len)
+               die("write error: %m");
+       close(fd);
 }
 
-/* Argument parsing stuff: */
-
-/* File parsing (i.e. sysfs) */
-
 char *read_file_str(int dirfd, const char *path)
 {
        int fd = xopenat(dirfd, path, O_RDONLY);
@@ -331,22 +287,21 @@ static int range_cmp(const void *_l, const void *_r)
 void ranges_sort_merge(ranges *r)
 {
        struct range *t, *i;
-       ranges tmp = { NULL };
+       ranges tmp = { 0 };
 
-       sort(&darray_item(*r, 0), darray_size(*r),
-            sizeof(darray_item(*r, 0)), range_cmp, NULL);
+       sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL);
 
        /* Merge contiguous ranges: */
-       darray_foreach(i, *r) {
-               t = tmp.size ?  &tmp.item[tmp.size - 1] : NULL;
+       darray_for_each(*r, i) {
+               t = tmp.nr ?  &tmp.data[tmp.nr - 1] : NULL;
 
                if (t && t->end >= i->start)
                        t->end = max(t->end, i->end);
                else
-                       darray_append(tmp, *i);
+                       darray_push(&tmp, *i);
        }
 
-       darray_free(*r);
+       darray_exit(r);
        *r = tmp;
 }
 
@@ -354,7 +309,7 @@ void ranges_roundup(ranges *r, unsigned block_size)
 {
        struct range *i;
 
-       darray_foreach(i, *r) {
+       darray_for_each(*r, i) {
                i->start = round_down(i->start, block_size);
                i->end  = round_up(i->end, block_size);
        }
@@ -364,7 +319,7 @@ void ranges_rounddown(ranges *r, unsigned block_size)
 {
        struct range *i;
 
-       darray_foreach(i, *r) {
+       darray_for_each(*r, i) {
                i->start = round_up(i->start, block_size);
                i->end  = round_down(i->end, block_size);
                i->end  = max(i->end, i->start);
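Here the old padding/units formatters are removed (presumably superseded by the printbuf/pretty-printer helpers added elsewhere in this release) and write_file_str() is added as the counterpart to read_file_str() for sysfs-style attributes addressed relative to a directory fd. A hedged usage sketch; the relabel() helper and the "label" attribute name are made up for illustration:

	/* Hypothetical usage of the helpers above; "label" is an example attribute. */
	static void relabel(int sysfs_dirfd)
	{
		char *old = read_file_str(sysfs_dirfd, "label");

		printf("old label: %s\n", old);
		write_file_str(sysfs_dirfd, "label", "new-label\n");
		free(old);
	}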
diff --git a/tools-util.h b/tools-util.h
index 9468f070f3729d369de681a413199b5156c0e8cd..d1122f5d1556d34970d5704c06dedb8eb832b583 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
-#include "ccan/darray/darray.h"
+#include "libbcachefs/darray.h"
 
 #define noreturn __attribute__((noreturn))
 
@@ -53,22 +53,7 @@ struct stat xstat(const char *);
        _ret;                                                           \
 })
 
-int printf_pad(unsigned pad, const char * fmt, ...);
-
-enum units {
-       BYTES,
-       SECTORS,
-       HUMAN_READABLE,
-};
-
-struct units_buf __pr_units(s64, enum units);
-
-struct units_buf {
-       char    b[20];
-};
-
-#define pr_units(_v, _u)       &(__pr_units(_v, _u).b[0])
-
+void write_file_str(int, const char *, const char *);
 char *read_file_str(int, const char *);
 u64 read_file_u64(int, const char *);
 
@@ -86,14 +71,14 @@ struct range {
        u64             end;
 };
 
-typedef darray(struct range) ranges;
+typedef DARRAY(struct range) ranges;
 
 static inline void range_add(ranges *data, u64 offset, u64 size)
 {
-       darray_append(*data, (struct range) {
+       darray_push(data, ((struct range) {
                .start = offset,
                .end = offset + size
-       });
+       }));
 }
 
 void ranges_sort_merge(ranges *);
@@ -109,9 +94,9 @@ struct hole_iter {
 static inline struct range hole_iter_next(struct hole_iter *iter)
 {
        struct range r = {
-               .start  = iter->idx ? iter->r.item[iter->idx - 1].end : 0,
-               .end    = iter->idx < iter->r.size
-                       ? iter->r.item[iter->idx].start : iter->end,
+               .start  = iter->idx ? iter->r.data[iter->idx - 1].end : 0,
+               .end    = iter->idx < iter->r.nr
+                       ? iter->r.data[iter->idx].start : iter->end,
        };
 
        BUG_ON(r.start > r.end);
@@ -122,7 +107,7 @@ static inline struct range hole_iter_next(struct hole_iter *iter)
 
 #define for_each_hole(_iter, _ranges, _end, _i)                                \
        for (_iter = (struct hole_iter) { .r = _ranges, .end = _end };  \
-            (_iter.idx <= _iter.r.size &&                              \
+            (_iter.idx <= _iter.r.nr &&                                \
              (_i = hole_iter_next(&_iter), true));)
 
 #include <linux/fiemap.h>
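The hole iterator keeps its shape but now indexes the darray through .data/.nr. A brief sketch of how for_each_hole() is driven, assuming the ranges have already been through ranges_sort_merge(); count_hole_bytes() and its dev_size argument are hypothetical:

	/* Hypothetical: total up the gaps between sorted, merged byte ranges. */
	static u64 count_hole_bytes(ranges extents, u64 dev_size)
	{
		struct hole_iter iter;
		struct range hole;
		u64 total = 0;

		for_each_hole(iter, extents, dev_size, hole)
			total += hole.end - hole.start;

		return total;
	}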