git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to ea93c26e98 fixup! bcachefs: We can handle missing btree...
author Kent Overstreet <kent.overstreet@linux.dev>
Thu, 9 Feb 2023 23:34:08 +0000 (18:34 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Thu, 9 Feb 2023 23:36:24 +0000 (18:36 -0500)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
105 files changed:
.bcachefs_revision
Makefile
Makefile.compiler
include/linux/kernel.h
include/linux/mean_and_variance.h
include/linux/poison.h
include/linux/prandom.h
include/linux/sched.h
include/linux/seq_buf.h [new file with mode: 0644]
include/linux/shrinker.h
include/linux/six.h
include/linux/slab.h
include/linux/wait.h
include/trace/events/bcachefs.h
include/trace/events/lock.h [new file with mode: 0644]
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/backpointers.c
libbcachefs/backpointers.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.c
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets_waiting_for_journal.c
libbcachefs/buckets_waiting_for_journal_types.h
libbcachefs/data_update.c
libbcachefs/data_update.h
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/errcode.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_types.h
libbcachefs/lru.c
libbcachefs/lru.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/nocow_locking.c
libbcachefs/nocow_locking.h
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/printbuf.c [moved from linux/printbuf.c with 72% similarity]
libbcachefs/printbuf.h [moved from include/linux/printbuf.h with 76% similarity]
libbcachefs/quota.c
libbcachefs/quota.h
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/replicas_types.h
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/xattr.c
libbcachefs/xattr.h
linux/blkdev.c
linux/mean_and_variance.c
linux/pretty-printers.c [deleted file]
linux/printbuf_userspace.c [deleted file]
linux/seq_buf.c [new file with mode: 0644]
linux/six.c
linux/string_helpers.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 615d94b8ae9b15c82636b4cbc9afa17960f4d8e0..3ca1265b4cf436de5e94c02a787f5968be51d9e1 100644 (file)
@@ -1 +1 @@
-0939e1c73231c779c961e1143e1ba489ef2b168c
+ea93c26e98081d8e1a5fc138e6334b3631983d77
diff --git a/Makefile b/Makefile
index 743f6ca933488d0578fd62e7621abaeb43b6d82f..bce10d5b76dd487a7c9392539b6f1665e441626a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -221,14 +221,6 @@ update-bcachefs-sources:
        git add linux/generic-radix-tree.c
        cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/
        git add include/linux/kmemleak.h
-       cp $(LINUX_DIR)/include/linux/printbuf.h include/linux/
-       git add include/linux/printbuf.h
-       cp $(LINUX_DIR)/lib/printbuf.c linux/
-       git add linux/printbuf.c
-       cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
-       git add linux/mean_and_variance.c
-       cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
-       git add include/linux/mean_and_variance.h
        cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
        git add linux/int_sqrt.c
        cp $(LINUX_DIR)/scripts/Makefile.compiler ./
diff --git a/Makefile.compiler b/Makefile.compiler
index 94d0d40cddb3d614facd505e46d4d83cbc7f7e10..20d353dcabfbc50b0419936724c57a4a6f2f22e3 100644 (file)
@@ -61,9 +61,13 @@ cc-option-yn = $(call try-run,\
 cc-disable-warning = $(call try-run,\
        $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
 
-# cc-ifversion
-# Usage:  EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1)
-cc-ifversion = $(shell [ $(CONFIG_GCC_VERSION)0 $(1) $(2)000 ] && echo $(3) || echo $(4))
+# gcc-min-version
+# Usage: cflags-$(call gcc-min-version, 70100) += -foo
+gcc-min-version = $(shell [ $(CONFIG_GCC_VERSION)0 -ge $(1)0 ] && echo y)
+
+# clang-min-version
+# Usage: cflags-$(call clang-min-version, 110000) += -foo
+clang-min-version = $(shell [ $(CONFIG_CLANG_VERSION)0 -ge $(1)0 ] && echo y)
 
 # ld-option
 # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b2c1751c56e2fcdffb87d7ef34c49571a0f11e36..a21b7cc3041cb032f2f173efedd4c8c59e8717ac 100644 (file)
@@ -264,4 +264,7 @@ struct qstr {
 
 static inline void dump_stack(void) {}
 
+#define unsafe_memcpy(dst, src, bytes, justification)          \
+       memcpy(dst, src, bytes)
+
 #endif
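In this userspace compatibility header, the kernel's unsafe_memcpy() (which exists to suppress FORTIFY_SOURCE bounds checking on copies the caller has justified, typically into flexible-array members) is reduced to a plain memcpy(), since no fortification runs in userspace. A hedged sketch of the kind of call site this shim supports; the struct and function names are illustrative, not from this commit:

struct entry {
	u16	len;
	u8	data[];		/* flexible array member */
};

static void entry_set(struct entry *e, const void *src, u16 len)
{
	e->len = len;
	/* the justification string is documentation only; with the shim
	 * above this compiles to a plain memcpy() */
	unsafe_memcpy(e->data, src, len,
		      "len is bounded by the allocation size");
}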
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
index b7fa5e96562f30b417cc55822842af24b870e220..756eb3d1ca641a2acebf5d52b05ebb551eaad5f2 100644 (file)
@@ -2,13 +2,35 @@
 #ifndef MEAN_AND_VARIANCE_H_
 #define MEAN_AND_VARIANCE_H_
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/limits.h>
 #include <linux/math64.h>
-#include <linux/printbuf.h>
 
 #define SQRT_U64_MAX 4294967295ULL
 
+/**
+ * abs - return absolute value of an argument
+ * @x: the value.  If it is unsigned type, it is converted to signed type first.
+ *     char is treated as if it was signed (regardless of whether it really is)
+ *     but the macro's return type is preserved as char.
+ *
+ * Return: an absolute value of x.
+ */
+#define abs(x) __abs_choose_expr(x, long long,                         \
+               __abs_choose_expr(x, long,                              \
+               __abs_choose_expr(x, int,                               \
+               __abs_choose_expr(x, short,                             \
+               __abs_choose_expr(x, char,                              \
+               __builtin_choose_expr(                                  \
+                       __builtin_types_compatible_p(typeof(x), char),  \
+                       (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
+                       ((void)0)))))))
+
+#define __abs_choose_expr(x, type, other) __builtin_choose_expr(       \
+       __builtin_types_compatible_p(typeof(x),   signed type) ||       \
+       __builtin_types_compatible_p(typeof(x), unsigned type),         \
+       ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
 
 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
 
diff --git a/include/linux/poison.h b/include/linux/poison.h
index d62ef5a6b4e9c624383cd92b3773a9ea44e2c500..2d3249eb0e62d4b29f03e6e03a21934f555278aa 100644 (file)
@@ -81,4 +81,7 @@
 /********** net/core/page_pool.c **********/
 #define PP_SIGNATURE           (0x40 + POISON_POINTER_DELTA)
 
+/********** kernel/bpf/ **********/
+#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
+
 #endif
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index 6f177cddda399cf03916de4034eeadfe7f4b683f..9aea22dc92e745c71e3cc4d686c8a169498c555f 100644 (file)
@@ -23,5 +23,11 @@ prandom_type(u32);
 prandom_type(u64);
 #undef prandom_type
 
+static inline u32 prandom_u32_max(u32 max)
+{
+       return prandom_u32() % max;
+
+}
+
 #endif /* _LINUX_PRANDOM_H */
 
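prandom_u32_max() here bounds the random value with a plain modulo, which carries a slight bias toward low results whenever max does not divide 2^32 evenly; for this userspace shim that tradeoff is presumably acceptable. A usage sketch with a hypothetical caller:

static u32 pick_bucket(u32 nr_buckets)
{
	/* returns a value in [0, nr_buckets) */
	return prandom_u32_max(nr_buckets);
}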
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac6d27bb6b3bb3079c399330d7ec965408482676..fef7e323f4701fe04009d1791553823854f8f3a8 100644 (file)
@@ -28,6 +28,7 @@
 #define TASK_NEW               2048
 #define TASK_IDLE_WORKER       4096
 #define TASK_STATE_MAX         8192
+#define TASK_FREEZABLE         (1U << 14)
 
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE          (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
new file mode 100644 (file)
index 0000000..8c9c0dd
--- /dev/null
+++ b/include/linux/seq_buf.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SEQ_BUF_H
+#define _LINUX_SEQ_BUF_H
+
+#include <linux/kernel.h>
+#include <stdarg.h>
+#include <string.h>
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use.
+ */
+
+/**
+ * seq_buf - seq buffer structure
+ * @buffer:    pointer to the buffer
+ * @size:      size of the buffer
+ * @len:       the amount of data inside the buffer
+ * @readpos:   The next position to read in the buffer.
+ */
+struct seq_buf {
+       char                    *buffer;
+       size_t                  size;
+       size_t                  len;
+       loff_t                  readpos;
+};
+
+static inline void seq_buf_clear(struct seq_buf *s)
+{
+       s->len = 0;
+       s->readpos = 0;
+}
+
+static inline void
+seq_buf_init(struct seq_buf *s, char *buf, unsigned int size)
+{
+       s->buffer = buf;
+       s->size = size;
+       seq_buf_clear(s);
+}
+
+/*
+ * A seq_buf has a buffer that might overflow. When this happens,
+ * len and size are set to be equal.
+ */
+static inline bool
+seq_buf_has_overflowed(struct seq_buf *s)
+{
+       return s->len > s->size;
+}
+
+static inline void
+seq_buf_set_overflow(struct seq_buf *s)
+{
+       s->len = s->size + 1;
+}
+
+/*
+ * How much buffer is left on the seq_buf?
+ */
+static inline unsigned int
+seq_buf_buffer_left(struct seq_buf *s)
+{
+       if (seq_buf_has_overflowed(s))
+               return 0;
+
+       return s->size - s->len;
+}
+
+/* How much buffer was written? */
+static inline unsigned int seq_buf_used(struct seq_buf *s)
+{
+       return min(s->len, s->size);
+}
+
+/**
+ * seq_buf_terminate - Make sure buffer is nul terminated
+ * @s: the seq_buf descriptor to terminate.
+ *
+ * This makes sure that the buffer in @s is nul terminated and
+ * safe to read as a string.
+ *
+ * Note, if this is called when the buffer has overflowed, then
+ * the last byte of the buffer is zeroed, and the len will still
+ * point past it.
+ *
+ * After this function is called, s->buffer is safe to use
+ * in string operations.
+ */
+static inline void seq_buf_terminate(struct seq_buf *s)
+{
+       if (WARN_ON(s->size == 0))
+               return;
+
+       if (seq_buf_buffer_left(s))
+               s->buffer[s->len] = 0;
+       else
+               s->buffer[s->size - 1] = 0;
+}
+
+/**
+ * seq_buf_get_buf - get buffer to write arbitrary data to
+ * @s: the seq_buf handle
+ * @bufp: the beginning of the buffer is stored here
+ *
+ * Return the number of bytes available in the buffer, or zero if
+ * there's no space.
+ */
+static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp)
+{
+       WARN_ON(s->len > s->size + 1);
+
+       if (s->len < s->size) {
+               *bufp = s->buffer + s->len;
+               return s->size - s->len;
+       }
+
+       *bufp = NULL;
+       return 0;
+}
+
+/**
+ * seq_buf_commit - commit data to the buffer
+ * @s: the seq_buf handle
+ * @num: the number of bytes to commit
+ *
+ * Commit @num bytes of data written to a buffer previously acquired
+ * by seq_buf_get_buf().  To signal an error condition, or that the data
+ * didn't fit in the available space, pass a negative @num value.
+ */
+static inline void seq_buf_commit(struct seq_buf *s, int num)
+{
+       if (num < 0) {
+               seq_buf_set_overflow(s);
+       } else {
+               /* num must be negative on overflow */
+               BUG_ON(s->len + num > s->size);
+               s->len += num;
+       }
+}
+
+extern __printf(2, 3)
+int seq_buf_printf(struct seq_buf *s, const char *fmt, ...);
+extern __printf(2, 0)
+int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args);
+extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf,
+                          int cnt);
+extern int seq_buf_puts(struct seq_buf *s, const char *str);
+extern int seq_buf_putc(struct seq_buf *s, unsigned char c);
+
+void seq_buf_human_readable_u64(struct seq_buf *, u64);
+
+#endif /* _LINUX_SEQ_BUF_H */
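The header above yields the usual seq_buf usage pattern: initialize over a caller-supplied buffer, append with the printf-style helpers, then nul-terminate before treating the contents as a string. A minimal sketch using only functions declared above:

	char buf[128];
	struct seq_buf s;

	seq_buf_init(&s, buf, sizeof(buf));
	seq_buf_printf(&s, "free=%u", 42);
	seq_buf_puts(&s, " state=ok");
	seq_buf_terminate(&s);	/* buf is nul terminated, even on overflow */
	printf("%s\n", buf);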
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index ebbab7a68c925a444c910922a97b338492b23cb0..bca00d61c27f7c00c30eb19f2dd59fcc8548b662 100644 (file)
@@ -11,13 +11,13 @@ struct shrink_control {
 
 #define SHRINK_STOP (~0UL)
 
-struct printbuf;
+struct seq_buf;
 struct shrinker {
        unsigned long (*count_objects)(struct shrinker *,
                                       struct shrink_control *sc);
        unsigned long (*scan_objects)(struct shrinker *,
                                      struct shrink_control *sc);
-       void (*to_text)(struct printbuf *, struct shrinker *);
+       void (*to_text)(struct seq_buf *, struct shrinker *);
 
        int seeks;      /* seeks to recreate an obj */
        long batch;     /* reclaim batch size, 0 = default */
diff --git a/include/linux/six.h b/include/linux/six.h
index 362a577b968e9da0eb5a54578934ed0dd5963fc0..16ad2073f71c551004eb2327bd28040b57c38676 100644 (file)
@@ -59,6 +59,7 @@
  */
 
 #include <linux/lockdep.h>
+#include <linux/osq_lock.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 
@@ -79,9 +80,10 @@ union six_lock_state {
        };
 
        struct {
-               unsigned        read_lock:27;
+               unsigned        read_lock:26;
                unsigned        write_locking:1;
                unsigned        intent_lock:1;
+               unsigned        nospin:1;
                unsigned        waiters:3;
                /*
                 * seq works much like in seqlocks: it's incremented every time
@@ -104,10 +106,10 @@ enum six_lock_type {
 
 struct six_lock {
        union six_lock_state    state;
+       unsigned                intent_lock_recurse;
        struct task_struct      *owner;
        unsigned __percpu       *readers;
-       unsigned                intent_lock_recurse;
-       unsigned long           ip;
+       struct optimistic_spin_queue osq;
        raw_spinlock_t          wait_lock;
        struct list_head        wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -148,12 +150,37 @@ do {                                                                      \
 #define __SIX_VAL(field, _v)   (((union six_lock_state) { .field = _v }).v)
 
 #define __SIX_LOCK(type)                                               \
-bool six_trylock_##type(struct six_lock *);                            \
-bool six_relock_##type(struct six_lock *, u32);                                \
-int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
-int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *,        \
-                          six_lock_should_sleep_fn, void *);           \
-void six_unlock_##type(struct six_lock *);
+bool six_trylock_ip_##type(struct six_lock *, unsigned long);          \
+bool six_relock_ip_##type(struct six_lock *, u32, unsigned long);      \
+int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn,    \
+                      void *, unsigned long);                          \
+int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\
+                       six_lock_should_sleep_fn, void *, unsigned long);\
+void six_unlock_ip_##type(struct six_lock *, unsigned long);           \
+                                                                       \
+static inline bool six_trylock_##type(struct six_lock *lock)           \
+{                                                                      \
+       return six_trylock_ip_##type(lock, _THIS_IP_);                  \
+}                                                                      \
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq)   \
+{                                                                      \
+       return six_relock_ip_##type(lock, seq, _THIS_IP_);              \
+}                                                                      \
+static inline int six_lock_##type(struct six_lock *lock,               \
+                                 six_lock_should_sleep_fn fn, void *p)\
+{                                                                      \
+       return six_lock_ip_##type(lock, fn, p, _THIS_IP_);              \
+}                                                                      \
+static inline int six_lock_waiter_##type(struct six_lock *lock,                \
+                       struct six_lock_waiter *wait,                   \
+                       six_lock_should_sleep_fn fn, void *p)           \
+{                                                                      \
+       return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \
+}                                                                      \
+static inline void six_unlock_##type(struct six_lock *lock)            \
+{                                                                      \
+       return six_unlock_ip_##type(lock, _THIS_IP_);                   \
+}
 
 __SIX_LOCK(read)
 __SIX_LOCK(intent)
@@ -189,6 +216,14 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
        SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
 }
 
+static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+                               struct six_lock_waiter *wait,
+                               six_lock_should_sleep_fn should_sleep_fn, void *p,
+                               unsigned long ip)
+{
+       SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip);
+}
+
 static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
                                struct six_lock_waiter *wait,
                                six_lock_should_sleep_fn should_sleep_fn, void *p)
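The __SIX_LOCK() rework threads an explicit instruction-pointer argument through every operation: the new _ip_ variants take the acquisition point, and the old names become inline wrappers passing _THIS_IP_, so existing callers are unchanged while outer layers (the btree locking code) can forward the address of their own caller for lock tracing. The pattern in isolation, with hypothetical names:

void my_lock_ip(struct my_lock *l, unsigned long ip);

/* direct callers record their own address */
static inline void my_lock(struct my_lock *l)
{
	my_lock_ip(l, _THIS_IP_);
}

/* an outer layer forwards the address of *its* caller instead, so
 * tracepoints attribute the acquisition to the interesting frame */
void outer_op(struct my_lock *l, unsigned long caller_ip)
{
	my_lock_ip(l, caller_ip);
}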
diff --git a/include/linux/slab.h b/include/linux/slab.h
index cf48570c1580e0ea092a4380bbc2a650829b496c..ff122ff9abda259e6ad8574b7544c5491e875f45 100644 (file)
@@ -174,6 +174,11 @@ static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp)
        return kmalloc(c->obj_size, gfp);
 }
 
+static inline void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t gfp)
+{
+       return kzalloc(c->obj_size, gfp);
+}
+
 static inline void kmem_cache_free(struct kmem_cache *c, void *p)
 {
        kfree(p);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index d30fb10d63f5b38b6076015d9d97ab47755eb931..4b9cbf38d52afef017d59d215bfe4006798afbf0 100644 (file)
@@ -18,10 +18,12 @@ struct __wait_queue {
        struct list_head        task_list;
 };
 
-typedef struct {
+struct wait_queue_head {
        spinlock_t              lock;
        struct list_head        task_list;
-} wait_queue_head_t;
+};
+
+typedef struct wait_queue_head wait_queue_head_t;
 
 void wake_up(wait_queue_head_t *);
 void wake_up_all(wait_queue_head_t *);
@@ -42,7 +44,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *ke
        .task_list      = { &(name).task_list, &(name).task_list } }
 
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
-       wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
+       struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
 
 static inline void init_waitqueue_head(wait_queue_head_t *q)
 {
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index f699146aad2640795aef73c6cad912cc13beb7f8..ca5d6c8a4ddbe7503e15cdb6af41abe5c6184d68 100644 (file)
@@ -514,34 +514,10 @@ DEFINE_EVENT(bch_fs, gc_gens_end,
 
 /* Allocator */
 
-TRACE_EVENT(bucket_alloc,
-       TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
-                bool user, u64 bucket),
-       TP_ARGS(ca, alloc_reserve, user, bucket),
-
-       TP_STRUCT__entry(
-               __field(dev_t,                  dev     )
-               __array(char,   reserve,        16      )
-               __field(bool,                   user    )
-               __field(u64,                    bucket  )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = ca->dev;
-               strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
-               __entry->user           = user;
-               __entry->bucket         = bucket;
-       ),
-
-       TP_printk("%d,%d reserve %s user %u bucket %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->reserve,
-                 __entry->user,
-                 __entry->bucket)
-);
-
-TRACE_EVENT(bucket_alloc_fail,
+DECLARE_EVENT_CLASS(bucket_alloc,
        TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+                bool user,
+                u64 bucket,
                 u64 free,
                 u64 avail,
                 u64 copygc_wait_amount,
@@ -549,12 +525,15 @@ TRACE_EVENT(bucket_alloc_fail,
                 struct bucket_alloc_state *s,
                 bool nonblocking,
                 const char *err),
-       TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for,
+       TP_ARGS(ca, alloc_reserve, user, bucket, free, avail,
+               copygc_wait_amount, copygc_waiting_for,
                s, nonblocking, err),
 
        TP_STRUCT__entry(
                __field(dev_t,                  dev                     )
                __array(char,   reserve,        16                      )
+               __field(bool,                   user    )
+               __field(u64,                    bucket  )
                __field(u64,                    free                    )
                __field(u64,                    avail                   )
                __field(u64,                    copygc_wait_amount      )
@@ -571,6 +550,8 @@ TRACE_EVENT(bucket_alloc_fail,
        TP_fast_assign(
                __entry->dev            = ca->dev;
                strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+               __entry->user           = user;
+               __entry->bucket         = bucket;
                __entry->free           = free;
                __entry->avail          = avail;
                __entry->copygc_wait_amount     = copygc_wait_amount;
@@ -584,9 +565,11 @@ TRACE_EVENT(bucket_alloc_fail,
                strscpy(__entry->err, err, sizeof(__entry->err));
        ),
 
-       TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u nocow %llu err %s",
+       TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->reserve,
+                 __entry->user,
+                 __entry->bucket,
                  __entry->free,
                  __entry->avail,
                  __entry->copygc_wait_amount,
@@ -595,11 +578,43 @@ TRACE_EVENT(bucket_alloc_fail,
                  __entry->open,
                  __entry->need_journal_commit,
                  __entry->nouse,
-                 __entry->nonblocking,
                  __entry->nocow,
+                 __entry->nonblocking,
                  __entry->err)
 );
 
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
+       TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+                bool user,
+                u64 bucket,
+                u64 free,
+                u64 avail,
+                u64 copygc_wait_amount,
+                s64 copygc_waiting_for,
+                struct bucket_alloc_state *s,
+                bool nonblocking,
+                const char *err),
+       TP_ARGS(ca, alloc_reserve, user, bucket, free, avail,
+               copygc_wait_amount, copygc_waiting_for,
+               s, nonblocking, err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
+       TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+                bool user,
+                u64 bucket,
+                u64 free,
+                u64 avail,
+                u64 copygc_wait_amount,
+                s64 copygc_waiting_for,
+                struct bucket_alloc_state *s,
+                bool nonblocking,
+                const char *err),
+       TP_ARGS(ca, alloc_reserve, user, bucket, free, avail,
+               copygc_wait_amount, copygc_waiting_for,
+               s, nonblocking, err)
+);
+
 TRACE_EVENT(discard_buckets,
        TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
                 u64 need_journal_commit, u64 discarded, const char *err),
@@ -673,7 +688,7 @@ DEFINE_EVENT(bkey, move_extent_finish,
        TP_ARGS(k)
 );
 
-DEFINE_EVENT(bkey, move_extent_race,
+DEFINE_EVENT(bkey, move_extent_fail,
        TP_PROTO(const struct bkey *k),
        TP_ARGS(k)
 );
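The two bucket_alloc tracepoints previously duplicated their entire field/assign/print blocks; merging them into one DECLARE_EVENT_CLASS() with two DEFINE_EVENT()s is the standard way to share a tracepoint body. The shape of the idiom, reduced to a minimal skeleton with illustrative names:

DECLARE_EVENT_CLASS(my_class,
	TP_PROTO(u64 val),
	TP_ARGS(val),
	TP_STRUCT__entry(__field(u64, val)),
	TP_fast_assign(__entry->val = val;),
	TP_printk("val %llu", __entry->val)
);

/* each DEFINE_EVENT creates a tracepoint sharing the class body */
DEFINE_EVENT(my_class, my_event_ok,
	TP_PROTO(u64 val),
	TP_ARGS(val)
);

DEFINE_EVENT(my_class, my_event_fail,
	TP_PROTO(u64 val),
	TP_ARGS(val)
);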
diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
new file mode 100644 (file)
index 0000000..9ebd081
--- /dev/null
+++ b/include/trace/events/lock.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lock
+
+#if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCK_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+/* flags for lock:contention_begin */
+#define LCB_F_SPIN     (1U << 0)
+#define LCB_F_READ     (1U << 1)
+#define LCB_F_WRITE    (1U << 2)
+#define LCB_F_RT       (1U << 3)
+#define LCB_F_PERCPU   (1U << 4)
+#define LCB_F_MUTEX    (1U << 5)
+
+
+#ifdef CONFIG_LOCKDEP
+
+#include <linux/lockdep.h>
+
+TRACE_EVENT(lock_acquire,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+               int trylock, int read, int check,
+               struct lockdep_map *next_lock, unsigned long ip),
+
+       TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+
+       TP_STRUCT__entry(
+               __field(unsigned int, flags)
+               __string(name, lock->name)
+               __field(void *, lockdep_addr)
+       ),
+
+       TP_fast_assign(
+               __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+               __assign_str(name, lock->name);
+               __entry->lockdep_addr = lock;
+       ),
+
+       TP_printk("%p %s%s%s", __entry->lockdep_addr,
+                 (__entry->flags & 1) ? "try " : "",
+                 (__entry->flags & 2) ? "read " : "",
+                 __get_str(name))
+);
+
+DECLARE_EVENT_CLASS(lock,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+       TP_ARGS(lock, ip),
+
+       TP_STRUCT__entry(
+               __string(       name,   lock->name      )
+               __field(        void *, lockdep_addr    )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+               __entry->lockdep_addr = lock;
+       ),
+
+       TP_printk("%p %s",  __entry->lockdep_addr, __get_str(name))
+);
+
+DEFINE_EVENT(lock, lock_release,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+       TP_ARGS(lock, ip)
+);
+
+#ifdef CONFIG_LOCK_STAT
+
+DEFINE_EVENT(lock, lock_contended,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+       TP_ARGS(lock, ip)
+);
+
+DEFINE_EVENT(lock, lock_acquired,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+       TP_ARGS(lock, ip)
+);
+
+#endif /* CONFIG_LOCK_STAT */
+#endif /* CONFIG_LOCKDEP */
+
+TRACE_EVENT(contention_begin,
+
+       TP_PROTO(void *lock, unsigned int flags),
+
+       TP_ARGS(lock, flags),
+
+       TP_STRUCT__entry(
+               __field(void *, lock_addr)
+               __field(unsigned int, flags)
+       ),
+
+       TP_fast_assign(
+               __entry->lock_addr = lock;
+               __entry->flags = flags;
+       ),
+
+       TP_printk("%p (flags=%s)", __entry->lock_addr,
+                 __print_flags(__entry->flags, "|",
+                               { LCB_F_SPIN,           "SPIN" },
+                               { LCB_F_READ,           "READ" },
+                               { LCB_F_WRITE,          "WRITE" },
+                               { LCB_F_RT,             "RT" },
+                               { LCB_F_PERCPU,         "PERCPU" },
+                               { LCB_F_MUTEX,          "MUTEX" }
+                         ))
+);
+
+TRACE_EVENT(contention_end,
+
+       TP_PROTO(void *lock, int ret),
+
+       TP_ARGS(lock, ret),
+
+       TP_STRUCT__entry(
+               __field(void *, lock_addr)
+               __field(int, ret)
+       ),
+
+       TP_fast_assign(
+               __entry->lock_addr = lock;
+               __entry->ret = ret;
+       ),
+
+       TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
+);
+
+#endif /* _TRACE_LOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
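contention_begin/contention_end bracket a blocking lock acquisition: flags describes the lock's type via the LCB_F_* bits and ret reports the outcome. An instrumentation site wraps its slow path roughly like this (sketch; wait_for_lock() is a stand-in for the real blocking acquire):

	trace_contention_begin(lock, LCB_F_READ);
	ret = wait_for_lock(lock);	/* sleep until acquired */
	trace_contention_end(lock, ret);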
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index a78232ed87f1fa962a77d19210ce04e952991988..6fd948f11c0eb284126b85a2bf7405df58999870 100644 (file)
@@ -222,7 +222,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
 }
 
 int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
@@ -237,7 +237,7 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_alloc_unpacked u;
 
@@ -250,7 +250,7 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_alloc_unpacked u;
 
@@ -263,9 +263,10 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+       int rw = flags & WRITE;
 
        if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
                prt_printf(err, "bad val size (%lu != %u)",
@@ -279,11 +280,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
-       /*
-        * XXX this is wrong, we'll be checking updates that happened from
-        * before BCH_FS_CHECK_BACKPOINTERS_DONE
-        */
-       if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+       if (rw == WRITE &&
+           !(flags & BKEY_INVALID_FROM_JOURNAL) &&
+           test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
                unsigned i, bp_len = 0;
 
                for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
@@ -621,7 +620,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
 }
 
 int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                            int rw, struct printbuf *err)
+                            unsigned flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
                prt_printf(err, "bad val size (%lu != %zu)",
@@ -1607,7 +1606,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
        struct bch_dev *ca;
        struct bkey_i_alloc_v4 *a;
        struct printbuf buf = PRINTBUF;
-       bool did_discard = false;
        int ret = 0;
 
        ca = bch_dev_bkey_exists(c, pos.inode);
@@ -1683,15 +1681,13 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
                                     k.k->p.offset * ca->mi.bucket_size,
                                     ca->mi.bucket_size,
                                     GFP_KERNEL);
+               *discard_pos_done = iter.pos;
 
-               ret = bch2_trans_relock(trans);
+               ret = bch2_trans_relock_notrace(trans);
                if (ret)
                        goto out;
        }
 
-       *discard_pos_done = iter.pos;
-       did_discard = true;
-
        SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
        a->v.data_type = alloc_data_type(a->v, a->v.data_type);
 write:
@@ -1701,11 +1697,10 @@ write:
        if (ret)
                goto out;
 
-       if (did_discard) {
-               this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
-               (*discarded)++;
-       }
+       this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+       (*discarded)++;
 out:
+       (*seen)++;
        bch2_trans_iter_exit(trans, &iter);
        percpu_ref_put(&ca->io_ref);
        printbuf_exit(&buf);
@@ -1742,7 +1737,7 @@ static void bch2_do_discards_work(struct work_struct *work)
        if (need_journal_commit * 2 > seen)
                bch2_journal_flush_async(&c->journal, NULL);
 
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 
        trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
                              bch2_err_str(ret));
@@ -1750,44 +1745,45 @@ static void bch2_do_discards_work(struct work_struct *work)
 
 void bch2_do_discards(struct bch_fs *c)
 {
-       if (percpu_ref_tryget_live(&c->writes) &&
+       if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
            !queue_work(system_long_wq, &c->discard_work))
-               percpu_ref_put(&c->writes);
+               bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }
 
 static int invalidate_one_bucket(struct btree_trans *trans,
                                 struct btree_iter *lru_iter,
-                                struct bpos bucket,
+                                struct bkey_s_c lru_k,
                                 s64 *nr_to_invalidate)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter = { NULL };
-       struct bkey_i_alloc_v4 *a;
+       struct bkey_i_alloc_v4 *a = NULL;
        struct printbuf buf = PRINTBUF;
+       struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
        unsigned cached_sectors;
        int ret = 0;
 
        if (*nr_to_invalidate <= 0)
                return 1;
 
+       if (!bch2_dev_bucket_exists(c, bucket)) {
+               prt_str(&buf, "lru entry points to invalid bucket");
+               goto err;
+       }
+
        a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;
 
        if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) {
-               prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n  ");
-               bch2_bpos_to_text(&buf, lru_iter->pos);
-               prt_printf(&buf, "\n  ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
-
-               bch_err(c, "%s", buf.buf);
-               if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
-                       bch2_inconsistent_error(c);
-                       ret = -EINVAL;
-               }
+               prt_str(&buf, "alloc key does not point back to lru entry when invalidating bucket:");
+               goto err;
+       }
 
-               goto out;
+       if (a->v.data_type != BCH_DATA_cached) {
+               prt_str(&buf, "lru entry points to non cached bucket:");
+               goto err;
        }
 
        if (!a->v.cached_sectors)
@@ -1816,6 +1812,26 @@ out:
        bch2_trans_iter_exit(trans, &alloc_iter);
        printbuf_exit(&buf);
        return ret;
+err:
+       prt_str(&buf, "\n  lru key: ");
+       bch2_bkey_val_to_text(&buf, c, lru_k);
+
+       prt_str(&buf, "\n  lru entry: ");
+       bch2_lru_pos_to_text(&buf, lru_iter->pos);
+
+       prt_str(&buf, "\n  alloc key: ");
+       if (!a)
+               bch2_bpos_to_text(&buf, bucket);
+       else
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+
+       bch_err(c, "%s", buf.buf);
+       if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+               bch2_inconsistent_error(c);
+               ret = -EINVAL;
+       }
+
+       goto out;
 }
 
 static void bch2_do_invalidates_work(struct work_struct *work)
@@ -1838,9 +1854,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                                lru_pos(ca->dev_idx, 0, 0),
                                lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
                                BTREE_ITER_INTENT, k,
-                       invalidate_one_bucket(&trans, &iter,
-                                             u64_to_bucket(k.k->p.offset),
-                                             &nr_to_invalidate));
+                       invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
 
                if (ret < 0) {
                        percpu_ref_put(&ca->ref);
@@ -1849,14 +1863,14 @@ static void bch2_do_invalidates_work(struct work_struct *work)
        }
 
        bch2_trans_exit(&trans);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
 void bch2_do_invalidates(struct bch_fs *c)
 {
-       if (percpu_ref_tryget_live(&c->writes) &&
+       if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
            !queue_work(system_long_wq, &c->invalidate_work))
-               percpu_ref_put(&c->writes);
+               bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
 static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index a0c3c47b49b5970a26ededf4269ecba87e30ddea..b3c2f1e0deb695b747a97792d273372c08f73d38 100644 (file)
@@ -122,10 +122,10 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
-int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
-int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
-int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
-int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_alloc_v4_swab(struct bkey_s);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
@@ -158,7 +158,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .atomic_trigger = bch2_mark_alloc,              \
 })
 
-int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index f1cfb90b6d54c9cd6f5088cdbf9b11471ba1b06b..6eeeaec1a16d5d92e52b2f06d0240975fd98a46f 100644 (file)
@@ -58,6 +58,17 @@ const char * const bch2_alloc_reserves[] = {
  * reference _after_ doing the index update that makes its allocation reachable.
  */
 
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i, NULL)
+               ca->alloc_cursor = 0;
+       rcu_read_unlock();
+}
+
 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
 {
        open_bucket_idx_t idx = ob - c->open_buckets;
@@ -272,7 +283,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
        }
 
        spin_unlock(&c->freelist_lock);
-
        return ob;
 }
 
@@ -418,12 +428,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
        struct btree_iter iter;
        struct bkey_s_c k;
        struct open_bucket *ob = NULL;
+       u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+       u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
        int ret;
-
-       s->cur_bucket = max_t(u64, s->cur_bucket, ca->mi.first_bucket);
-       s->cur_bucket = max_t(u64, s->cur_bucket, ca->new_fs_bucket_idx);
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, s->cur_bucket),
+again:
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
                           BTREE_ITER_SLOTS, k, ret) {
                struct bch_alloc_v4 a_convert;
                const struct bch_alloc_v4 *a;
@@ -448,9 +457,17 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
        }
        bch2_trans_iter_exit(trans, &iter);
 
-       s->cur_bucket = iter.pos.offset;
+       ca->alloc_cursor = alloc_cursor;
+
+       if (!ob && ret)
+               ob = ERR_PTR(ret);
+
+       if (!ob && alloc_cursor > alloc_start) {
+               alloc_cursor = alloc_start;
+               goto again;
+       }
 
-       return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found);
+       return ob;
 }
 
 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
@@ -462,33 +479,34 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
        struct btree_iter iter;
        struct bkey_s_c k;
        struct open_bucket *ob = NULL;
+       u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+       u64 alloc_cursor = alloc_start;
        int ret;
 
        BUG_ON(ca->new_fs_bucket_idx);
-
-       /*
-        * XXX:
-        * On transaction restart, we'd like to restart from the bucket we were
-        * at previously
-        */
+again:
        for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
-                                    POS(ca->dev_idx, s->cur_bucket), 0, k, ret) {
+                                    POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
                if (k.k->p.inode != ca->dev_idx)
                        break;
 
-               for (s->cur_bucket = max(s->cur_bucket, bkey_start_offset(k.k));
-                    s->cur_bucket < k.k->p.offset;
-                    s->cur_bucket++) {
+               for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+                    alloc_cursor < k.k->p.offset;
+                    alloc_cursor++) {
                        ret = btree_trans_too_many_iters(trans);
-                       if (ret)
+                       if (ret) {
+                               ob = ERR_PTR(ret);
                                break;
+                       }
 
                        s->buckets_seen++;
 
                        ob = try_alloc_bucket(trans, ca, reserve,
-                                             s->cur_bucket, s, k, cl);
-                       if (ob)
+                                             alloc_cursor, s, k, cl);
+                       if (ob) {
+                               iter.path->preserve = false;
                                break;
+                       }
                }
 
                if (ob || ret)
@@ -496,7 +514,17 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
        }
        bch2_trans_iter_exit(trans, &iter);
 
-       return ob ?: ERR_PTR(ret);
+       ca->alloc_cursor = alloc_cursor;
+
+       if (!ob && ret)
+               ob = ERR_PTR(ret);
+
+       if (!ob && alloc_start > ca->mi.first_bucket) {
+               alloc_cursor = alloc_start = ca->mi.first_bucket;
+               goto again;
+       }
+
+       return ob;
 }
 
 /**
@@ -514,9 +542,8 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct open_bucket *ob = NULL;
        bool freespace = READ_ONCE(ca->mi.freespace_initialized);
-       u64 start = freespace ? 0 : ca->bucket_alloc_trans_early_cursor;
        u64 avail;
-       struct bucket_alloc_state s = { .cur_bucket = start };
+       struct bucket_alloc_state s = { 0 };
        bool waiting = false;
 again:
        bch2_dev_usage_read_fast(ca, usage);
@@ -561,28 +588,31 @@ alloc:
        if (s.skipped_need_journal_commit * 2 > avail)
                bch2_journal_flush_async(&c->journal, NULL);
 
-       if (!ob && !freespace && start) {
-               start = s.cur_bucket = 0;
-               goto alloc;
-       }
-
        if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
                freespace = false;
                goto alloc;
        }
-
-       if (!freespace)
-               ca->bucket_alloc_trans_early_cursor = s.cur_bucket;
 err:
        if (!ob)
                ob = ERR_PTR(-BCH_ERR_no_buckets_found);
 
        if (!IS_ERR(ob))
-               trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve],
-                               may_alloc_partial, ob->bucket);
+               trace_and_count(c, bucket_alloc, ca,
+                               bch2_alloc_reserves[reserve],
+                               may_alloc_partial,
+                               ob->bucket,
+                               usage->d[BCH_DATA_free].buckets,
+                               avail,
+                               bch2_copygc_wait_amount(c),
+                               c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+                               &s,
+                               cl == NULL,
+                               "");
        else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
-               trace_and_count(c, bucket_alloc_fail,
-                               ca, bch2_alloc_reserves[reserve],
+               trace_and_count(c, bucket_alloc_fail, ca,
+                               bch2_alloc_reserves[reserve],
+                               may_alloc_partial,
+                               0,
                                usage->d[BCH_DATA_free].buckets,
                                avail,
                                bch2_copygc_wait_amount(c),
@@ -1130,16 +1160,16 @@ out:
  * Get us an open_bucket we can allocate from, return with it locked:
  */
 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
-                                  unsigned target,
-                                  unsigned erasure_code,
-                                  struct write_point_specifier write_point,
-                                  struct bch_devs_list *devs_have,
-                                  unsigned nr_replicas,
-                                  unsigned nr_replicas_required,
-                                  enum alloc_reserve reserve,
-                                  unsigned flags,
-                                  struct closure *cl,
-                                  struct write_point **wp_ret)
+                            unsigned target,
+                            unsigned erasure_code,
+                            struct write_point_specifier write_point,
+                            struct bch_devs_list *devs_have,
+                            unsigned nr_replicas,
+                            unsigned nr_replicas_required,
+                            enum alloc_reserve reserve,
+                            unsigned flags,
+                            struct closure *cl,
+                            struct write_point **wp_ret)
 {
        struct bch_fs *c = trans->c;
        struct write_point *wp;
@@ -1336,3 +1366,33 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
                spin_unlock(&ob->lock);
        }
 }
+
+static const char * const bch2_write_point_states[] = {
+#define x(n)   #n,
+       WRITE_POINT_STATES()
+#undef x
+       NULL
+};
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct write_point *wp;
+       unsigned i;
+
+       for (wp = c->write_points;
+            wp < c->write_points + ARRAY_SIZE(c->write_points);
+            wp++) {
+               prt_printf(out, "%lu: ", wp->write_point);
+               prt_human_readable_u64(out, wp->sectors_allocated);
+
+               prt_printf(out, " last wrote: ");
+               bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+               for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+                       prt_printf(out, " %s: ", bch2_write_point_states[i]);
+                       bch2_pr_time_units(out, wp->time[i]);
+               }
+
+               prt_newline(out);
+       }
+}
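The allocator hunks above replace the per-transaction cur_bucket with a persistent per-device ca->alloc_cursor: each scan starts from the saved cursor and, if nothing is free before the end of the device, wraps once back to the first bucket (the goto again paths). The search shape abstracted into a sketch, with hypothetical helpers standing in for the btree iteration:

static s64 alloc_scan(struct dev *ca)
{
	u64 start = max(ca->first_bucket, ca->alloc_cursor);
	u64 cursor = start;
	s64 ret = -1;
again:
	for (; cursor < ca->nbuckets; cursor++)
		if (bucket_is_free(ca, cursor)) {
			ret = cursor;
			break;
		}

	ca->alloc_cursor = cursor;	/* remember where we stopped */

	if (ret < 0 && start > ca->first_bucket) {
		/* nothing at or after the cursor: wrap once */
		cursor = start = ca->first_bucket;
		goto again;
	}
	return ret;
}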
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 62fbf1c78ea9592708d9ec96e97205ad3ce6f6cd..26e986f2385b902f50ee97125b9339ec6a17d65f 100644 (file)
@@ -16,6 +16,8 @@ struct bch_devs_List;
 
 extern const char * const bch2_alloc_reserves[];
 
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
 struct dev_alloc_list {
        unsigned        nr;
        u8              devs[BCH_SB_MEMBERS_MAX];
@@ -178,7 +180,8 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
        unsigned i;
 
        BUG_ON(sectors > wp->sectors_free);
-       wp->sectors_free -= sectors;
+       wp->sectors_free        -= sectors;
+       wp->sectors_allocated   += sectors;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
@@ -219,4 +222,6 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
 
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 330267346c639fda85f8fa8ea1f0c2e07f35217f..2e6f4806925871bf513f17235e6680e067e4471f 100644 (file)
@@ -9,7 +9,6 @@
 #include "fifo.h"
 
 struct bucket_alloc_state {
-       u64     cur_bucket;
        u64     buckets_seen;
        u64     skipped_open;
        u64     skipped_need_journal_commit;
@@ -75,6 +74,19 @@ struct dev_stripe_state {
        u64                     next_alloc[BCH_SB_MEMBERS_MAX];
 };
 
+#define WRITE_POINT_STATES()           \
+       x(stopped)                      \
+       x(waiting_io)                   \
+       x(waiting_work)                 \
+       x(running)
+
+enum write_point_state {
+#define x(n)   WRITE_POINT_##n,
+       WRITE_POINT_STATES()
+#undef x
+       WRITE_POINT_STATE_NR
+};
+
 struct write_point {
        struct {
                struct hlist_node       node;
@@ -88,6 +100,8 @@ struct write_point {
 
                struct open_buckets     ptrs;
                struct dev_stripe_state stripe;
+
+               u64                     sectors_allocated;
        } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
        struct {
@@ -95,6 +109,10 @@ struct write_point {
 
                struct list_head        writes;
                spinlock_t              writes_lock;
+
+               enum write_point_state  state;
+               u64                     last_state_change;
+               u64                     time[WRITE_POINT_STATE_NR];
        } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 };
 
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index 405823d1cfabc69499d853ccff87fca78d9df66a..0f8ffdf45ca809f45c3d5157e465353988d37705 100644 (file)
@@ -69,7 +69,7 @@ static bool extent_matches_bp(struct bch_fs *c,
 }
 
 int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                            int rw, struct printbuf *err)
+                            unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
        struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
index 48a48b75c0ac137b22f566568024ebb8525c241b..ac7b09321aabee73a3b8d7349b3fe3ebebc0866a 100644 (file)
@@ -6,7 +6,7 @@
 #include "super.h"
 
 int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
-                            int, struct printbuf *);
+                            unsigned, struct printbuf *);
 void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
 void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 void bch2_backpointer_swab(struct bkey_s);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index febef9ac254ba26b432f394c40ef50f060637559..f513173375803718021a6e65afebcafcae73d790 100644 (file)
 #include "opts.h"
 #include "util.h"
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_WRITE_REF_DEBUG
+#endif
+
 #define dynamic_fault(...)             0
 #define race_fault(...)                        0
 
@@ -503,7 +507,7 @@ struct bch_dev {
 
        /* Allocator: */
        u64                     new_fs_bucket_idx;
-       u64                     bucket_alloc_trans_early_cursor;
+       u64                     alloc_cursor;
 
        unsigned                nr_open_buckets;
        unsigned                nr_btree_reserve;
@@ -524,7 +528,7 @@ struct bch_dev {
 
        /* The rest of this all shows up in sysfs */
        atomic64_t              cur_latency[2];
-       struct time_stats       io_latency[2];
+       struct bch2_time_stats  io_latency[2];
 
 #define CONGESTED_MAX          1024
        atomic_t                congested;
@@ -543,6 +547,7 @@ enum {
        /* shutdown: */
        BCH_FS_STOPPING,
        BCH_FS_EMERGENCY_RO,
+       BCH_FS_GOING_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
        BCH_FS_CLEAN_SHUTDOWN,
 
@@ -573,8 +578,8 @@ struct btree_debug {
 #define BCH_TRANSACTIONS_NR 128
 
 struct btree_transaction_stats {
+       struct bch2_time_stats  lock_hold_times;
        struct mutex            lock;
-       struct time_stats       lock_hold_times;
        unsigned                nr_max_paths;
        unsigned                max_mem;
        char                    *max_paths_text;
@@ -634,6 +639,29 @@ typedef struct {
 #define BCACHEFS_ROOT_SUBVOL_INUM                                      \
        ((subvol_inum) { BCACHEFS_ROOT_SUBVOL,  BCACHEFS_ROOT_INO })
 
+#define BCH_WRITE_REFS()                                               \
+       x(trans)                                                        \
+       x(write)                                                        \
+       x(promote)                                                      \
+       x(node_rewrite)                                                 \
+       x(stripe_create)                                                \
+       x(stripe_delete)                                                \
+       x(reflink)                                                      \
+       x(fallocate)                                                    \
+       x(discard)                                                      \
+       x(invalidate)                                                   \
+       x(move)                                                         \
+       x(delete_dead_snapshots)                                        \
+       x(snapshot_delete_pagecache)                                    \
+       x(sysfs)
+
+enum bch_write_ref {
+#define x(n) BCH_WRITE_REF_##n,
+       BCH_WRITE_REFS()
+#undef x
+       BCH_WRITE_REF_NR,
+};
+
 struct bch_fs {
        struct closure          cl;
 
@@ -655,7 +683,11 @@ struct bch_fs {
        struct rw_semaphore     state_lock;
 
        /* Counts outstanding writes, for clean transition to read-only */
+#ifdef BCH_WRITE_REF_DEBUG
+       atomic_long_t           writes[BCH_WRITE_REF_NR];
+#else
        struct percpu_ref       writes;
+#endif
        struct work_struct      read_only_work;
 
        struct bch_dev __rcu    *devs[BCH_SB_MEMBERS_MAX];
@@ -857,6 +889,7 @@ struct bch_fs {
        struct mutex            gc_gens_lock;
 
        /* IO PATH */
+       struct semaphore        io_in_flight;
        struct bio_set          bio_read;
        struct bio_set          bio_read_split;
        struct bio_set          bio_write;
@@ -969,11 +1002,51 @@ struct bch_fs {
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
 
-       struct time_stats       times[BCH_TIME_STAT_NR];
+       struct bch2_time_stats  times[BCH_TIME_STAT_NR];
 
        struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
 };
 
+extern struct wait_queue_head bch2_read_only_wait;
+
+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+       atomic_long_inc(&c->writes[ref]);
+#else
+       percpu_ref_get(&c->writes);
+#endif
+}
+
+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+       return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+               atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+       return percpu_ref_tryget_live(&c->writes);
+#endif
+}
+
+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+       long v = atomic_long_dec_return(&c->writes[ref]);
+
+       BUG_ON(v < 0);
+       if (v)
+               return;
+       for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+               if (atomic_long_read(&c->writes[i]))
+                       return;
+
+       set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+       wake_up(&bch2_read_only_wait);
+#else
+       percpu_ref_put(&c->writes);
+#endif
+}
+
 static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 {
 #ifndef NO_BCACHEFS_FS
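
[Together, the three helpers above keep the release-build fast path on a single percpu_ref, while BCH_WRITE_REF_DEBUG builds track one atomic counter per user and flag write-disable completion once every counter drains. A minimal usage sketch under those definitions — do_discard_work() is a hypothetical caller; the tryget/put pairing is the point:

	static int do_discard_work(struct bch_fs *c)
	{
		/* Refuse to start new work once the fs is going read-only: */
		if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
			return -EROFS;

		/* ... issue discards ... */

		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
		return 0;
	}
]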
index 48438e67646de8fbfe2182a7fc6abfca2fa3689c..ffd913733e97c37d581e7a84086bdd46252856e2 100644 (file)
@@ -1357,7 +1357,7 @@ struct bch_replicas_entry {
 
 struct bch_sb_field_replicas {
        struct bch_sb_field     field;
-       struct bch_replicas_entry entries[0];
+       struct bch_replicas_entry entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_quota: */
@@ -1436,7 +1436,7 @@ struct bch_sb_field_disk_groups {
        x(move_extent_read,                             35)     \
        x(move_extent_write,                            36)     \
        x(move_extent_finish,                           37)     \
-       x(move_extent_race,                             38)     \
+       x(move_extent_fail,                             38)     \
        x(move_extent_alloc_mem_fail,                   39)     \
        x(copygc,                                       40)     \
        x(copygc_wait,                                  41)     \
@@ -1705,7 +1705,6 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
-/* Obsolete, always enabled: */
 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 LE64_BITMASK(BCH_SB_NOCOW,             struct bch_sb, flags[4], 33, 34);
 
index e13ce07fa76ce2f2d408dd67dd89f87358cd255b..72d95831d65df7a752dfc5a2ca0e084c9d2d141b 100644 (file)
@@ -24,7 +24,7 @@ const char * const bch2_bkey_types[] = {
 };
 
 static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                              int rw, struct printbuf *err)
+                              unsigned flags, struct printbuf *err)
 {
        return 0;
 }
@@ -38,7 +38,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                int rw, struct printbuf *err)
+                                unsigned flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k)) {
                prt_printf(err, "incorrect value size (%zu != 0)",
@@ -54,7 +54,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                  int rw, struct printbuf *err)
+                                  unsigned flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) {
                prt_printf(err, "incorrect value size (%zu != %zu)",
@@ -74,7 +74,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                       int rw, struct printbuf *err)
+                                       unsigned flags, struct printbuf *err)
 {
        return 0;
 }
@@ -95,7 +95,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
 })
 
 static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                               int rw, struct printbuf *err)
+                               unsigned flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k)) {
                prt_printf(err, "incorrect value size (%zu != %zu)",
@@ -124,14 +124,14 @@ const struct bkey_ops bch2_bkey_ops[] = {
 };
 
 int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        if (k.k->type >= KEY_TYPE_MAX) {
                prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX);
                return -BCH_ERR_invalid_bkey;
        }
 
-       return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err);
+       return bch2_bkey_ops[k.k->type].key_invalid(c, k, flags, err);
 }
 
 static unsigned bch2_key_types_allowed[] = {
@@ -207,7 +207,7 @@ static unsigned bch2_key_types_allowed[] = {
 
 int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum btree_node_type type,
-                       int rw, struct printbuf *err)
+                       unsigned flags, struct printbuf *err)
 {
        if (k.k->u64s < BKEY_U64s) {
                prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
@@ -216,7 +216,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
        if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) {
                prt_printf(err, "invalid key type for btree %s (%s)",
-                          bch2_btree_ids[type], bch2_bkey_types[type]);
+                          bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
                return -BCH_ERR_invalid_bkey;
        }
 
@@ -263,10 +263,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
 int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                      enum btree_node_type type,
-                     int rw, struct printbuf *err)
+                     unsigned flags, struct printbuf *err)
 {
-       return __bch2_bkey_invalid(c, k, type, rw, err) ?:
-               bch2_bkey_val_invalid(c, k, rw, err);
+       return __bch2_bkey_invalid(c, k, type, flags, err) ?:
+               bch2_bkey_val_invalid(c, k, flags, err);
 }
 
 int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
@@ -374,7 +374,11 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 {
        const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
 
-       return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r);
+       return bch2_bkey_maybe_mergable(l.k, r.k) &&
+               (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
+               bch2_bkey_ops[l.k->type].key_merge &&
+               !bch2_key_merging_disabled &&
+               ops->key_merge(c, l, r);
 }
 
 static const struct old_bkey_type {
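
[The merge gate above now concentrates the remaining checks: bch2_bkey_maybe_mergable() stays a cheap predicate (see the header change below), while the size-overflow, key_merge-op and global-disable tests run here. The (u64) cast is load-bearing; a sketch of the wraparound it prevents, assuming (as in bcachefs) 32-bit key sizes and a KEY_SIZE_MAX that fits in 32 bits:

	u32 l_size = 0xC0000000, r_size = 0xC0000000;

	/* 32-bit sum wraps to 0x80000000 and would pass the check: */
	bool wrong = l_size + r_size <= KEY_SIZE_MAX;

	/* widening first keeps the true sum 0x180000000, rejecting the merge: */
	bool right = (u64) l_size + r_size <= KEY_SIZE_MAX;
]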
index 2cbb0f39161bb2f77c8152ce362f4286ce1dd21c..9a6afab87f6c6c6ff2918e6d0dca59b7b218b8b7 100644 (file)
@@ -21,7 +21,7 @@ extern const char * const bch2_bkey_types[];
  */
 struct bkey_ops {
        int             (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
-                                      int rw, struct printbuf *err);
+                                      unsigned flags, struct printbuf *err);
        void            (*val_to_text)(struct printbuf *, struct bch_fs *,
                                       struct bkey_s_c);
        void            (*swab)(struct bkey_s);
@@ -38,11 +38,13 @@ struct bkey_ops {
 
 extern const struct bkey_ops bch2_bkey_ops[];
 
-int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+#define BKEY_INVALID_FROM_JOURNAL              (1 << 1)
+
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
-                       enum btree_node_type, int, struct printbuf *);
+                       enum btree_node_type, unsigned, struct printbuf *);
 int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
-                     enum btree_node_type, int, struct printbuf *);
+                     enum btree_node_type, unsigned, struct printbuf *);
 int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
 
 void bch2_bpos_to_text(struct printbuf *, struct bpos);
@@ -60,10 +62,7 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
 {
        return l->type == r->type &&
                !bversion_cmp(l->version, r->version) &&
-               bpos_eq(l->p, bkey_start_pos(r)) &&
-               (u64) l->size + r->size <= KEY_SIZE_MAX &&
-               bch2_bkey_ops[l->type].key_merge &&
-               !bch2_key_merging_disabled;
+               bpos_eq(l->p, bkey_start_pos(r));
 }
 
 bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -82,7 +81,9 @@ static inline int bch2_mark_key(struct btree_trans *trans,
 
 enum btree_update_flags {
        __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+       __BTREE_UPDATE_NOJOURNAL,
        __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+       __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY,
 
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
 
@@ -95,7 +96,10 @@ enum btree_update_flags {
 };
 
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_NOJOURNAL         (1U << __BTREE_UPDATE_NOJOURNAL)
 #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY    \
+       (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)
 
 #define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
 
index 544e2dfb3c377cc232d4c5b43d892e4760df5395..89478fc57411c95effcd339bf3049d9000123358 100644 (file)
@@ -36,16 +36,7 @@ static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
 
 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
 {
-       unsigned offset = __btree_node_key_to_offset(b, k);
-       struct bset_tree *t;
-
-       for_each_bset(b, t)
-               if (offset <= t->end_offset) {
-                       EBUG_ON(offset < btree_bkey_first_offset(t));
-                       return t;
-               }
-
-       BUG();
+       return bch2_bkey_to_bset_inlined(b, k);
 }
 
 /*
index acef143091d045cc24bd930bd370bc5a2783ad73..fd2915a150708f9b9e5203d761c2ea2342e872ef 100644 (file)
@@ -291,6 +291,21 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
        return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
 }
 
+static inline struct bset_tree *
+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
+{
+       unsigned offset = __btree_node_key_to_offset(b, k);
+       struct bset_tree *t;
+
+       for_each_bset(b, t)
+               if (offset <= t->end_offset) {
+                       EBUG_ON(offset < btree_bkey_first_offset(t));
+                       return t;
+               }
+
+       BUG();
+}
+
 struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
 
 struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
index b5e78042c1ff9bb46e1294e242e482be6804beab..d10257e1a0bdc90ed24fc2061ca1e2f8f851c8f3 100644 (file)
@@ -12,6 +12,7 @@
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
 #include <trace/events/bcachefs.h>
 
 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
@@ -427,12 +428,16 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
        return btree_cache_can_free(bc);
 }
 
-static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
 {
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
+       char *cbuf;
+       size_t buflen = seq_buf_get_buf(s, &cbuf);
+       struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
 
-       bch2_btree_cache_to_text(out, &c->btree_cache);
+       bch2_btree_cache_to_text(&out, &c->btree_cache);
+       seq_buf_commit(s, out.pos);
 }
 
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
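
[With printbuf now private to libbcachefs, the shrinker hook above bridges the kernel's seq_buf API by borrowing the seq_buf's free space as a PRINTBUF_EXTERN buffer, then committing back however many bytes the printer produced. The same bridge works for any printbuf-based printer; a sketch, where print_foo() stands in for any bch2_*_to_text() function:

	static void foo_to_seq_buf(struct seq_buf *s)
	{
		char *buf;
		size_t len = seq_buf_get_buf(s, &buf);
		struct printbuf out = PRINTBUF_EXTERN(buf, len);

		print_foo(&out);		/* writes into the borrowed space */
		seq_buf_commit(s, out.pos);	/* out.pos == bytes written */
	}
]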
@@ -1090,7 +1095,7 @@ retry:
                        goto out;
        } else {
 lock_node:
-               ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
+               ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);
 
index 68796e197e412c8a6bcd284a9ef928780e0fc822..0145746c277b948b82de7b34513576dcc450b80e 100644 (file)
@@ -526,11 +526,10 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
                          struct btree *b, struct bset *i,
                          unsigned offset, int write)
 {
-       prt_printf(out, bch2_log_msg(c, ""));
-       if (!write)
-               prt_str(out, "error validating btree node ");
-       else
-               prt_str(out, "corrupt btree node before write ");
+       prt_printf(out, bch2_log_msg(c, "%s"),
+                  write == READ
+                  ? "error validating btree node "
+                  : "corrupt btree node before write ");
        if (ca)
                prt_printf(out, "on %s ", ca->name);
        prt_printf(out, "at btree ");
@@ -543,63 +542,96 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
 }
 
 enum btree_err_type {
+       /*
+        * We can repair this locally, and we're after the checksum check so
+        * there's no need to try another replica:
+        */
        BTREE_ERR_FIXABLE,
+       /*
+        * We can repair this if we have to, but we should try reading another
+        * replica if we can:
+        */
        BTREE_ERR_WANT_RETRY,
+       /*
+        * Read another replica if we have one, otherwise consider the whole
+        * node bad:
+        */
        BTREE_ERR_MUST_RETRY,
-       BTREE_ERR_FATAL,
+       BTREE_ERR_BAD_NODE,
+       BTREE_ERR_INCOMPATIBLE,
 };
 
 enum btree_validate_ret {
        BTREE_RETRY_READ = 64,
 };
 
+static int __btree_err(enum btree_err_type type,
+                      struct bch_fs *c,
+                      struct bch_dev *ca,
+                      struct btree *b,
+                      struct bset *i,
+                      int write,
+                      bool have_retry,
+                      const char *fmt, ...)
+{
+       struct printbuf out = PRINTBUF;
+       va_list args;
+       int ret = -BCH_ERR_fsck_fix;
+
+       btree_err_msg(&out, c, ca, b, i, b->written, write);
+
+       va_start(args, fmt);
+       prt_vprintf(&out, fmt, args);
+       va_end(args);
+
+       if (write == WRITE) {
+               bch2_print_string_as_lines(KERN_ERR, out.buf);
+               ret = c->opts.errors == BCH_ON_ERROR_continue
+                       ? 0
+                       : -BCH_ERR_fsck_errors_not_fixed;
+               goto out;
+       }
+
+       if (!have_retry && type == BTREE_ERR_WANT_RETRY)
+               type = BTREE_ERR_FIXABLE;
+       if (!have_retry && type == BTREE_ERR_MUST_RETRY)
+               type = BTREE_ERR_BAD_NODE;
+
+       switch (type) {
+       case BTREE_ERR_FIXABLE:
+               mustfix_fsck_err(c, "%s", out.buf);
+               ret = -BCH_ERR_fsck_fix;
+               break;
+       case BTREE_ERR_WANT_RETRY:
+       case BTREE_ERR_MUST_RETRY:
+               bch2_print_string_as_lines(KERN_ERR, out.buf);
+               ret = BTREE_RETRY_READ;
+               break;
+       case BTREE_ERR_BAD_NODE:
+               bch2_print_string_as_lines(KERN_ERR, out.buf);
+               bch2_topology_error(c);
+               ret = -BCH_ERR_need_topology_repair;
+               break;
+       case BTREE_ERR_INCOMPATIBLE:
+               bch2_print_string_as_lines(KERN_ERR, out.buf);
+               ret = -BCH_ERR_fsck_errors_not_fixed;
+               break;
+       default:
+               BUG();
+       }
+out:
+fsck_err:
+       printbuf_exit(&out);
+       return ret;
+}
+
 #define btree_err(type, c, ca, b, i, msg, ...)                         \
 ({                                                                     \
-       __label__ out;                                                  \
-       struct printbuf out = PRINTBUF;                                 \
-                                                                       \
-       btree_err_msg(&out, c, ca, b, i, b->written, write);            \
-       prt_printf(&out, msg, ##__VA_ARGS__);                           \
+       int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
                                                                        \
-       if (type == BTREE_ERR_FIXABLE &&                                \
-           write == READ &&                                            \
-           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
-               mustfix_fsck_err(c, "%s", out.buf);                     \
-               goto out;                                               \
-       }                                                               \
-                                                                       \
-       bch2_print_string_as_lines(KERN_ERR, out.buf);                  \
-                                                                       \
-       switch (write) {                                                \
-       case READ:                                                      \
-               switch (type) {                                         \
-               case BTREE_ERR_FIXABLE:                                 \
-                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
-                       goto fsck_err;                                  \
-               case BTREE_ERR_WANT_RETRY:                              \
-                       if (have_retry) {                               \
-                               ret = BTREE_RETRY_READ;                 \
-                               goto fsck_err;                          \
-                       }                                               \
-                       break;                                          \
-               case BTREE_ERR_MUST_RETRY:                              \
-                       ret = BTREE_RETRY_READ;                         \
-                       goto fsck_err;                                  \
-               case BTREE_ERR_FATAL:                                   \
-                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
-                       goto fsck_err;                                  \
-               }                                                       \
-               break;                                                  \
-       case WRITE:                                                     \
-               if (bch2_fs_inconsistent(c)) {                          \
-                       ret = -BCH_ERR_fsck_errors_not_fixed;           \
-                       goto fsck_err;                                  \
-               }                                                       \
-               break;                                                  \
-       }                                                               \
-out:                                                                   \
-       printbuf_exit(&out);                                            \
-       true;                                                           \
+       if (_ret != -BCH_ERR_fsck_fix) {                                \
+               ret = _ret;                                             \
+               goto fsck_err;                                          \
+       }                                                               \
+                                                                       \
+       *saw_error = true;                                              \
+       true;                                                           \
 })
 
 #define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
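
[The rewritten btree_err() expects its caller to supply int write, bool have_retry, bool *saw_error, an int ret and an fsck_err: label; __btree_err() returns -BCH_ERR_fsck_fix when the problem was repaired locally, and any other code is stored in ret before jumping out. A sketch of the expected caller shape — validate_foo() is hypothetical; the variable and label names are the ones the macro captures:

	static int validate_foo(struct bch_fs *c, struct btree *b, struct bset *i,
				int write, bool have_retry, bool *saw_error)
	{
		int ret = 0;

		btree_err_on(le16_to_cpu(i->version) >= bcachefs_metadata_version_max,
			     BTREE_ERR_INCOMPATIBLE, c, NULL, b, i,
			     "unsupported bset version");
	fsck_err:
		return ret;
	}
]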
@@ -608,6 +640,7 @@ out:                                                                        \
  * When btree topology repair changes the start or end of a node, that might
  * mean we have to drop keys that are no longer inside the node:
  */
+__cold
 void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 {
        struct bset_tree *t;
@@ -658,7 +691,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                         struct btree *b, struct bset *i,
                         unsigned offset, unsigned sectors,
-                        int write, bool have_retry)
+                        int write, bool have_retry, bool *saw_error)
 {
        unsigned version = le16_to_cpu(i->version);
        const char *err;
@@ -669,7 +702,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
        btree_err_on((version != BCH_BSET_VERSION_OLD &&
                      version < bcachefs_metadata_version_min) ||
                     version >= bcachefs_metadata_version_max,
-                    BTREE_ERR_FATAL, c, ca, b, i,
+                    BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
                     "unsupported bset version");
 
        if (btree_err_on(version < c->sb.version_min,
@@ -693,7 +726,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
        }
 
        btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-                    BTREE_ERR_FATAL, c, ca, b, i,
+                    BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
                     "BSET_SEPARATE_WHITEOUTS no longer supported");
 
        if (btree_err_on(offset + sectors > btree_sectors(c),
@@ -770,7 +803,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
 
                err = bch2_bkey_format_validate(&bn->format);
                btree_err_on(err,
-                            BTREE_ERR_FATAL, c, ca, b, i,
+                            BTREE_ERR_BAD_NODE, c, ca, b, i,
                             "invalid bkey format: %s", err);
 
                compat_bformat(b->c.level, b->c.btree_id, version,
@@ -795,7 +828,8 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
 }
 
 static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-                        struct bset *i, int write, bool have_retry)
+                        struct bset *i, int write,
+                        bool have_retry, bool *saw_error)
 {
        unsigned version = le16_to_cpu(i->version);
        struct bkey_packed *k, *prev = NULL;
@@ -882,7 +916,7 @@ fsck_err:
 }
 
 int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-                             struct btree *b, bool have_retry)
+                             struct btree *b, bool have_retry, bool *saw_error)
 {
        struct btree_node_entry *bne;
        struct sort_iter *iter;
@@ -897,7 +931,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        unsigned blacklisted_written, nonblacklisted_written = 0;
        unsigned ptr_written = btree_ptr_sectors_written(&b->key);
        struct printbuf buf = PRINTBUF;
-       int ret, retry_read = 0, write = READ;
+       int ret = 0, retry_read = 0, write = READ;
 
        b->version_ondisk = U16_MAX;
        /* We might get called multiple times on read retry: */
@@ -958,7 +992,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
                        btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
                                     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-                                    BTREE_ERR_FATAL, c, NULL, b, NULL,
+                                    BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL,
                                     "btree node does not have NEW_EXTENT_OVERWRITE set");
 
                        sectors = vstruct_sectors(b->data, c->block_bits);
@@ -993,14 +1027,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                        le16_to_cpu(i->version));
 
                ret = validate_bset(c, ca, b, i, b->written, sectors,
-                                   READ, have_retry);
+                                   READ, have_retry, saw_error);
                if (ret)
                        goto fsck_err;
 
                if (!b->written)
                        btree_node_set_format(b, b->data->format);
 
-               ret = validate_bset_keys(c, b, i, READ, have_retry);
+               ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
                if (ret)
                        goto fsck_err;
 
@@ -1140,12 +1174,10 @@ out:
        printbuf_exit(&buf);
        return retry_read;
 fsck_err:
-       if (ret == BTREE_RETRY_READ) {
+       if (ret == BTREE_RETRY_READ)
                retry_read = 1;
-       } else {
-               bch2_inconsistent_error(c);
+       else
                set_btree_node_read_error(b);
-       }
        goto out;
 }
 
@@ -1195,7 +1227,7 @@ start:
                                &failed, &rb->pick) > 0;
 
                if (!bio->bi_status &&
-                   !bch2_btree_node_read_done(c, ca, b, can_retry)) {
+                   !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
                        if (retry)
                                bch_info(c, "retry success");
                        break;
@@ -1301,6 +1333,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
        unsigned i, written = 0, written2 = 0;
        __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
                ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+       bool _saw_error = false, *saw_error = &_saw_error;
 
        for (i = 0; i < ra->nr; i++) {
                struct btree_node *bn = ra->buf[i];
@@ -1387,13 +1420,15 @@ fsck_err:
 
        if (best >= 0) {
                memcpy(b->data, ra->buf[best], btree_bytes(c));
-               ret = bch2_btree_node_read_done(c, NULL, b, false);
+               ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
        } else {
                ret = -1;
        }
 
        if (ret)
                set_btree_node_read_error(b);
+       else if (*saw_error)
+               bch2_btree_node_rewrite_async(c, b);
 
        for (i = 0; i < ra->nr; i++) {
                mempool_free(ra->buf[i], &c->btree_bounce_pool);
@@ -1770,6 +1805,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
                                   struct bset *i, unsigned sectors)
 {
        struct printbuf buf = PRINTBUF;
+       bool saw_error;
        int ret;
 
        ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
@@ -1781,8 +1817,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        if (ret)
                return ret;
 
-       ret = validate_bset_keys(c, b, i, WRITE, false) ?:
-               validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false);
+       ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+               validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
        if (ret) {
                bch2_inconsistent_error(c);
                dump_stack();
index a720dd74139b5766d4c439abc739e8e3ec13112e..c43fb60b8c82c2ec4f066ec0ff752e4a4baf7401 100644 (file)
@@ -129,7 +129,7 @@ void bch2_btree_build_aux_trees(struct btree *);
 void bch2_btree_init_next(struct btree_trans *, struct btree *);
 
 int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
-                             struct btree *, bool);
+                             struct btree *, bool, bool *);
 void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
 int bch2_btree_root_read(struct bch_fs *, enum btree_id,
                         const struct bkey_i *, unsigned);
index 9c139a7b43061436982bd23bc579f5fc80a2ce5c..077d72bfa6826088da1be6ee27a95a7177a388df 100644 (file)
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
-static void btree_trans_verify_sorted(struct btree_trans *);
-inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
-static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *,
-                                                  struct btree_path *, int);
-
 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
                                       struct btree_path *);
 
 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
        return iter->ip_allocated;
 #else
        return 0;
@@ -353,6 +348,8 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
        unsigned idx;
        struct printbuf buf = PRINTBUF;
 
+       btree_trans_sort_paths(trans);
+
        trans_for_each_path_inorder(trans, path, idx) {
                int cmp = cmp_int(path->btree_id, id) ?:
                        cmp_int(path->cached, key_cache);
@@ -540,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
                              unsigned clobber_u64s,
                              unsigned new_u64s)
 {
-       struct bset_tree *t = bch2_bkey_to_bset(b, where);
+       struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
        struct btree_path *linked;
 
        if (node_iter != &path->l[b->c.level].iter) {
@@ -595,6 +592,7 @@ static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
        path->pos = k.k ? k.k->p : l->b->key.k.p;
+       trans->paths_sorted = false;
        bch2_btree_path_verify_level(trans, path, l - path->l);
        return k;
 }
@@ -608,6 +606,7 @@ static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
                        bch2_btree_node_iter_prev(&l->iter, l->b));
 
        path->pos = k.k ? k.k->p : l->b->data->min_key;
+       trans->paths_sorted = false;
        bch2_btree_path_verify_level(trans, path, l - path->l);
        return k;
 }
@@ -963,15 +962,13 @@ err:
        return ret;
 }
 
-static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
-                                  unsigned, unsigned long);
 
 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
        struct btree_path *path;
        unsigned long trace_ip = _RET_IP_;
-       int ret = 0;
+       int i, ret = 0;
 
        if (trans->in_traverse_all)
                return -BCH_ERR_transaction_restart_in_traverse_all;
@@ -979,12 +976,11 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
        trans->in_traverse_all = true;
 retry_all:
        trans->restarted = 0;
-       trans->traverse_all_idx = U8_MAX;
 
        trans_for_each_path(trans, path)
                path->should_be_locked = false;
 
-       btree_trans_verify_sorted(trans);
+       btree_trans_sort_paths(trans);
 
        bch2_trans_unlock(trans);
        cond_resched();
@@ -1001,34 +997,35 @@ retry_all:
        }
 
        /* Now, redo traversals in correct order: */
-       trans->traverse_all_idx = 0;
-       while (trans->traverse_all_idx < trans->nr_sorted) {
-               path = trans->paths + trans->sorted[trans->traverse_all_idx];
+       i = 0;
+       while (i < trans->nr_sorted) {
+               path = trans->paths + trans->sorted[i];
 
                /*
                 * Traversing a path can cause another path to be added at about
                 * the same position:
                 */
                if (path->uptodate) {
-                       ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+                       __btree_path_get(path, false);
+                       ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+                       __btree_path_put(path, false);
+
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
                            ret == -ENOMEM)
                                goto retry_all;
                        if (ret)
                                goto err;
-                       BUG_ON(path->uptodate);
                } else {
-                       trans->traverse_all_idx++;
+                       i++;
                }
        }
 
        /*
-        * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
-        * and relock(), relock() won't relock since path->should_be_locked
-        * isn't set yet, which is all fine
+        * We used to assert that all paths had been traversed here
+        * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
+        * path->should_be_locked is not set yet, we might have unlocked and
+        * then failed to relock a path - that's fine.
         */
-       trans_for_each_path(trans, path)
-               BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
 err:
        bch2_btree_cache_cannibalize_unlock(c);
 
@@ -1115,10 +1112,10 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
  * stashed in the iterator and returned from bch2_trans_exit().
  */
-static int btree_path_traverse_one(struct btree_trans *trans,
-                                  struct btree_path *path,
-                                  unsigned flags,
-                                  unsigned long trace_ip)
+int bch2_btree_path_traverse_one(struct btree_trans *trans,
+                                struct btree_path *path,
+                                unsigned flags,
+                                unsigned long trace_ip)
 {
        unsigned depth_want = path->level;
        int ret = -((int) trans->restarted);
@@ -1177,31 +1174,14 @@ static int btree_path_traverse_one(struct btree_trans *trans,
 
        path->uptodate = BTREE_ITER_UPTODATE;
 out:
-       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+               panic("ret %s (%i) trans->restarted %s (%i)\n",
+                     bch2_err_str(ret), ret,
+                     bch2_err_str(trans->restarted), trans->restarted);
        bch2_btree_path_verify(trans, path);
        return ret;
 }
 
-int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
-                                         struct btree_path *path, unsigned flags)
-{
-       if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-               unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
-               u64 mask = ~(~0ULL << restart_probability_bits);
-
-               if ((prandom_u32() & mask) == mask) {
-                       trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
-                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
-               }
-       }
-
-       if (path->uptodate < BTREE_ITER_NEED_RELOCK)
-               return 0;
-
-       return  bch2_trans_cond_resched(trans) ?:
-               btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
                            struct btree_path *src)
 {
@@ -1237,10 +1217,6 @@ struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
        __btree_path_put(path, intent);
        path = btree_path_clone(trans, path, intent);
        path->preserve = false;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       path->ip_allocated = ip;
-#endif
-       btree_trans_verify_sorted(trans);
        return path;
 }
 
@@ -1251,14 +1227,13 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
 {
        unsigned level = path->level;
 
-       EBUG_ON(trans->restarted);
+       bch2_trans_verify_not_in_restart(trans);
        EBUG_ON(!path->ref);
 
        path = bch2_btree_path_make_mut(trans, path, intent, ip);
 
-       path->pos = new_pos;
-
-       bch2_btree_path_check_sort_fast(trans, path, cmp);
+       path->pos               = new_pos;
+       trans->paths_sorted     = false;
 
        if (unlikely(path->cached)) {
                btree_node_unlock(trans, path, 0);
@@ -1381,6 +1356,21 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p
        __bch2_path_free(trans, path);
 }
 
+void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+{
+       panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+             trans->restart_count, restart_count,
+             (void *) trans->last_restarted_ip);
+}
+
+void bch2_trans_in_restart_error(struct btree_trans *trans)
+{
+       panic("in transaction restart: %s, last restarted by %pS\n",
+             bch2_err_str(trans->restarted),
+             (void *) trans->last_restarted_ip);
+}
+
+noinline __cold
 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
 {
        struct btree_insert_entry *i;
@@ -1421,6 +1411,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
        printbuf_exit(&buf);
 }
 
+noinline __cold
 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
 {
        prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
@@ -1432,39 +1423,59 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
        bch2_bpos_to_text(out, path->pos);
 
        prt_printf(out, " locks %u", path->nodes_locked);
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
        prt_printf(out, " %pS", (void *) path->ip_allocated);
 #endif
        prt_newline(out);
 }
 
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+noinline __cold
+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
+                               bool nosort)
 {
        struct btree_path *path;
        unsigned idx;
 
+       if (!nosort)
+               btree_trans_sort_paths(trans);
+
        trans_for_each_path_inorder(trans, path, idx)
                bch2_btree_path_to_text(out, path);
 }
 
 noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+       __bch2_trans_paths_to_text(out, trans, false);
+}
+
+noinline __cold
+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
 {
        struct printbuf buf = PRINTBUF;
 
-       bch2_trans_paths_to_text(&buf, trans);
+       __bch2_trans_paths_to_text(&buf, trans, nosort);
        bch2_trans_updates_to_text(&buf, trans);
 
        bch2_print_string_as_lines(KERN_ERR, buf.buf);
        printbuf_exit(&buf);
 }
 
-noinline
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+       __bch2_dump_trans_paths_updates(trans, false);
+}
+
+noinline __cold
 static void bch2_trans_update_max_paths(struct btree_trans *trans)
 {
        struct btree_transaction_stats *s = btree_trans_stats(trans);
        struct printbuf buf = PRINTBUF;
 
+       if (!s)
+               return;
+
        bch2_trans_paths_to_text(&buf, trans);
 
        if (!buf.allocation_failure) {
@@ -1478,6 +1489,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
        }
 
        printbuf_exit(&buf);
+
+       trans->nr_max_paths = hweight64(trans->paths_allocated);
 }
 
 static noinline void btree_path_overflow(struct btree_trans *trans)
@@ -1497,19 +1510,24 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
                btree_path_overflow(trans);
 
        idx = __ffs64(~trans->paths_allocated);
-       trans->paths_allocated |= 1ULL << idx;
 
+       /*
+        * Do this before marking the new path as allocated, since it won't be
+        * initialized yet:
+        */
        if (unlikely(idx > trans->nr_max_paths))
                bch2_trans_update_max_paths(trans);
 
-       path = &trans->paths[idx];
+       trans->paths_allocated |= 1ULL << idx;
 
+       path = &trans->paths[idx];
        path->idx               = idx;
        path->ref               = 0;
        path->intent_ref        = 0;
        path->nodes_locked      = 0;
 
        btree_path_list_add(trans, pos, path);
+       trans->paths_sorted = false;
        return path;
 }
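
[btree_path_alloc() above allocates from a 64-bit occupancy bitmap: __ffs64(~paths_allocated) yields the lowest free slot, and the bit is only set after bch2_trans_update_max_paths() has run, so the stats dump never visits the uninitialized path. The core idiom, as a standalone sketch:

	/* Minimal 64-slot bitmap allocator, same idiom as btree_path_alloc(): */
	static int slot_alloc(u64 *allocated)
	{
		int idx;

		if (*allocated == ~0ULL)
			return -1;		/* all 64 slots in use */

		idx = __ffs64(~*allocated);	/* lowest clear bit */
		*allocated |= 1ULL << idx;
		return idx;
	}
]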
 
@@ -1523,10 +1541,11 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
        bool intent = flags & BTREE_ITER_INTENT;
        int i;
 
-       EBUG_ON(trans->restarted);
-       btree_trans_verify_sorted(trans);
+       bch2_trans_verify_not_in_restart(trans);
        bch2_trans_verify_locks(trans);
 
+       btree_trans_sort_paths(trans);
+
        trans_for_each_path_inorder(trans, path, i) {
                if (__btree_path_cmp(path,
                                     btree_id,
@@ -1559,10 +1578,10 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
                path->nodes_locked              = 0;
                for (i = 0; i < ARRAY_SIZE(path->l); i++)
                        path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
                path->ip_allocated              = ip;
 #endif
-               btree_trans_verify_sorted(trans);
+               trans->paths_sorted             = false;
        }
 
        if (!(flags & BTREE_ITER_NOPRESERVE))
@@ -1613,7 +1632,8 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
                EBUG_ON(ck &&
                        (path->btree_id != ck->key.btree_id ||
                         !bkey_eq(path->pos, ck->key.pos)));
-               EBUG_ON(!ck || !ck->valid);
+               if (!ck || !ck->valid)
+                       return bkey_s_c_null;
 
                *u = ck->k->k;
                k = bkey_i_to_s_c(ck->k);
@@ -1697,7 +1717,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        struct btree *b = NULL;
        int ret;
 
-       BUG_ON(trans->restarted);
+       bch2_trans_verify_not_in_restart(trans);
        EBUG_ON(iter->path->cached);
        bch2_btree_iter_verify(iter);
 
@@ -1798,19 +1818,18 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
        return ret;
 }
 
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
-                                                     enum btree_id btree_id,
-                                                     struct bpos pos)
+static noinline
+struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
 {
        struct btree_insert_entry *i;
        struct bkey_i *ret = NULL;
 
-       trans_for_each_update(trans, i) {
-               if (i->btree_id < btree_id)
+       trans_for_each_update(iter->trans, i) {
+               if (i->btree_id < iter->btree_id)
                        continue;
-               if (i->btree_id > btree_id)
+               if (i->btree_id > iter->btree_id)
                        break;
-               if (bpos_lt(i->k->k.p, pos))
+               if (bpos_lt(i->k->k.p, iter->path->pos))
                        continue;
                if (i->key_cache_already_flushed)
                        continue;
@@ -1821,30 +1840,44 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
        return ret;
 }
 
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+{
+       return iter->flags & BTREE_ITER_WITH_UPDATES
+               ? __bch2_btree_trans_peek_updates(iter)
+               : NULL;
+}
+
 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
                                       struct btree_iter *iter,
-                                      struct bpos start_pos,
                                       struct bpos end_pos)
 {
        struct bkey_i *k;
 
-       if (bpos_lt(start_pos, iter->journal_pos))
+       if (bpos_lt(iter->path->pos, iter->journal_pos))
                iter->journal_idx = 0;
 
        k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
                                        iter->path->level,
-                                       start_pos, end_pos,
+                                       iter->path->pos,
+                                       end_pos,
                                        &iter->journal_idx);
 
        iter->journal_pos = k ? k->k.p : end_pos;
        return k;
 }
 
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
-                                           struct btree_iter *iter,
-                                           struct bpos pos)
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+                                             struct btree_iter *iter)
 {
-       return bch2_btree_journal_peek(trans, iter, pos, pos);
+       struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+
+       if (k) {
+               iter->k = k->k;
+               return bkey_i_to_s_c(k);
+       } else {
+               return bkey_s_c_null;
+       }
 }
 
 static noinline
@@ -1853,7 +1886,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
                                         struct bkey_s_c k)
 {
        struct bkey_i *next_journal =
-               bch2_btree_journal_peek(trans, iter, iter->path->pos,
+               bch2_btree_journal_peek(trans, iter,
                                k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
 
        if (next_journal) {
@@ -1869,42 +1902,46 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
  * bkey_s_c_null:
  */
 static noinline
-struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
 {
        struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct bkey u;
+       struct bkey_s_c k;
        int ret;
 
+       if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+           bpos_eq(iter->pos, pos))
+               return bkey_s_c_null;
+
        if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
                return bkey_s_c_null;
 
        if (!iter->key_cache_path)
                iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
                                                     iter->flags & BTREE_ITER_INTENT, 0,
-                                                    iter->flags|BTREE_ITER_CACHED,
+                                                    iter->flags|BTREE_ITER_CACHED|
+                                                    BTREE_ITER_CACHED_NOFILL,
                                                     _THIS_IP_);
 
        iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
 
-       ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
+       ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                        iter->flags|BTREE_ITER_CACHED) ?:
+               bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
        btree_path_set_should_be_locked(iter->key_cache_path);
 
-       return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
-{
-       struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
-       int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
-
-       return err ? bkey_s_c_err(err) : ret;
+       k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+       if (k.k && !bkey_err(k)) {
+               iter->k = u;
+               k.k = &iter->k;
+       }
+       return k;
 }
 
 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
@@ -1959,9 +1996,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
                if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
                        k = btree_trans_peek_journal(trans, iter, k);
 
-               next_update = iter->flags & BTREE_ITER_WITH_UPDATES
-                       ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
-                       : NULL;
+               next_update = btree_trans_peek_updates(iter);
+
                if (next_update &&
                    bpos_le(next_update->k.p,
                            k.k ? k.k->p : l->b->key.k.p)) {
@@ -2114,8 +2150,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
        btree_path_set_should_be_locked(iter->path);
 out_no_locked:
        if (iter->update_path) {
-               if (iter->update_path->uptodate &&
-                   (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
+               ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+               if (unlikely(ret))
                        k = bkey_s_c_err(ret);
                else
                        btree_path_set_should_be_locked(iter->update_path);
@@ -2293,8 +2329,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                        k = btree_path_level_prev(trans, iter->path,
                                                  &iter->path->l[0], &iter->k);
 
-               bch2_btree_path_check_sort(trans, iter->path, 0);
-
                if (likely(k.k)) {
                        if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
                                if (k.k->p.snapshot == iter->snapshot)
@@ -2419,9 +2453,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
            !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
                struct bkey_i *next_update;
 
-               if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
-                   (next_update = btree_trans_peek_updates(trans,
-                                               iter->btree_id, search_key)) &&
+               if ((next_update = btree_trans_peek_updates(iter)) &&
                    bpos_eq(next_update->k.p, iter->pos)) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
@@ -2429,15 +2461,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                }
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
-                   (next_update = bch2_btree_journal_peek_slot(trans,
-                                       iter, iter->pos))) {
-                       iter->k = next_update->k;
-                       k = bkey_i_to_s_c(next_update);
+                   (k = btree_trans_peek_slot_journal(trans, iter)).k)
                        goto out;
-               }
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
-                   (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
+                   (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
                        if (!bkey_err(k))
                                iter->k = *k.k;
                        /* We're not returning a key from iter->path: */
@@ -2529,27 +2557,29 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
 
 /* new transactional stuff: */
 
-static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
-                                               struct btree_path *path)
-{
-       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-       EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
-       EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-}
-
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
 #ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
+{
+       struct btree_path *path;
        unsigned i;
 
-       for (i = 0; i < trans->nr_sorted; i++)
-               btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
-#endif
+       BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+
+       trans_for_each_path(trans, path) {
+               BUG_ON(path->sorted_idx >= trans->nr_sorted);
+               BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+       }
+
+       for (i = 0; i < trans->nr_sorted; i++) {
+               unsigned idx = trans->sorted[i];
+
+               EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+               BUG_ON(trans->paths[idx].sorted_idx != i);
+       }
 }
 
 static void btree_trans_verify_sorted(struct btree_trans *trans)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
        struct btree_path *path, *prev = NULL;
        unsigned i;
 
@@ -2558,80 +2588,54 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
 
        trans_for_each_path_inorder(trans, path, i) {
                if (prev && btree_path_cmp(prev, path) > 0) {
-                       bch2_dump_trans_paths_updates(trans);
+                       __bch2_dump_trans_paths_updates(trans, true);
                        panic("trans paths out of order!\n");
                }
                prev = path;
        }
-#endif
-}
-
-static inline void btree_path_swap(struct btree_trans *trans,
-                                  struct btree_path *l, struct btree_path *r)
-{
-       swap(l->sorted_idx, r->sorted_idx);
-       swap(trans->sorted[l->sorted_idx],
-            trans->sorted[r->sorted_idx]);
-
-       btree_path_verify_sorted_ref(trans, l);
-       btree_path_verify_sorted_ref(trans, r);
-}
-
-static inline struct btree_path *sib_btree_path(struct btree_trans *trans,
-                                               struct btree_path *path, int sib)
-{
-       unsigned idx = (unsigned) path->sorted_idx + sib;
-
-       EBUG_ON(sib != -1 && sib != 1);
-
-       return idx < trans->nr_sorted
-               ? trans->paths + trans->sorted[idx]
-               : NULL;
 }
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
+#endif
 
-static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
-                                                  struct btree_path *path,
-                                                  int cmp)
+void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
 {
-       struct btree_path *n;
-       int cmp2;
-
-       EBUG_ON(!cmp);
-
-       while ((n = sib_btree_path(trans, path, cmp)) &&
-              (cmp2 = btree_path_cmp(n, path)) &&
-              cmp2 != cmp)
-               btree_path_swap(trans, n, path);
-
-       btree_trans_verify_sorted(trans);
-}
+       int i, l = 0, r = trans->nr_sorted, inc = 1;
+       bool swapped;
 
-inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
-                                      int cmp)
-{
-       struct btree_path *n;
+       btree_trans_verify_sorted_refs(trans);
 
-       if (cmp <= 0) {
-               n = prev_btree_path(trans, path);
-               if (n && btree_path_cmp(n, path) > 0) {
-                       do {
-                               btree_path_swap(trans, n, path);
-                               n = prev_btree_path(trans, path);
-                       } while (n && btree_path_cmp(n, path) > 0);
+       if (trans->paths_sorted)
+               goto out;
 
-                       goto out;
+       /*
+        * Cocktail shaker sort: this is efficient because iterators will be
+        * mostly sorted.
+        */
+       do {
+               swapped = false;
+
+               for (i = inc > 0 ? l : r - 2;
+                    i + 1 < r && i >= l;
+                    i += inc) {
+                       if (btree_path_cmp(trans->paths + trans->sorted[i],
+                                          trans->paths + trans->sorted[i + 1]) > 0) {
+                               swap(trans->sorted[i], trans->sorted[i + 1]);
+                               trans->paths[trans->sorted[i]].sorted_idx = i;
+                               trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
+                               swapped = true;
+                       }
                }
-       }
 
-       if (cmp >= 0) {
-               n = next_btree_path(trans, path);
-               if (n && btree_path_cmp(path, n) > 0) {
-                       do {
-                               btree_path_swap(trans, path, n);
-                               n = next_btree_path(trans, path);
-                       } while (n && btree_path_cmp(path, n) > 0);
-               }
-       }
+               if (inc > 0)
+                       --r;
+               else
+                       l++;
+               inc = -inc;
+       } while (swapped);
+
+       trans->paths_sorted = true;
 out:
        btree_trans_verify_sorted(trans);
 }
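
The comment in the new __bch2_btree_trans_sort_paths() carries the whole argument: trans->sorted is almost always nearly sorted on entry, so a bidirectional bubble sort ("cocktail shaker" sort) with a window that shrinks from whichever end was just swept finishes in a pass or two, beating a general O(n log n) sort at these sizes. A standalone sketch of the same loop structure on a plain int array (illustrative code, not from the tree):

    #include <stdbool.h>
    #include <stdio.h>

    /* Same shrinking-window, alternating-direction scheme as
     * __bch2_btree_trans_sort_paths(), minus the sorted_idx fixups. */
    static void shaker_sort(int *a, int n)
    {
        int i, l = 0, r = n, inc = 1;
        bool swapped;

        do {
            swapped = false;
            for (i = inc > 0 ? l : r - 2;
                 i + 1 < r && i >= l;
                 i += inc)
                if (a[i] > a[i + 1]) {
                    int t = a[i];
                    a[i] = a[i + 1];
                    a[i + 1] = t;
                    swapped = true;
                }
            if (inc > 0)
                --r;    /* forward pass pinned the max at r - 1 */
            else
                l++;    /* backward pass pinned the min at l */
            inc = -inc;
        } while (swapped);
    }

    int main(void)
    {
        int a[] = { 1, 2, 4, 3, 5, 6 };    /* nearly sorted: two passes */
        shaker_sort(a, 6);
        for (int i = 0; i < 6; i++)
            printf("%d ", a[i]);
        printf("\n");
        return 0;
    }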
@@ -2642,15 +2646,18 @@ static inline void btree_path_list_remove(struct btree_trans *trans,
        unsigned i;
 
        EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       trans->nr_sorted--;
+       memmove_u64s_down_small(trans->sorted + path->sorted_idx,
+                               trans->sorted + path->sorted_idx + 1,
+                               DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+#else
        array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-
+#endif
        for (i = path->sorted_idx; i < trans->nr_sorted; i++)
                trans->paths[trans->sorted[i]].sorted_idx = i;
 
        path->sorted_idx = U8_MAX;
-
-       btree_trans_verify_sorted_refs(trans);
 }
 
 static inline void btree_path_list_add(struct btree_trans *trans,
@@ -2659,16 +2666,17 @@ static inline void btree_path_list_add(struct btree_trans *trans,
 {
        unsigned i;
 
-       btree_trans_verify_sorted_refs(trans);
-
-       path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
-
-       if (unlikely(trans->in_traverse_all) &&
-           trans->traverse_all_idx != U8_MAX &&
-           trans->traverse_all_idx >= path->sorted_idx)
-               trans->traverse_all_idx++;
+       path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
 
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
+                             trans->sorted + path->sorted_idx,
+                             DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+       trans->nr_sorted++;
+       trans->sorted[path->sorted_idx] = path->idx;
+#else
        array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+#endif
 
        for (i = path->sorted_idx; i < trans->nr_sorted; i++)
                trans->paths[trans->sorted[i]].sorted_idx = i;
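
Both list-maintenance hunks replace the byte-at-a-time array_insert_item()/array_remove_item() with memmove_u64s_up_small()/memmove_u64s_down_small(), which copy whole u64s when unaligned access is cheap: DIV_ROUND_UP(nbytes, 8) rounds the tail up to u64 granularity, and the btree_types.h hunk later in this commit pads trans->sorted to BTREE_ITER_MAX + 8 so the overshoot stays inside the struct. A freestanding sketch of that invariant (helper and field names here are illustrative, and pos is assumed valid):

    #include <stdint.h>
    #include <string.h>

    #define N 64

    /* Illustrative stand-in for memmove_u64s_{up,down}_small(): moves
     * nr_u64s whole 64-bit words, possibly past the live end of the
     * array - hence the 8 bytes of slack after sorted[N]. */
    static void move_u64s(void *dst, const void *src, size_t nr_u64s)
    {
        memmove(dst, src, nr_u64s * sizeof(uint64_t));
    }

    struct sorted_idx_array {
        uint8_t  sorted[N + 8];    /* +8: u64 copies may overshoot */
        unsigned nr;               /* live entries, always < N on insert */
    };

    static void idx_insert(struct sorted_idx_array *s, unsigned pos, uint8_t v)
    {
        move_u64s(s->sorted + pos + 1, s->sorted + pos,
                  (s->nr - pos + 7) / 8);    /* DIV_ROUND_UP(bytes, 8) */
        s->nr++;
        s->sorted[pos] = v;
    }

    static void idx_remove(struct sorted_idx_array *s, unsigned pos)
    {
        s->nr--;
        move_u64s(s->sorted + pos, s->sorted + pos + 1,
                  (s->nr - pos + 7) / 8);
    }

A move can touch up to seven bytes past the last live entry, which the eight bytes of padding absorb.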
@@ -2812,14 +2820,6 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        trans->restart_count++;
        trans->mem_top                  = 0;
 
-       if (trans->fs_usage_deltas) {
-               trans->fs_usage_deltas->used = 0;
-               memset((void *) trans->fs_usage_deltas +
-                      offsetof(struct replicas_delta_list, memset_start), 0,
-                      (void *) &trans->fs_usage_deltas->memset_end -
-                      (void *) &trans->fs_usage_deltas->memset_start);
-       }
-
        trans_for_each_path(trans, path) {
                path->should_be_locked = false;
 
@@ -2850,25 +2850,19 @@ u32 bch2_trans_begin(struct btree_trans *trans)
                bch2_trans_relock(trans);
        }
 
-       if (unlikely(time_after(jiffies, trans->srcu_lock_time + HZ)))
+       if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
                bch2_trans_reset_srcu_lock(trans);
 
        trans->last_restarted_ip = _RET_IP_;
-       if (trans->restarted)
+       if (trans->restarted) {
                bch2_btree_path_traverse_all(trans);
+               trans->notrace_relock_fail = false;
+       }
 
        trans->last_begin_time = local_clock();
        return trans->restart_count;
 }
 
-void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
-{
-       if (trans_was_restarted(trans, restart_count))
-               panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
-                     trans->restart_count, restart_count,
-                     (void *) trans->last_restarted_ip);
-}
-
 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
 {
        size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
@@ -2908,7 +2902,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
        __acquires(&c->btree_trans_barrier)
 {
        struct btree_transaction_stats *s;
-       struct btree_trans *pos;
 
        BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
 
@@ -2944,16 +2937,20 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
        trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        trans->srcu_lock_time   = jiffies;
 
-       mutex_lock(&c->btree_trans_lock);
-       list_for_each_entry(pos, &c->btree_trans_list, list) {
-               if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
-                       list_add_tail(&trans->list, &pos->list);
-                       goto list_add_done;
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+               struct btree_trans *pos;
+
+               mutex_lock(&c->btree_trans_lock);
+               list_for_each_entry(pos, &c->btree_trans_list, list) {
+                       if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+                               list_add_tail(&trans->list, &pos->list);
+                               goto list_add_done;
+                       }
                }
-       }
-       list_add_tail(&trans->list, &c->btree_trans_list);
+               list_add_tail(&trans->list, &c->btree_trans_list);
 list_add_done:
-       mutex_unlock(&c->btree_trans_lock);
+               mutex_unlock(&c->btree_trans_lock);
+       }
 }
 
 static void check_btree_paths_leaked(struct btree_trans *trans)
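
Membership of c->btree_trans_list (kept ordered by pid so the debug listing of transactions comes out stable) is an ordinary ordered insert into a circular doubly-linked list, and this commit compiles it in only under CONFIG_BCACHEFS_DEBUG_TRANSACTIONS, since the list plus its global mutex cost something on every transaction init and exit. The same walk-and-insert in a self-contained miniature (simplified list helpers, illustrative names):

    #include <stddef.h>

    struct list_head { struct list_head *prev, *next; };

    /* Insert new before pos - list_add_tail() semantics. */
    static void list_add_before(struct list_head *new, struct list_head *pos)
    {
        new->prev = pos->prev;
        new->next = pos;
        pos->prev->next = new;
        pos->prev = new;
    }

    #define list_entry(ptr, type, member) \
        ((type *) ((char *) (ptr) - offsetof(type, member)))

    struct trans { struct list_head list; int pid; };

    /* head is a circular list head (head->next == head when empty).
     * Insert in front of the first entry with a larger pid, else at
     * the tail. */
    static void trans_list_add_sorted(struct trans *new, struct list_head *head)
    {
        struct list_head *p;

        for (p = head->next; p != head; p = p->next)
            if (new->pid < list_entry(p, struct trans, list)->pid) {
                list_add_before(&new->list, p);
                return;
            }
        list_add_before(&new->list, head);
    }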
@@ -2998,9 +2995,11 @@ void bch2_trans_exit(struct btree_trans *trans)
 
        check_btree_paths_leaked(trans);
 
-       mutex_lock(&c->btree_trans_lock);
-       list_del(&trans->list);
-       mutex_unlock(&c->btree_trans_lock);
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+               mutex_lock(&c->btree_trans_lock);
+               list_del(&trans->list);
+               mutex_unlock(&c->btree_trans_lock);
+       }
 
        srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
 
@@ -3098,7 +3097,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 
        b = READ_ONCE(trans->locking);
        if (b) {
-               prt_str(out, "  want");
+               prt_printf(out, "  blocked for %lluus on",
+                          div_u64(local_clock() - trans->locking_wait.start_time,
+                                  1000));
                prt_newline(out);
                prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
                bch2_btree_bkey_cached_common_to_text(out, b);
@@ -3112,8 +3113,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 
        for (s = c->btree_transaction_stats;
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-            s++)
+            s++) {
                kfree(s->max_paths_text);
+               bch2_time_stats_exit(&s->lock_hold_times);
+       }
 
        if (c->btree_trans_barrier_initialized)
                cleanup_srcu_struct(&c->btree_trans_barrier);
@@ -3123,11 +3126,16 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
-       unsigned i, nr = BTREE_ITER_MAX;
+       struct btree_transaction_stats *s;
+       unsigned nr = BTREE_ITER_MAX;
        int ret;
 
-       for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
-               mutex_init(&c->btree_transaction_stats[i].lock);
+       for (s = c->btree_transaction_stats;
+            s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+            s++) {
+               bch2_time_stats_init(&s->lock_hold_times);
+               mutex_init(&s->lock);
+       }
 
        INIT_LIST_HEAD(&c->btree_trans_list);
        mutex_init(&c->btree_trans_lock);
index 07c415d572262d2f18ef92ddb1f0f5862bc7904f..0ede02c34eac59ce84828f860ee91a3711fdefa4 100644 (file)
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -54,6 +54,16 @@ static inline struct btree *btree_node_parent(struct btree_path *path,
 
 /* Iterate over paths within a transaction: */
 
+void __bch2_btree_trans_sort_paths(struct btree_trans *);
+
+static inline void btree_trans_sort_paths(struct btree_trans *trans)
+{
+       if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+           trans->paths_sorted)
+               return;
+       __bch2_btree_trans_sort_paths(trans);
+}
+
 static inline struct btree_path *
 __trans_next_path(struct btree_trans *trans, unsigned idx)
 {
@@ -72,8 +82,6 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
        return &trans->paths[idx];
 }
 
-void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
-
 #define trans_for_each_path_from(_trans, _path, _start)                        \
        for (_path = __trans_next_path((_trans), _start);               \
             (_path);                                                   \
@@ -95,9 +103,10 @@ static inline struct btree_path *next_btree_path(struct btree_trans *trans, stru
 
 static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
-       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-       return path->sorted_idx
-               ? trans->paths + trans->sorted[path->sorted_idx - 1]
+       unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
+
+       return idx
+               ? trans->paths + trans->sorted[idx - 1]
                : NULL;
 }
 
@@ -106,6 +115,11 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru
             ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
             _i++)
 
+#define trans_for_each_path_inorder_reverse(_trans, _path, _i)         \
+       for (_i = trans->nr_sorted - 1;                                 \
+            ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
+            --_i)
+
 static inline bool __path_has_node(const struct btree_path *path,
                                   const struct btree *b)
 {
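
The new trans_for_each_path_inorder_reverse() walks the sorted[] index array from the top down. A runnable miniature of the macro's shape; note that the in-kernel comma form evaluates the element load before testing _i >= 0 (so it briefly indexes sorted[-1], which lands in adjacent struct memory), while this standalone version tests the bound first:

    #include <stdio.h>

    struct path { int pos; };

    struct trans {
        struct path   paths[4];
        unsigned char sorted[4];    /* indices into paths[], key order */
        int           nr_sorted;
    };

    #define trans_for_each_path_inorder_reverse(t, p, i)              \
        for ((i) = (t)->nr_sorted - 1;                                \
             (i) >= 0 && ((p) = &(t)->paths[(t)->sorted[(i)]], 1);    \
             --(i))

    int main(void)
    {
        struct trans t = {
            .paths     = { {10}, {30}, {20}, {40} },
            .sorted    = { 0, 2, 1, 3 },    /* in order: 10 20 30 40 */
            .nr_sorted = 4,
        };
        struct path *p;
        int i;

        trans_for_each_path_inorder_reverse(&t, p, i)
            printf("%d ", p->pos);          /* prints: 40 30 20 10 */
        printf("\n");
        return 0;
    }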
@@ -161,6 +175,18 @@ bch2_btree_path_set_pos(struct btree_trans *trans,
                : path;
 }
 
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+                                             unsigned, unsigned long);
+
+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+                                         struct btree_path *path, unsigned flags)
+{
+       if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+               return 0;
+
+       return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
+
 int __must_check bch2_btree_path_traverse(struct btree_trans *,
                                          struct btree_path *, unsigned);
 struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
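
bch2_btree_path_traverse() is now a header inline whose only job is the early return: when the path is fully up to date, the caller pays one load and one compare, and only the out-of-line bch2_btree_path_traverse_one(), which also receives _RET_IP_ so diagnostics name the real caller, does any work. The pattern in miniature (toy names; _RET_IP_ approximated with __builtin_return_address(0), which is how the kernel defines it):

    #include <stdio.h>

    struct thing { int uptodate; };    /* 0 == fully up to date */

    /* Out-of-line slow path: does the work, knows who asked. */
    static int traverse_one(struct thing *t, unsigned long ip)
    {
        printf("slow path entered from %#lx\n", ip);
        t->uptodate = 0;
        return 0;
    }

    /* Header-style inline fast path. */
    static inline int traverse(struct thing *t)
    {
        if (!t->uptodate)
            return 0;
        return traverse_one(t,
            (unsigned long) __builtin_return_address(0));
    }

    int main(void)
    {
        struct thing t = { .uptodate = 1 };
        traverse(&t);    /* takes the slow path once */
        traverse(&t);    /* fast path: no call, no output */
        return 0;
    }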
@@ -193,6 +219,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
 void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
 
 int bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock_notrace(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
 bool bch2_trans_locked(struct btree_trans *);
 
@@ -201,7 +228,22 @@ static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_co
        return restart_count != trans->restart_count;
 }
 
-void bch2_trans_verify_not_restarted(struct btree_trans *, u32);
+void bch2_trans_restart_error(struct btree_trans *, u32);
+
+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
+                                                  u32 restart_count)
+{
+       if (trans_was_restarted(trans, restart_count))
+               bch2_trans_restart_error(trans, restart_count);
+}
+
+void bch2_trans_in_restart_error(struct btree_trans *);
+
+static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
+{
+       if (trans->restarted)
+               bch2_trans_in_restart_error(trans);
+}
 
 __always_inline
 static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
index dcd1a4796efe29e5260b8ea7e5325ab9bad3dd37..13df0d4086347a5c45ee54a9cf4f7c6504c27ce1 100644 (file)
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -12,6 +12,7 @@
 #include "journal_reclaim.h"
 
 #include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
 #include <trace/events/bcachefs.h>
 
 static inline bool btree_uses_pcpu_readers(enum btree_id id)
@@ -56,13 +57,12 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
        if (!six_trylock_intent(&ck->c.lock))
                return false;
 
-       if (!six_trylock_write(&ck->c.lock)) {
+       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_intent(&ck->c.lock);
                return false;
        }
 
-       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               six_unlock_write(&ck->c.lock);
+       if (!six_trylock_write(&ck->c.lock)) {
                six_unlock_intent(&ck->c.lock);
                return false;
        }
@@ -197,6 +197,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bkey_cached *ck = NULL;
        bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+       int ret;
 
        if (!pcpu_readers) {
 #ifdef __KERNEL__
@@ -244,7 +245,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
        if (ck) {
                int ret;
 
-               ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent);
+               ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
                if (unlikely(ret)) {
                        bkey_cached_move_to_freelist(bc, ck);
                        return ERR_PTR(ret);
@@ -264,22 +265,33 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                return ck;
        }
 
-       /* GFP_NOFS because we're holding btree locks: */
-       ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
-       if (likely(ck)) {
-               INIT_LIST_HEAD(&ck->list);
-               __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
-               if (pcpu_readers)
-                       six_lock_pcpu_alloc(&ck->c.lock);
+       ck = kmem_cache_zalloc(bch2_key_cache, GFP_NOWAIT|__GFP_NOWARN);
+       if (likely(ck))
+               goto init;
 
-               ck->c.cached = true;
-               BUG_ON(!six_trylock_intent(&ck->c.lock));
-               BUG_ON(!six_trylock_write(&ck->c.lock));
-               *was_new = true;
-               return ck;
+       bch2_trans_unlock(trans);
+
+       ck = kmem_cache_zalloc(bch2_key_cache, GFP_KERNEL);
+
+       ret = bch2_trans_relock(trans);
+       if (ret) {
+               kmem_cache_free(bch2_key_cache, ck);
+               return ERR_PTR(ret);
        }
 
-       return NULL;
+       if (!ck)
+               return NULL;
+init:
+       INIT_LIST_HEAD(&ck->list);
+       __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+       if (pcpu_readers)
+               six_lock_pcpu_alloc(&ck->c.lock);
+
+       ck->c.cached = true;
+       BUG_ON(!six_trylock_intent(&ck->c.lock));
+       BUG_ON(!six_trylock_write(&ck->c.lock));
+       *was_new = true;
+       return ck;
 }
 
 static struct bkey_cached *
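
The allocation in bkey_cached_alloc() now follows a two-step pattern this commit uses repeatedly (btree_key_cache_fill() below does the same for the key buffer): first attempt a non-blocking GFP_NOWAIT allocation while btree locks are held; only on failure drop the locks, allocate with GFP_KERNEL, and relock, propagating the relock error (a transaction restart) if another thread got in. The control flow, reduced to a compilable userspace sketch with stand-in helpers:

    #include <stdlib.h>

    struct ctx { int locked; };

    static void *alloc_atomic(void)        { return NULL; /* simulate failure */ }
    static void *alloc_blocking(void)      { return malloc(64); }
    static void  unlock_all(struct ctx *c) { c->locked = 0; }
    static int   relock_all(struct ctx *c) { c->locked = 1; return 0; }

    /* Returns the object; NULL with *err set if relocking failed,
     * NULL with *err == 0 for a plain allocation failure. */
    static void *alloc_dropping_locks(struct ctx *c, int *err)
    {
        void *p = alloc_atomic();    /* may not block: locks held */

        *err = 0;
        if (p)
            return p;

        unlock_all(c);               /* now we're allowed to sleep */
        p = alloc_blocking();

        *err = relock_all(c);
        if (*err) {
            free(p);                 /* free(NULL) is a no-op */
            return NULL;
        }
        return p;
    }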
@@ -369,24 +381,23 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                                struct btree_path *ck_path,
                                struct bkey_cached *ck)
 {
-       struct btree_path *path;
+       struct btree_iter iter;
        struct bkey_s_c k;
        unsigned new_u64s = 0;
        struct bkey_i *new_k = NULL;
-       struct bkey u;
        int ret;
 
-       path = bch2_path_get(trans, ck->key.btree_id,
-                            ck->key.pos, 0, 0, 0, _THIS_IP_);
-       ret = bch2_btree_path_traverse(trans, path, 0);
+       bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+                            BTREE_ITER_KEY_CACHE_FILL|
+                            BTREE_ITER_CACHED_NOFILL);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
        if (ret)
                goto err;
 
-       k = bch2_btree_path_peek_slot(path, &u);
-
        if (!bch2_btree_node_relock(trans, ck_path, 0)) {
                trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
-               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
                goto err;
        }
 
@@ -405,12 +416,30 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 
        if (new_u64s > ck->u64s) {
                new_u64s = roundup_pow_of_two(new_u64s);
-               new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
+               new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
                if (!new_k) {
-                       bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
-                               bch2_btree_ids[ck->key.btree_id], new_u64s);
-                       ret = -ENOMEM;
-                       goto err;
+                       bch2_trans_unlock(trans);
+
+                       new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+                       if (!new_k) {
+                               bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+                                       bch2_btree_ids[ck->key.btree_id], new_u64s);
+                               ret = -ENOMEM;
+                               goto err;
+                       }
+
+                       if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+                               kfree(new_k);
+                               trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+                               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+                               goto err;
+                       }
+
+                       ret = bch2_trans_relock(trans);
+                       if (ret) {
+                               kfree(new_k);
+                               goto err;
+                       }
                }
        }
 
@@ -431,9 +460,9 @@ static int btree_key_cache_fill(struct btree_trans *trans,
        bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 
        /* We're not likely to need this iterator again: */
-       path->preserve = false;
+       set_btree_iter_dontneed(&iter);
 err:
-       bch2_path_put(trans, path, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -449,7 +478,7 @@ bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree
 
        path->l[1].b = NULL;
 
-       if (bch2_btree_node_relock(trans, path, 0)) {
+       if (bch2_btree_node_relock_notrace(trans, path, 0)) {
                ck = (void *) path->l[0].b;
                goto fill;
        }
@@ -487,7 +516,9 @@ retry:
        path->l[0].lock_seq     = ck->c.lock.state.seq;
        path->l[0].b            = (void *) ck;
 fill:
-       if (!ck->valid) {
+       path->uptodate = BTREE_ITER_UPTODATE;
+
+       if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
                /*
                 * Using the underscore version because we haven't set
                 * path->uptodate yet:
@@ -502,17 +533,23 @@ fill:
                ret = btree_key_cache_fill(trans, path, ck);
                if (ret)
                        goto err;
+
+               ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+               if (ret)
+                       goto err;
+
+               path->uptodate = BTREE_ITER_UPTODATE;
        }
 
        if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 
-       path->uptodate = BTREE_ITER_UPTODATE;
-       BUG_ON(!ck->valid);
        BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+       BUG_ON(path->uptodate);
 
        return ret;
 err:
+       path->uptodate = BTREE_ITER_NEED_TRAVERSE;
        if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                btree_node_unlock(trans, path, 0);
                path->l[0].b = ERR_PTR(ret);
@@ -531,7 +568,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path
 
        path->l[1].b = NULL;
 
-       if (bch2_btree_node_relock(trans, path, 0)) {
+       if (bch2_btree_node_relock_notrace(trans, path, 0)) {
                ck = (void *) path->l[0].b;
                goto fill;
        }
@@ -696,6 +733,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
                six_unlock_read(&ck->c.lock);
                goto unlock;
        }
+
+       if (ck->seq != seq) {
+               bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
+                                       bch2_btree_key_cache_journal_flush);
+               six_unlock_read(&ck->c.lock);
+               goto unlock;
+       }
        six_unlock_read(&ck->c.lock);
 
        ret = commit_do(&trans, NULL, NULL, 0,
@@ -725,6 +769,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
 }
 
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
+                                 unsigned flags,
                                  struct btree_path *path,
                                  struct bkey_i *insert)
 {
@@ -734,7 +779,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
        BUG_ON(insert->u64s > ck->u64s);
 
-       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
                int difference;
 
                BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);
@@ -757,8 +802,9 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
                        kick_reclaim = true;
        }
 
-       bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
-                               &ck->journal, bch2_btree_key_cache_journal_flush);
+       bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+                            &ck->journal, bch2_btree_key_cache_journal_flush);
+       ck->seq = trans->journal_res.seq;
 
        if (kick_reclaim)
                journal_reclaim_kick(&c->journal);
@@ -978,12 +1024,16 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
        INIT_LIST_HEAD(&c->freed_nonpcpu);
 }
 
-static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
 {
        struct btree_key_cache *bc =
                container_of(shrink, struct btree_key_cache, shrink);
+       char *cbuf;
+       size_t buflen = seq_buf_get_buf(s, &cbuf);
+       struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
 
-       bch2_btree_key_cache_to_text(out, bc);
+       bch2_btree_key_cache_to_text(&out, bc);
+       seq_buf_commit(s, out.pos);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
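
The shrinker to_text hook now takes the kernel's seq_buf rather than a printbuf; the shim above borrows the seq_buf's unused tail, wraps it in an "external" printbuf that formats in place without allocating, and commits back however many bytes were written. The same round trip with minimal stand-in types (these definitions are illustrative, not the kernel's):

    #include <stdio.h>

    struct seq_buf { char *buffer; size_t size, len; };

    static size_t seq_buf_get_buf(struct seq_buf *s, char **bufp)
    {
        *bufp = s->buffer + s->len;
        return s->len < s->size ? s->size - s->len : 0;
    }

    static void seq_buf_commit(struct seq_buf *s, int num)
    {
        if (num > 0)
            s->len += num;
    }

    struct printbuf { char *buf; size_t size, pos; };
    #define PRINTBUF_EXTERN(b, sz) ((struct printbuf) { .buf = (b), .size = (sz) })

    static void prt_str(struct printbuf *out, const char *str)
    {
        size_t avail = out->size - out->pos;
        int n;

        if (!avail)
            return;
        n = snprintf(out->buf + out->pos, avail, "%s", str);
        if (n > 0)
            out->pos += (size_t) n < avail ? (size_t) n : avail - 1;
    }

    /* The bridge: format into the seq_buf's spare space, then commit. */
    static void report_to_seq_buf(struct seq_buf *s)
    {
        char *cbuf;
        size_t buflen = seq_buf_get_buf(s, &cbuf);
        struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);

        prt_str(&out, "example report line\n");
        seq_buf_commit(s, out.pos);
    }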
index eccea15fca792614eb0e9fe782224016cea7df28..c86d5e48f6e33fa9691362804f5537e5af436cbb 100644 (file)
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -29,7 +29,7 @@ bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
 int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
                                    unsigned);
 
-bool bch2_btree_insert_key_cached(struct btree_trans *,
+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
                        struct btree_path *, struct bkey_i *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
                               enum btree_id, struct bpos);
index dce2dc0cc0c555a7e34873d05d6b0cd4c14cb191..1ddac23cc50985f4ace1c7dd2eec179c81214c06 100644 (file)
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -99,6 +99,12 @@ static void lock_graph_up(struct lock_graph *g)
        closure_put(&g->g[--g->nr].trans->ref);
 }
 
+static noinline void lock_graph_pop_all(struct lock_graph *g)
+{
+       while (g->nr)
+               lock_graph_up(g);
+}
+
 static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
 {
        closure_get(&trans->ref);
@@ -274,7 +280,25 @@ next:
                        b = &READ_ONCE(path->l[top->level].b)->c;
 
                        if (IS_ERR_OR_NULL(b)) {
-                               BUG_ON(!lock_graph_remove_non_waiters(&g));
+                               /*
+                                * If we get here, it means we raced with the
+                                * other thread updating its btree_path
+                                * structures - which means it can't be blocked
+                                * waiting on a lock:
+                                */
+                               if (!lock_graph_remove_non_waiters(&g)) {
+                                       /*
+                                        * If lock_graph_remove_non_waiters()
+                                        * didn't do anything, it must be
+                                        * because we're being called by debugfs
+                                        * checking for lock cycles, which
+                                        * invokes us on btree_transactions that
+                                        * aren't actually waiting on anything.
+                                        * Just bail out:
+                                        */
+                                       lock_graph_pop_all(&g);
+                               }
+
                                goto next;
                        }
 
@@ -335,7 +359,8 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p
         * locked:
         */
        six_lock_readers_add(&b->lock, -readers);
-       ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
+       ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
+                                      lock_may_not_fail, _RET_IP_);
        six_lock_readers_add(&b->lock, readers);
 
        if (ret)
@@ -407,7 +432,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
                return true;
        }
 fail:
-       if (trace)
+       if (trace && !trans->notrace_relock_fail)
                trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
        return false;
 }
@@ -504,6 +529,17 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
        return btree_path_get_locks(trans, path, false);
 }
 
+int __bch2_btree_path_relock(struct btree_trans *trans,
+                       struct btree_path *path, unsigned long trace_ip)
+{
+       if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+               trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+       }
+
+       return 0;
+}
+
 __flatten
 bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
@@ -615,6 +651,21 @@ int bch2_trans_relock(struct btree_trans *trans)
        return 0;
 }
 
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       if (unlikely(trans->restarted))
+               return -((int) trans->restarted);
+
+       trans_for_each_path(trans, path)
+               if (path->should_be_locked &&
+                   !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+               }
+       return 0;
+}
+
 void bch2_trans_unlock(struct btree_trans *trans)
 {
        struct btree_path *path;
index fb237c95ee13cb5d91e606207461fbfde5c7e2b7..3e14fe6041fa3c392d79b5b754b063ebc75124f4 100644 (file)
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -191,7 +191,8 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
 static inline int __btree_node_lock_nopath(struct btree_trans *trans,
                                         struct btree_bkey_cached_common *b,
                                         enum six_lock_type type,
-                                        bool lock_may_not_fail)
+                                        bool lock_may_not_fail,
+                                        unsigned long ip)
 {
        int ret;
 
@@ -199,8 +200,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
        trans->lock_must_abort  = false;
        trans->locking          = b;
 
-       ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
-                                  bch2_six_check_for_deadlock, trans);
+       ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait,
+                                  bch2_six_check_for_deadlock, trans, ip);
        WRITE_ONCE(trans->locking, NULL);
        WRITE_ONCE(trans->locking_wait.start_time, 0);
        return ret;
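
Threading an explicit `ip` argument down through __btree_node_lock_nopath() and into six_lock_type_ip_waiter() lets the lock-wait machinery attribute contention to the outermost caller rather than to a line inside the locking helpers. The discipline is simply: capture the instruction pointer once at the API boundary, then pass it down unchanged (a sketch; THIS_IP mirrors the kernel's _THIS_IP_ label trick, a gcc/clang extension):

    #include <stdio.h>

    #define THIS_IP ({ __label__ h; h: (unsigned long) &&h; })

    /* Bottom of the stack: records where the lock was requested. */
    static int lock_waiter(unsigned long ip)
    {
        printf("lock requested from %#lx\n", ip);
        return 0;
    }

    /* Intermediate helpers forward `ip` rather than capturing their
     * own, so diagnostics name the caller, not a helper. */
    static int lock_nopath(int may_not_fail, unsigned long ip)
    {
        (void) may_not_fail;
        return lock_waiter(ip);
    }

    int main(void)
    {
        return lock_nopath(0, THIS_IP);
    }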
@@ -209,16 +210,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
 static inline int __must_check
 btree_node_lock_nopath(struct btree_trans *trans,
                       struct btree_bkey_cached_common *b,
-                      enum six_lock_type type)
+                      enum six_lock_type type,
+                      unsigned long ip)
 {
-       return __btree_node_lock_nopath(trans, b, type, false);
+       return __btree_node_lock_nopath(trans, b, type, false, ip);
 }
 
 static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
                                         struct btree_bkey_cached_common *b,
                                         enum six_lock_type type)
 {
-       int ret = __btree_node_lock_nopath(trans, b, type, true);
+       int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
 
        BUG_ON(ret);
 }
@@ -258,7 +260,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
 
        if (likely(six_trylock_type(&b->lock, type)) ||
            btree_node_lock_increment(trans, b, level, type) ||
-           !(ret = btree_node_lock_nopath(trans, b, type))) {
+           !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
                path->l[b->level].lock_taken_time = local_clock();
 #endif
@@ -312,6 +314,17 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
 
 bool bch2_btree_path_relock_norestart(struct btree_trans *,
                                      struct btree_path *, unsigned long);
+int __bch2_btree_path_relock(struct btree_trans *,
+                            struct btree_path *, unsigned long);
+
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+                               struct btree_path *path, unsigned long trace_ip)
+{
+       return btree_node_locked(path, path->level)
+               ? 0
+               : __bch2_btree_path_relock(trans, path, trace_ip);
+}
+
 bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
 
 static inline bool bch2_btree_node_relock(struct btree_trans *trans,
@@ -338,17 +351,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
                 __bch2_btree_node_relock(trans, path, level, false));
 }
 
-static inline int bch2_btree_path_relock(struct btree_trans *trans,
-                               struct btree_path *path, unsigned long trace_ip)
-{
-       if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
-               trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
-               return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
-       }
-
-       return 0;
-}
-
 /* upgrade */
 
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
index af86ba12e3a4dbc236c7b6fdea9e74c9ec6e90c3..23e7f0cad6e3244562e2e053d236062307829db2 100644 (file)
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -10,6 +10,7 @@
 #include "buckets_types.h"
 #include "darray.h"
 #include "journal_types.h"
+#include "replicas_types.h"
 
 struct open_bucket;
 struct btree_update;
@@ -217,6 +218,8 @@ struct btree_node_iter {
 #define BTREE_ITER_ALL_SNAPSHOTS       (1 << 11)
 #define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 12)
 #define BTREE_ITER_NOPRESERVE          (1 << 13)
+#define BTREE_ITER_CACHED_NOFILL       (1 << 14)
+#define BTREE_ITER_KEY_CACHE_FILL      (1 << 15)
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -224,6 +227,10 @@ enum btree_path_uptodate {
        BTREE_ITER_NEED_TRAVERSE        = 2,
 };
 
+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
+#define TRACK_PATH_ALLOCATED
+#endif
+
 struct btree_path {
        u8                      idx;
        u8                      sorted_idx;
@@ -254,7 +261,7 @@ struct btree_path {
                u64             lock_taken_time;
 #endif
        }                       l[BTREE_MAX_DEPTH];
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
        unsigned long           ip_allocated;
 #endif
 };
@@ -264,6 +271,15 @@ static inline struct btree_path_level *path_l(struct btree_path *path)
        return path->l + path->level;
 }
 
+static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+{
+#ifdef TRACK_PATH_ALLOCATED
+       return path->ip_allocated;
+#else
+       return _THIS_IP_;
+#endif
+}
+
 /*
  * @pos                        - iterator's current position
  * @level              - current btree depth
@@ -297,7 +313,7 @@ struct btree_iter {
        /* BTREE_ITER_WITH_JOURNAL: */
        size_t                  journal_idx;
        struct bpos             journal_pos;
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
        unsigned long           ip_allocated;
 #endif
 };
@@ -344,6 +360,7 @@ struct bkey_cached {
 
        struct journal_preres   res;
        struct journal_entry_pin journal;
+       u64                     seq;
 
        struct bkey_i           *k;
 };
@@ -412,12 +429,14 @@ struct btree_trans {
        u8                      fn_idx;
        u8                      nr_sorted;
        u8                      nr_updates;
-       u8                      traverse_all_idx;
        bool                    used_mempool:1;
        bool                    in_traverse_all:1;
+       bool                    paths_sorted:1;
        bool                    memory_allocation_failure:1;
-       bool                    is_initial_gc:1;
+       bool                    journal_transaction_names:1;
        bool                    journal_replay_not_finished:1;
+       bool                    is_initial_gc:1;
+       bool                    notrace_relock_fail:1;
        enum bch_errcode        restarted:16;
        u32                     restart_count;
        unsigned long           last_restarted_ip;
@@ -437,7 +456,7 @@ struct btree_trans {
        unsigned                mem_bytes;
        void                    *mem;
 
-       u8                      sorted[BTREE_ITER_MAX];
+       u8                      sorted[BTREE_ITER_MAX + 8];
        struct btree_path       *paths;
        struct btree_insert_entry *updates;
 
@@ -450,7 +469,6 @@ struct btree_trans {
        struct journal_preres   journal_preres;
        u64                     *journal_seq;
        struct disk_reservation *disk_res;
-       unsigned                flags;
        unsigned                journal_u64s;
        unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
index 7e9f1f170d5f6bdcedd0585f7263cc053e69b78b..673c3a78aae258cbb7fe9e32d8fca8e86ba0750f 100644 (file)
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -80,7 +80,7 @@ int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
 
 void bch2_trans_commit_hook(struct btree_trans *,
                            struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *);
+int __bch2_trans_commit(struct btree_trans *, unsigned);
 
 int bch2_trans_log_msg(struct btree_trans *, const char *, ...);
 int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
@@ -101,9 +101,8 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
 {
        trans->disk_res         = disk_res;
        trans->journal_seq      = journal_seq;
-       trans->flags            = flags;
 
-       return __bch2_trans_commit(trans);
+       return __bch2_trans_commit(trans, flags);
 }
 
 #define commit_do(_trans, _disk_res, _journal_seq, _flags, _do)        \
@@ -154,6 +153,14 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
        trans->nr_updates               = 0;
        trans->hooks                    = NULL;
        trans->extra_journal_entries.nr = 0;
+
+       if (trans->fs_usage_deltas) {
+               trans->fs_usage_deltas->used = 0;
+               memset((void *) trans->fs_usage_deltas +
+                      offsetof(struct replicas_delta_list, memset_start), 0,
+                      (void *) &trans->fs_usage_deltas->memset_end -
+                      (void *) &trans->fs_usage_deltas->memset_start);
+       }
 }
 
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
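
Resetting fs_usage_deltas moves from bch2_trans_begin() and __bch2_trans_commit() into the shared bch2_trans_reset_updates(). The memset spans everything between two zero-size marker members of struct replicas_delta_list, so fields can be added to the cleared region without touching the reset code. A standalone sketch of the marker idiom (zero-size structs are a GNU C extension, as gcc/clang accept):

    #include <stddef.h>
    #include <string.h>

    struct deltas {
        unsigned    size;           /* deliberately survives the reset */
        unsigned    used;
        struct {}   memset_start;   /* zero-size marker */
        long        counters[8];
        unsigned    flags;
        struct {}   memset_end;     /* zero-size marker */
    };

    static void reset_deltas(struct deltas *d)
    {
        d->used = 0;
        /* One memset covers every member between the markers: */
        memset((char *) d + offsetof(struct deltas, memset_start), 0,
               (char *) &d->memset_end - (char *) &d->memset_start);
    }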
index a49e7b6b416d95355dc3a929a6c9e6b4f00e03b3..09aeee06a137a4d1bc28e70af9bc70386dde405b 100644 (file)
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -2032,7 +2032,7 @@ void async_btree_node_rewrite_work(struct work_struct *work)
 
        bch2_trans_do(c, NULL, NULL, 0,
                      async_btree_node_rewrite_trans(&trans, a));
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
        kfree(a);
 }
 
@@ -2040,12 +2040,12 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
        struct async_btree_rewrite *a;
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite))
                return;
 
        a = kmalloc(sizeof(*a), GFP_NOFS);
        if (!a) {
-               percpu_ref_put(&c->writes);
+               bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
                return;
        }
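
percpu_ref_tryget_live(&c->writes) becomes bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite), with bch2_write_ref_put() to match: each user of the filesystem-wide write reference now identifies itself by enum, the point being attribution when a shutdown stalls waiting for writes to drain. The shape of such an API, sketched with one plain atomic counter per enum value (simplified; this ignores the tryget-vs-shutdown race a real implementation has to close):

    #include <stdatomic.h>
    #include <stdbool.h>

    enum write_ref {
        WRITE_REF_trans,
        WRITE_REF_node_rewrite,
        WRITE_REF_NR,
    };

    struct fs {
        atomic_bool going_ro;
        atomic_long write_refs[WRITE_REF_NR];    /* one counter per user */
    };

    static bool write_ref_tryget(struct fs *c, enum write_ref ref)
    {
        if (atomic_load(&c->going_ro))
            return false;
        atomic_fetch_add(&c->write_refs[ref], 1);
        return true;
    }

    static void write_ref_put(struct fs *c, enum write_ref ref)
    {
        atomic_fetch_sub(&c->write_refs[ref], 1);
    }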
 
@@ -2102,7 +2102,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
 
                btree_path_set_level_up(trans, iter2.path);
 
-               bch2_btree_path_check_sort(trans, iter2.path, 0);
+               trans->paths_sorted = false;
 
                ret   = bch2_btree_iter_traverse(&iter2) ?:
                        bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
index a2b37dd45cf8f6d5132a9a9d224c2f5fc1a3c679..f01a2e90a4ec361eccb9a7a8bf386e9b885020e3 100644 (file)
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+       struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+       if (k.k && bpos_eq(path->pos, k.k->p))
+               return k;
+
+       bkey_init(u);
+       u->p = path->pos;
+       return (struct bkey_s_c) { u, NULL };
+}
+
 static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct bch_fs *c = trans->c;
        struct bkey u;
-       struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
+       struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
 
        if (unlikely(trans->journal_replay_not_finished)) {
                struct bkey_i *j_k =
@@ -314,17 +330,15 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
 }
 
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
-                                            unsigned flags)
+                                                     unsigned flags)
 {
        return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
-                                   trans->journal_u64s,
-                                   flags|
-                                   (trans->flags & JOURNAL_WATERMARK_MASK));
+                                   trans->journal_u64s, flags);
 }
 
 #define JSET_ENTRY_LOG_U64s            4
 
-static void journal_transaction_name(struct btree_trans *trans)
+static noinline void journal_transaction_name(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
@@ -349,9 +363,8 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
        return 0;
 }
 
-static int btree_key_can_insert_cached(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      unsigned u64s)
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+                                      struct btree_path *path, unsigned u64s)
 {
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck = (void *) path->l[0].b;
@@ -363,7 +376,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans,
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
            bch2_btree_key_cache_must_wait(c) &&
-           !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
+           !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
                return -BCH_ERR_btree_insert_need_journal_reclaim;
 
        /*
@@ -573,7 +586,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
 }
 
 static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans,
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                               struct btree_insert_entry **stopped_at,
                               unsigned long trace_ip)
 {
@@ -613,7 +626,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                u64s += i->k->k.u64s;
                ret = !i->cached
                        ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
-                       : btree_key_can_insert_cached(trans, i->path, u64s);
+                       : btree_key_can_insert_cached(trans, flags, i->path, u64s);
                if (ret) {
                        *stopped_at = i;
                        return ret;
@@ -627,13 +640,15 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
         * Don't get journal reservation until after we know insert will
         * succeed:
         */
-       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
                ret = bch2_trans_journal_res_get(trans,
+                               (flags & JOURNAL_WATERMARK_MASK)|
                                JOURNAL_RES_GET_NONBLOCK);
                if (ret)
                        return ret;
 
-               journal_transaction_name(trans);
+               if (unlikely(trans->journal_transaction_names))
+                       journal_transaction_name(trans);
        } else {
                trans->journal_res.seq = c->journal.replay_journal_seq;
        }
@@ -644,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
         */
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+           !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
                if (bch2_journal_seq_verify)
                        trans_for_each_update(trans, i)
                                i->k->k.version.lo = trans->journal_res.seq;
@@ -679,7 +694,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                trans->journal_res.u64s         -= trans->extra_journal_entries.nr;
        }
 
-       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
                trans_for_each_update(trans, i) {
                        struct journal *j = &c->journal;
                        struct jset_entry *entry;
@@ -687,14 +702,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                        if (i->key_cache_already_flushed)
                                continue;
 
+                       if (i->flags & BTREE_UPDATE_NOJOURNAL)
+                               continue;
+
                        verify_update_old_key(trans, i);
 
-                       entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                              BCH_JSET_ENTRY_overwrite,
-                                              i->btree_id, i->level,
-                                              i->old_k.u64s);
-                       bkey_reassemble(&entry->start[0],
-                                       (struct bkey_s_c) { &i->old_k, i->old_v });
+                       if (trans->journal_transaction_names) {
+                               entry = bch2_journal_add_entry(j, &trans->journal_res,
+                                                      BCH_JSET_ENTRY_overwrite,
+                                                      i->btree_id, i->level,
+                                                      i->old_k.u64s);
+                               bkey_reassemble(&entry->start[0],
+                                               (struct bkey_s_c) { &i->old_k, i->old_v });
+                       }
 
                        entry = bch2_journal_add_entry(j, &trans->journal_res,
                                               BCH_JSET_ENTRY_btree_keys,
@@ -713,7 +733,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                if (!i->cached)
                        btree_insert_key_leaf(trans, i);
                else if (!i->key_cache_already_flushed)
-                       bch2_btree_insert_key_cached(trans, i->path, i->k);
+                       bch2_btree_insert_key_cached(trans, flags, i->path, i->k);
                else {
                        bch2_btree_key_cache_drop(trans, i->path);
                        btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
@@ -762,12 +782,12 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
 }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
                                                   struct btree_insert_entry *i,
                                                   struct printbuf *err)
 {
        struct bch_fs *c = trans->c;
-       int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
+       int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
 
        printbuf_reset(err);
        prt_printf(err, "invalid bkey on insert from %s -> %ps",
@@ -793,7 +813,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
-static inline int do_bch2_trans_commit(struct btree_trans *trans,
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
                                       struct btree_insert_entry **stopped_at,
                                       unsigned long trace_ip)
 {
@@ -804,11 +824,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans_for_each_update(trans, i) {
-               int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
+               int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
 
                if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
                                               i->bkey_type, rw, &buf)))
-                       return bch2_trans_commit_bkey_invalid(trans, i, &buf);
+                       return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
                btree_insert_entry_checks(trans, i);
        }
 #endif
@@ -824,7 +844,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                if (!same_leaf_as_next(trans, i)) {
                        if (u64s_delta <= 0) {
                                ret = bch2_foreground_maybe_merge(trans, i->path,
-                                                       i->level, trans->flags);
+                                                       i->level, flags);
                                if (unlikely(ret))
                                        return ret;
                        }
@@ -835,8 +855,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 
        ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
-                       JOURNAL_RES_GET_NONBLOCK|
-                       (trans->flags & JOURNAL_WATERMARK_MASK));
+                       (flags & JOURNAL_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
        if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
                ret = bch2_trans_journal_preres_get_cold(trans,
                                                trans->journal_preres_u64s, trace_ip);
@@ -847,7 +866,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        if (unlikely(ret))
                return ret;
 
-       ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
+       ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
 
        if (!ret && unlikely(trans->journal_replay_not_finished))
                bch2_drop_overwrites_from_journal(trans);
@@ -886,7 +905,7 @@ static int journal_reclaim_wait_done(struct bch_fs *c)
 }
 
 static noinline
-int bch2_trans_commit_error(struct btree_trans *trans,
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                            struct btree_insert_entry *i,
                            int ret, unsigned long trace_ip)
 {
@@ -894,7 +913,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
 
        switch (ret) {
        case -BCH_ERR_btree_insert_btree_node_full:
-               ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
+               ret = bch2_btree_split_leaf(trans, i->path, flags);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
                break;
@@ -912,8 +931,8 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        case -BCH_ERR_journal_res_get_blocked:
                bch2_trans_unlock(trans);
 
-               if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
-                   !(trans->flags & JOURNAL_WATERMARK_reserved)) {
+               if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+                   !(flags & JOURNAL_WATERMARK_reserved)) {
                        ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        break;
                }
@@ -948,20 +967,20 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
 
        bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-                               !(trans->flags & BTREE_INSERT_NOWAIT) &&
-                               (trans->flags & BTREE_INSERT_NOFAIL), c,
+                               !(flags & BTREE_INSERT_NOWAIT) &&
+                               (flags & BTREE_INSERT_NOFAIL), c,
                "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
 
        return ret;
 }
 
 static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
 {
        struct bch_fs *c = trans->c;
        int ret;
 
-       if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+       if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
            test_bit(BCH_FS_STARTED, &c->flags))
                return -BCH_ERR_erofs_trans_commit;
 
@@ -972,7 +991,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        if (ret)
                return ret;
 
-       percpu_ref_get(&c->writes);
+       bch2_write_ref_get(c, BCH_WRITE_REF_trans);
        return 0;
 }
 
@@ -997,7 +1016,7 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
        return ret;
 }
 
-int __bch2_trans_commit(struct btree_trans *trans)
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i = NULL;
@@ -1008,7 +1027,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
            !trans->extra_journal_entries.nr)
                goto out_reset;
 
-       if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+       if (flags & BTREE_INSERT_GC_LOCK_HELD)
                lockdep_assert_held(&c->gc_lock);
 
        ret = bch2_trans_commit_run_triggers(trans);
@@ -1020,9 +1039,9 @@ int __bch2_trans_commit(struct btree_trans *trans)
                goto out_reset;
        }
 
-       if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
-           unlikely(!percpu_ref_tryget_live(&c->writes))) {
-               ret = bch2_trans_commit_get_rw_cold(trans);
+       if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+           unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+               ret = bch2_trans_commit_get_rw_cold(trans, flags);
                if (ret)
                        goto out_reset;
        }
@@ -1034,8 +1053,10 @@ int __bch2_trans_commit(struct btree_trans *trans)
        trans->journal_u64s             = trans->extra_journal_entries.nr;
        trans->journal_preres_u64s      = 0;
 
-       /* For journalling transaction name: */
-       trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+       trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+
+       if (trans->journal_transaction_names)
+               trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
 
        trans_for_each_update(trans, i) {
                EBUG_ON(!i->path->should_be_locked);
@@ -1052,27 +1073,32 @@ int __bch2_trans_commit(struct btree_trans *trans)
                /* we're going to journal the key being updated: */
                u64s = jset_u64s(i->k->k.u64s);
                if (i->cached &&
-                   likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
+                   likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
                        trans->journal_preres_u64s += u64s;
+
+               if (i->flags & BTREE_UPDATE_NOJOURNAL)
+                       continue;
+
                trans->journal_u64s += u64s;
 
                /* and we're also going to log the overwrite: */
-               trans->journal_u64s += jset_u64s(i->old_k.u64s);
+               if (trans->journal_transaction_names)
+                       trans->journal_u64s += jset_u64s(i->old_k.u64s);
        }
 
        if (trans->extra_journal_res) {
                ret = bch2_disk_reservation_add(c, trans->disk_res,
                                trans->extra_journal_res,
-                               (trans->flags & BTREE_INSERT_NOFAIL)
+                               (flags & BTREE_INSERT_NOFAIL)
                                ? BCH_DISK_RESERVATION_NOFAIL : 0);
                if (ret)
                        goto err;
        }
 retry:
-       EBUG_ON(trans->restarted);
+       bch2_trans_verify_not_in_restart(trans);
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
-       ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
+       ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
 
        /* make sure we didn't drop or screw up locks: */
        bch2_trans_verify_locks(trans);
@@ -1084,22 +1110,14 @@ retry:
 out:
        bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
-       if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
-               percpu_ref_put(&c->writes);
+       if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+               bch2_write_ref_put(c, BCH_WRITE_REF_trans);
 out_reset:
        bch2_trans_reset_updates(trans);
 
-       if (trans->fs_usage_deltas) {
-               trans->fs_usage_deltas->used = 0;
-               memset((void *) trans->fs_usage_deltas +
-                      offsetof(struct replicas_delta_list, memset_start), 0,
-                      (void *) &trans->fs_usage_deltas->memset_end -
-                      (void *) &trans->fs_usage_deltas->memset_start);
-       }
-
        return ret;
 err:
-       ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_);
+       ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
        if (ret)
                goto out;
 
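
For reference, the journal-space accounting the hunks above converge on: jset_u64s() sizes a key plus its struct jset_entry header, updates flagged BTREE_UPDATE_NOJOURNAL contribute nothing, and the transaction-name entry and overwrite logging are only reserved when journal_transaction_names is enabled. A condensed sketch of the arithmetic, using only names visible in this diff (the helper itself is hypothetical; preres accounting for cached keys is omitted):

    /* Hypothetical condensation of the journal_u64s accounting above. */
    static unsigned commit_journal_u64s(struct btree_trans *trans, bool txn_names)
    {
            struct btree_insert_entry *i;
            unsigned u64s = trans->extra_journal_entries.nr;

            if (txn_names)                          /* transaction name log entry */
                    u64s += jset_u64s(JSET_ENTRY_LOG_U64s);

            trans_for_each_update(trans, i) {
                    if (i->flags & BTREE_UPDATE_NOJOURNAL)
                            continue;               /* journalling skipped entirely */
                    u64s += jset_u64s(i->k->k.u64s);  /* the key being inserted */
                    if (txn_names)
                            u64s += jset_u64s(i->old_k.u64s); /* plus the overwrite */
            }
            return u64s;
    }
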
@@ -1152,12 +1170,63 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
        return __check_pos_snapshot_overwritten(trans, id, pos);
 }
 
+static noinline int extent_front_merge(struct btree_trans *trans,
+                                      struct btree_iter *iter,
+                                      struct bkey_s_c k,
+                                      struct bkey_i **insert,
+                                      enum btree_update_flags flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i *update;
+       int ret;
+
+       update = bch2_bkey_make_mut(trans, k);
+       ret = PTR_ERR_OR_ZERO(update);
+       if (ret)
+               return ret;
+
+       if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+               return 0;
+
+       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
+               check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               return 0;
+
+       ret = bch2_btree_delete_at(trans, iter, flags);
+       if (ret)
+               return ret;
+
+       *insert = update;
+       return 0;
+}
+
+static noinline int extent_back_merge(struct btree_trans *trans,
+                                     struct btree_iter *iter,
+                                     struct bkey_i *insert,
+                                     struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       int ret;
+
+       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
+               check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               return 0;
+
+       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+       return 0;
+}
+
 int bch2_trans_update_extent(struct btree_trans *trans,
                             struct btree_iter *orig_iter,
                             struct bkey_i *insert,
                             enum btree_update_flags flags)
 {
-       struct bch_fs *c = trans->c;
        struct btree_iter iter, update_iter;
        struct bpos start = bkey_start_pos(&insert->k);
        struct bkey_i *update;
@@ -1175,46 +1244,15 @@ int bch2_trans_update_extent(struct btree_trans *trans,
        if (!k.k)
                goto out;
 
-       if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
-               /*
-                * We can't merge extents if they belong to interior snapshot
-                * tree nodes, and there's a snapshot in which one extent is
-                * visible and the other is not - i.e. if visibility is
-                * different.
-                *
-                * Instead of checking if visibilitiy of the two extents is
-                * different, for now we just check if either has been
-                * overwritten:
-                */
-               ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
-               if (ret < 0)
-                       goto err;
-               if (ret)
-                       goto nomerge1;
-
-               ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
-               if (ret < 0)
-                       goto err;
-               if (ret)
-                       goto nomerge1;
-
-               update = bch2_bkey_make_mut(trans, k);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       goto err;
-
-               if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) {
-                       ret = bch2_btree_delete_at(trans, &iter, flags);
+       if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+               if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+                       ret = extent_front_merge(trans, &iter, k, &insert, flags);
                        if (ret)
                                goto err;
-
-                       insert = update;
-                       goto next;
                }
-       }
-nomerge1:
-       ret = 0;
-       if (bkey_eq(k.k->p, start))
+
                goto next;
+       }
 
        while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
                bool front_split = bkey_lt(bkey_start_pos(k.k), start);
@@ -1323,22 +1361,10 @@ next:
        }
 
        if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
-               ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
-               if (ret < 0)
-                       goto err;
+               ret = extent_back_merge(trans, &iter, insert, k);
                if (ret)
-                       goto nomerge2;
-
-               ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
-               if (ret < 0)
                        goto err;
-               if (ret)
-                       goto nomerge2;
-
-               bch2_bkey_merge(c, bkey_i_to_s(insert), k);
        }
-nomerge2:
-       ret = 0;
 out:
        if (!bkey_deleted(&insert->k)) {
                /*
@@ -1476,7 +1502,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa
                array_insert_item(trans->updates, trans->nr_updates,
                                  i - trans->updates, n);
 
-               i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v;
+               i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
                i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
 
                if (unlikely(trans->journal_replay_not_finished)) {
@@ -1499,7 +1525,9 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa
         * the key cache - but the key has to exist in the btree for that to
         * work:
         */
-       if (unlikely(path->cached && bkey_deleted(&i->old_k)))
+       if (path->cached &&
+           bkey_deleted(&i->old_k) &&
+           !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY))
                return flush_new_cached_update(trans, path, i, flags, ip);
 
        return 0;
@@ -1671,18 +1699,10 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                 */
                delete.k.p = iter.pos;
 
-               if (iter.flags & BTREE_ITER_IS_EXTENTS) {
-                       unsigned max_sectors =
-                               KEY_SIZE_MAX & (~0 << trans->c->block_bits);
-
-                       /* create the biggest key we can */
-                       bch2_key_resize(&delete.k, max_sectors);
-                       bch2_cut_back(end, &delete);
-
-                       ret = bch2_extent_trim_atomic(trans, &iter, &delete);
-                       if (ret)
-                               goto err;
-               }
+               if (iter.flags & BTREE_ITER_IS_EXTENTS)
+                       bch2_key_resize(&delete.k,
+                                       bpos_min(end, k.k->p).offset -
+                                       iter.pos.offset);
 
                ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
                        bch2_trans_commit(trans, &disk_res, journal_seq,
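
A worked example of the new whiteout sizing above, now that the old create-the-biggest-key-then-trim approach is gone (numbers are illustrative):

    /* With iter.pos.offset == 100, the extent under the iterator ending
     * at k.k->p.offset == 150, and the delete range ending at
     * end.offset == 130:
     *
     *     bpos_min(end, k.k->p).offset - iter.pos.offset == 130 - 100 == 30
     *
     * so the whiteout covers exactly the 30-sector overlap [100, 130),
     * and no separate bch2_extent_trim_atomic() pass is needed. */
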
index 428c2be670a1887a0c2c538e911769b088b0833f..ac0328c4f2c18c6c7b22641cf1f3a45267608385 100644 (file)
@@ -137,23 +137,28 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
 {
        struct bch_fs_usage_online *ret;
-       unsigned seq, i, u64s;
+       unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1;
+retry:
+       ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+       if (unlikely(!ret))
+               return NULL;
 
        percpu_down_read(&c->mark_lock);
 
-       ret = kmalloc(sizeof(struct bch_fs_usage_online) +
-                     sizeof(u64) * c->replicas.nr, GFP_NOFS);
-       if (unlikely(!ret)) {
+       v = fs_usage_u64s(c) + 1;
+       if (unlikely(u64s != v)) {
+               u64s = v;
                percpu_up_read(&c->mark_lock);
-               return NULL;
+               kfree(ret);
+               goto retry;
        }
 
        ret->online_reserved = percpu_u64_get(c->online_reserved);
 
-       u64s = fs_usage_u64s(c);
        do {
                seq = read_seqcount_begin(&c->usage_lock);
-               memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
+               unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64),
+                             "embedded variable length struct");
                for (i = 0; i < ARRAY_SIZE(c->usage); i++)
                        acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
        } while (read_seqcount_retry(&c->usage_lock, seq));
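
The hunk above converts bch2_fs_usage_read() to the allocate-then-recheck pattern: the buffer is sized and allocated before mark_lock is taken (the allocation may sleep), and if fs_usage_u64s() changed in the meantime the buffer is freed and the allocation retried. A generic sketch of the idiom; all names here are illustrative, not bcachefs API:

    #include <linux/mutex.h>
    #include <linux/slab.h>

    struct usage_example {
            struct mutex    lock;
            size_t          nr;             /* current number of u64 counters */
            u64             *counters;
    };

    static u64 *usage_example_read(struct usage_example *u)
    {
            size_t nr = READ_ONCE(u->nr);
            u64 *buf;
    retry:
            buf = kmalloc_array(nr, sizeof(u64), GFP_NOFS); /* may sleep: no lock held */
            if (!buf)
                    return NULL;

            mutex_lock(&u->lock);
            if (nr != u->nr) {                      /* resized while we allocated */
                    nr = u->nr;
                    mutex_unlock(&u->lock);
                    kfree(buf);
                    goto retry;
            }
            memcpy(buf, u->counters, nr * sizeof(u64));
            mutex_unlock(&u->lock);
            return buf;
    }
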
@@ -1203,17 +1208,23 @@ not_found:
                     "  missing range %llu-%llu",
                     (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
                     *idx, next_idx)) {
-               struct bkey_i_error new;
-
-               bkey_init(&new.k);
-               new.k.type      = KEY_TYPE_error;
-               new.k.p         = bkey_start_pos(p.k);
-               new.k.p.offset += *idx - start;
-               bch2_key_resize(&new.k, next_idx - *idx);
-               ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i);
+               struct bkey_i_error *new;
+
+               new = bch2_trans_kmalloc(trans, sizeof(*new));
+               ret = PTR_ERR_OR_ZERO(new);
+               if (ret)
+                       goto err;
+
+               bkey_init(&new->k);
+               new->k.type     = KEY_TYPE_error;
+               new->k.p                = bkey_start_pos(p.k);
+               new->k.p.offset += *idx - start;
+               bch2_key_resize(&new->k, next_idx - *idx);
+               ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i);
        }
 
        *idx = next_idx;
+err:
 fsck_err:
        printbuf_exit(&buf);
        return ret;
@@ -1258,36 +1269,6 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
        return ret;
 }
 
-static noinline __cold
-void fs_usage_apply_warn(struct btree_trans *trans,
-                        unsigned disk_res_sectors,
-                        s64 should_not_have_added)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i;
-       struct printbuf buf = PRINTBUF;
-
-       prt_printf(&buf,
-                  bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"),
-                  should_not_have_added, disk_res_sectors);
-
-       trans_for_each_update(trans, i) {
-               struct bkey_s_c old = { &i->old_k, i->old_v };
-
-               prt_str(&buf, "new ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
-               prt_newline(&buf);
-
-               prt_str(&buf, "old ");
-               bch2_bkey_val_to_text(&buf, c, old);
-               prt_newline(&buf);
-       }
-
-       __WARN();
-       bch2_print_string_as_lines(KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-}
-
 int bch2_trans_fs_usage_apply(struct btree_trans *trans,
                              struct replicas_delta_list *deltas)
 {
@@ -1352,7 +1333,9 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
        percpu_up_read(&c->mark_lock);
 
        if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
-               fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
+               bch2_trans_inconsistent(trans,
+                                       "disk usage increased %lli more than %u sectors reserved",
+                                       should_not_have_added, disk_res_sectors);
        return 0;
 need_mark:
        /* revert changes: */
index 0f4ef9e5a431f4f84e80a347196a03f7d71a8946..f3774e30b5cdf12b4bf85c6a9fc8dfc9e367360a 100644 (file)
@@ -2,28 +2,24 @@
 
 #include "bcachefs.h"
 #include "buckets_waiting_for_journal.h"
+#include <linux/hash.h>
 #include <linux/random.h>
-#include <linux/siphash.h>
 
 static inline struct bucket_hashed *
 bucket_hash(struct buckets_waiting_for_journal_table *t,
            unsigned hash_seed_idx, u64 dev_bucket)
 {
-       unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]);
-
-       EBUG_ON(!is_power_of_2(t->size));
-
-       return t->d + (h & (t->size - 1));
+       return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
 }
 
-static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size)
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
 {
        unsigned i;
 
-       t->size = size;
+       t->bits = bits;
        for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
                get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
-       memset(t->d, 0, sizeof(t->d[0]) * size);
+       memset(t->d, 0, sizeof(t->d[0]) << t->bits);
 }
 
 bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
@@ -97,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
                .dev_bucket     = (u64) dev << 56 | bucket,
                .journal_seq    = journal_seq,
        };
-       size_t i, new_size, nr_elements = 1, nr_rehashes = 0;
+       size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
        int ret = 0;
 
        mutex_lock(&b->lock);
@@ -106,12 +102,13 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
                goto out;
 
        t = b->t;
-       for (i = 0; i < t->size; i++)
+       size = 1UL << t->bits;
+       for (i = 0; i < size; i++)
                nr_elements += t->d[i].journal_seq > flushed_seq;
 
-       new_size = nr_elements < t->size / 3 ? t->size : t->size * 2;
+       new_bits = t->bits + (nr_elements * 3 > size);
 
-       n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL);
+       n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
        if (!n) {
                ret = -ENOMEM;
                goto out;
@@ -119,12 +116,12 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
 
 retry_rehash:
        nr_rehashes++;
-       bucket_table_init(n, new_size);
+       bucket_table_init(n, new_bits);
 
        tmp = new;
        BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
 
-       for (i = 0; i < t->size; i++) {
+       for (i = 0; i < 1UL << t->bits; i++) {
                if (t->d[i].journal_seq <= flushed_seq)
                        continue;
 
@@ -137,7 +134,7 @@ retry_rehash:
        kvfree(t);
 
        pr_debug("took %zu rehashes, table at %zu/%zu elements",
-                nr_rehashes, nr_elements, b->t->size);
+                nr_rehashes, nr_elements, 1UL << b->t->bits);
 out:
        mutex_unlock(&b->lock);
 
@@ -151,7 +148,7 @@ void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
        kvfree(b->t);
 }
 
-#define INITIAL_TABLE_SIZE     8
+#define INITIAL_TABLE_BITS             3
 
 int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
 {
@@ -159,10 +156,11 @@ int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
 
        mutex_init(&b->lock);
 
-       b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL);
+       b->t = kvmalloc(sizeof(*b->t) +
+                       (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
        if (!b->t)
                return -ENOMEM;
 
-       bucket_table_init(b->t, INITIAL_TABLE_SIZE);
+       bucket_table_init(b->t, INITIAL_TABLE_BITS);
        return 0;
 }
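
With the table size now stored as a power-of-two exponent, hash_64() from <linux/hash.h> reduces straight to a table index — hash_64(val, bits) returns a value in [0, 1 << bits) — so the explicit is_power_of_2() assertion and masking go away, and the siphash keys become plain u64 seeds mixed in by XOR. The index computation, restated in isolation:

    #include <linux/hash.h>

    /* Matches bucket_hash() above: hash_64() already truncates its result
     * to 'bits' bits, so no masking against the table size is needed. */
    static inline size_t example_bucket_idx(u64 dev_bucket, u64 seed, unsigned bits)
    {
            return hash_64(dev_bucket ^ seed, bits);
    }
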
index fea7f944d0ed38e61cb4e4fc38319c29d4a913b4..e593db061d81b2ee59f30e99bb219116c494681f 100644 (file)
@@ -10,8 +10,8 @@ struct bucket_hashed {
 };
 
 struct buckets_waiting_for_journal_table {
-       size_t                  size;
-       siphash_key_t           hash_seeds[3];
+       unsigned                bits;
+       u64                     hash_seeds[3];
        struct bucket_hashed    d[];
 };
 
index 7ef7bb613347ea85bda4fafd1ae6ebcad94f0075..9df958b43dfe5a31deefc238aaa602b1a48f13da 100644 (file)
@@ -182,7 +182,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 
                /* Add new ptrs: */
                extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
-                       if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
+                       const struct bch_extent_ptr *existing_ptr =
+                               bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev);
+
+                       if (existing_ptr && existing_ptr->cached) {
+                               /*
+                                * We're replacing a cached pointer with a
+                                * non-cached pointer:
+                                */
+                               bch2_bkey_drop_device_noerror(bkey_i_to_s(insert),
+                                                             existing_ptr->dev);
+                       } else if (existing_ptr) {
                                /*
                                 * raced with another move op? extent already
                                 * has a pointer to the device we just wrote
@@ -253,8 +263,8 @@ nomatch:
                                     &m->ctxt->stats->sectors_raced);
                }
 
-               this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size);
-               trace_move_extent_race(&new->k);
+               this_cpu_add(c->counters[BCH_COUNTER_move_extent_fail], new->k.size);
+               trace_move_extent_fail(&new->k);
 
                bch2_btree_iter_advance(&iter);
                goto next;
@@ -388,17 +398,21 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
        }
 }
 
-int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
+int bch2_data_update_init(struct btree_trans *trans,
+                         struct moving_context *ctxt,
+                         struct data_update *m,
                          struct write_point_specifier wp,
                          struct bch_io_opts io_opts,
                          struct data_update_opts data_opts,
                          enum btree_id btree_id,
                          struct bkey_s_c k)
 {
+       struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+       unsigned int ptrs_locked = 0;
        int ret;
 
        bch2_bkey_buf_init(&m->k);
@@ -424,11 +438,14 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
 
        i = 0;
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               bool locked;
+
                if (((1U << i) & m->data_opts.rewrite_ptrs) &&
                    p.ptr.cached)
                        BUG();
 
-               if (!((1U << i) & m->data_opts.rewrite_ptrs))
+               if (!((1U << i) & m->data_opts.rewrite_ptrs) &&
+                   !p.ptr.cached)
                        bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
 
                if (((1U << i) & m->data_opts.rewrite_ptrs) &&
@@ -448,10 +465,24 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
                if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
                        m->op.incompressible = true;
 
+               if (ctxt) {
+                       move_ctxt_wait_event(ctxt, trans,
+                                       (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
+                                                                 PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+                                       !atomic_read(&ctxt->read_sectors));
+
+                       if (!locked)
+                               bch2_bucket_nocow_lock(&c->nocow_locks,
+                                                      PTR_BUCKET_POS(c, &p.ptr), 0);
+               } else {
+                       if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
+                                                      PTR_BUCKET_POS(c, &p.ptr), 0)) {
+                               ret = -BCH_ERR_nocow_lock_blocked;
+                               goto err;
+                       }
+               }
+               ptrs_locked |= (1U << i);
                i++;
-
-               bch2_bucket_nocow_lock(&c->nocow_locks,
-                                      PTR_BUCKET_POS(c, &p.ptr), 0);
        }
 
        if (reserve_sectors) {
@@ -473,9 +504,13 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
                return -BCH_ERR_unwritten_extent_update;
        return 0;
 err:
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               bch2_bucket_nocow_unlock(&c->nocow_locks,
-                                      PTR_BUCKET_POS(c, &p.ptr), 0);
+       i = 0;
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               if ((1U << i) & ptrs_locked)
+                       bch2_bucket_nocow_unlock(&c->nocow_locks,
+                                               PTR_BUCKET_POS(c, &p.ptr), 0);
+               i++;
+       }
 
        bch2_bkey_buf_exit(&m->k, c);
        bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
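
The reworked error path above is a bitmask variant of lock-all-or-unwind: every nocow bucket lock successfully taken sets a bit in ptrs_locked, and on failure the pointers are walked again, unlocking only the buckets whose bit is set (the old code unconditionally unlocked buckets it might never have locked). A condensed sketch of the idiom with stand-in spinlocks — not bcachefs API, and assuming nr <= 32:

    #include <linux/spinlock.h>

    static int trylock_all_or_none(spinlock_t *locks, unsigned nr)
    {
            unsigned i, locked = 0;

            for (i = 0; i < nr; i++) {
                    if (!spin_trylock(&locks[i]))
                            goto err;
                    locked |= 1U << i;      /* remember exactly what we hold */
            }
            return 0;
    err:
            while (i--)
                    if (locked & (1U << i))
                            spin_unlock(&locks[i]);
            return -EAGAIN;
    }
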
index f304c33662266ec1edf563c7c424ef9c250ff1d1..49e9055cbb5262a642532ea75856b1df10e5101f 100644 (file)
@@ -33,7 +33,8 @@ void bch2_data_update_read_done(struct data_update *,
 
 void bch2_data_update_exit(struct data_update *);
 void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
-int bch2_data_update_init(struct bch_fs *, struct data_update *,
+int bch2_data_update_init(struct btree_trans *, struct moving_context *,
+                         struct data_update *,
                          struct write_point_specifier,
                          struct bch_io_opts, struct data_update_opts,
                          enum btree_id, struct bkey_s_c);
index 3c2f008d23f82be2dcc6c30feab025217deee103..fcefd55a5322474405ae9558a7008439aa51e309 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/console.h>
 #include <linux/debugfs.h>
 #include <linux/module.h>
-#include <linux/pretty-printers.h>
 #include <linux/random.h>
 #include <linux/seq_file.h>
 
@@ -40,7 +39,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
        struct bset *sorted, *inmemory = &b->data->keys;
        struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
        struct bio *bio;
-       bool failed = false;
+       bool failed = false, saw_error = false;
 
        if (!bch2_dev_get_ioref(ca, READ))
                return false;
@@ -61,7 +60,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
        memcpy(n_ondisk, n_sorted, btree_bytes(c));
 
        v->written = 0;
-       if (bch2_btree_node_read_done(c, ca, v, false))
+       if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
                return false;
 
        n_sorted = c->verify_data->data;
@@ -501,6 +500,7 @@ static const struct file_operations cached_btree_nodes_ops = {
        .read           = bch2_cached_btree_nodes_read,
 };
 
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
 static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
                                            size_t size, loff_t *ppos)
 {
@@ -520,7 +520,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 
                ret = flush_buf(i);
                if (ret)
-                       return ret;
+                       break;
 
                bch2_btree_trans_to_text(&i->buf, trans);
 
@@ -550,6 +550,7 @@ static const struct file_operations btree_transactions_ops = {
        .release        = bch2_dump_release,
        .read           = bch2_btree_transactions_read,
 };
+#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
 
 static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
                                      size_t size, loff_t *ppos)
@@ -710,7 +711,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 
                ret = flush_buf(i);
                if (ret)
-                       return ret;
+                       break;
 
                bch2_check_for_deadlock(trans, &i->buf);
 
@@ -756,8 +757,10 @@ void bch2_fs_debug_init(struct bch_fs *c)
        debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
                            c->btree_debug, &cached_btree_nodes_ops);
 
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
        debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
                            c->btree_debug, &btree_transactions_ops);
+#endif
 
        debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
                            c->btree_debug, &journal_pins_ops);
index f1838b7c45eeafd752f6ed229e97dc5d445c9771..4c85d3399fb4cf4b93f5467ec50c8d409c05b25d 100644 (file)
@@ -84,7 +84,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
 };
 
 int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                       int rw, struct printbuf *err)
+                       unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
        unsigned len;
index 1a2c9108f864ee5eee32472ef4b133f1030f8786..ad131e8edc29649da8a544f71d95740a13bfa480 100644 (file)
@@ -6,7 +6,7 @@
 
 extern const struct bch_hash_desc bch2_dirent_hash_desc;
 
-int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_dirent ((struct bkey_ops) {      \
index c234c8d5d6a311bbcda9848e98970ca59b6ba309..9f5a27dab351d7c67cd0f9495ba0d8dffbd71454 100644 (file)
@@ -104,7 +104,7 @@ struct ec_bio {
 /* Stripes btree keys: */
 
 int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                       int rw, struct printbuf *err)
+                       unsigned flags, struct printbuf *err)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
 
@@ -130,7 +130,7 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
-       return bch2_bkey_ptrs_invalid(c, k, rw, err);
+       return bch2_bkey_ptrs_invalid(c, k, flags, err);
 }
 
 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -673,9 +673,8 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 
        heap_verify_backpointer(c, idx);
 
-       if (stripe_idx_to_delete(c) >= 0 &&
-           !percpu_ref_is_dying(&c->writes))
-               schedule_work(&c->ec_stripe_delete_work);
+       if (stripe_idx_to_delete(c) >= 0)
+               bch2_do_stripe_deletes(c);
 }
 
 /* stripe deletion */
@@ -708,6 +707,15 @@ static void ec_stripe_delete_work(struct work_struct *work)
                if (ec_stripe_delete(c, idx))
                        break;
        }
+
+       bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+void bch2_do_stripe_deletes(struct bch_fs *c)
+{
+       if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+           !schedule_work(&c->ec_stripe_delete_work))
+               bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
 
 /* stripe creation: */
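
bch2_do_stripe_deletes() above is the ref-guarded deferred-work idiom: the write ref pins the filesystem writable until the worker runs, and because schedule_work() returns false when the item is already pending — in which case the earlier caller's ref still covers the single upcoming execution — the surplus ref is dropped on the spot. Restated for emphasis (this mirrors the diff rather than adding behaviour):

    void kick_stripe_deletes(struct bch_fs *c)
    {
            if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
                !schedule_work(&c->ec_stripe_delete_work))
                    /* already queued; that execution owns the other ref */
                    bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
    }
    /* ...and ec_stripe_delete_work() puts its ref when it finishes. */
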
@@ -965,7 +973,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        BUG_ON(!s->allocated);
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_create))
                goto err;
 
        ec_generate_ec(&s->new_stripe);
@@ -1003,7 +1011,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
        spin_unlock(&c->ec_stripes_heap_lock);
 err_put_writes:
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
 err:
        bch2_disk_reservation_put(c, &s->res);
 
index aba1e82bc889f7e7c6d878d7831fb0dc0c99b20a..d47da7d86fe7d4162a21cb85405c90da43c36971 100644 (file)
@@ -4,9 +4,10 @@
 
 #include "ec_types.h"
 #include "buckets_types.h"
+#include "extents_types.h"
 
 int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
-                       int rw, struct printbuf *);
+                       unsigned, struct printbuf *);
 void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
                         struct bkey_s_c);
 
@@ -206,6 +207,8 @@ void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
 void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
 void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
 
+void bch2_do_stripe_deletes(struct bch_fs *);
+
 void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
 
 void bch2_ec_flush_new_stripes(struct bch_fs *);
index 6217096494e6b085b65c7440363dd16774681ec1..b8c24f517f0316d80b70d39464b407973605ccf6 100644 (file)
        x(BCH_ERR_invalid_sb,           invalid_sb_clean)                       \
        x(BCH_ERR_invalid_sb,           invalid_sb_quota)                       \
        x(BCH_ERR_invalid,              invalid_bkey)                           \
+       x(BCH_ERR_operation_blocked,    nocow_lock_blocked)                     \
 
 enum bch_errcode {
        BCH_ERR_START           = 2048,
index 3e49d72d65b5e8db791cfa6254529e9217fce4e6..c2882c599896cc694c9e0180bf4abf4e90f05e0a 100644 (file)
@@ -27,8 +27,11 @@ bool bch2_inconsistent_error(struct bch_fs *c)
 
 void bch2_topology_error(struct bch_fs *c)
 {
+       if (!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags))
+               return;
+
        set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
-       if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+       if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
                bch2_inconsistent_error(c);
 }
 
index dae72620dae36e4bc8b23298010766f9e2b074c3..9991879dfbff12bb36119646d4222423f09bfd2b 100644 (file)
@@ -73,8 +73,8 @@ do {                                                                  \
 #define bch2_trans_inconsistent(trans, ...)                            \
 ({                                                                     \
        bch_err(trans->c, __VA_ARGS__);                                 \
-       bch2_inconsistent_error(trans->c);                              \
        bch2_dump_trans_updates(trans);                                 \
+       bch2_inconsistent_error(trans->c);                              \
 })
 
 #define bch2_trans_inconsistent_on(cond, trans, ...)                   \
index 9b197db78260b0997f056361a179398ee2cce247..d01cec89603bfbaa1963d2995e6349caf5f158bd 100644 (file)
@@ -166,7 +166,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 /* KEY_TYPE_btree_ptr: */
 
 int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          int rw, struct printbuf *err)
+                          unsigned flags, struct printbuf *err)
 {
        if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
                prt_printf(err, "value too big (%zu > %u)",
@@ -174,7 +174,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
-       return bch2_bkey_ptrs_invalid(c, k, rw, err);
+       return bch2_bkey_ptrs_invalid(c, k, flags, err);
 }
 
 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -184,7 +184,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                             int rw, struct printbuf *err)
+                             unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
@@ -207,7 +207,7 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
-       return bch2_bkey_ptrs_invalid(c, k, rw, err);
+       return bch2_bkey_ptrs_invalid(c, k, flags, err);
 }
 
 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
@@ -389,7 +389,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 /* KEY_TYPE_reservation: */
 
 int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                            int rw, struct printbuf *err)
+                            unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
 
@@ -715,7 +715,7 @@ static inline void __extent_entry_insert(struct bkey_i *k,
        memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
                              dst, (u64 *) end - (u64 *) dst);
        k->k.u64s += extent_entry_u64s(new);
-       memcpy(dst, new, extent_entry_bytes(new));
+       memcpy_u64s_small(dst, new, extent_entry_u64s(new));
 }
 
 void bch2_extent_ptr_decoded_append(struct bkey_i *k,
@@ -1086,7 +1086,7 @@ static int extent_ptr_invalid(const struct bch_fs *c,
 }
 
 int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          int rw, struct printbuf *err)
+                          unsigned flags, struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
index e27d39b728b3628f55fb966f18773921189baf03..1d8f3b309b074ba40394e2fd954744a658b4336f 100644 (file)
@@ -371,11 +371,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
 
 /* KEY_TYPE_btree_ptr: */
 
-int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
                              int, struct bkey_s);
@@ -414,7 +414,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 /* KEY_TYPE_reservation: */
 
 int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
-                            int, struct printbuf *);
+                            unsigned, struct printbuf *);
 void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 
@@ -659,7 +659,7 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
-                          int, struct printbuf *);
+                          unsigned, struct printbuf *);
 
 void bch2_ptr_swab(struct bkey_s);
 
index e7ebb01b4d09edc09dedd625c9a6e53166923024..e088bbcccc8d90f67693463ae3316cac91f7b798 100644 (file)
@@ -812,7 +812,7 @@ static void bch2_set_page_dirty(struct bch_fs *c,
        i_sectors_acct(c, inode, &res->quota, dirty_sectors);
 
        if (!PageDirty(page))
-               __set_page_dirty_nobuffers(page);
+               filemap_dirty_folio(inode->v.i_mapping, page_folio(page));
 }
 
 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
@@ -2715,7 +2715,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
         * redirty the full page:
         */
        page_mkclean(page);
-       __set_page_dirty_nobuffers(page);
+       filemap_dirty_folio(mapping, page_folio(page));
 unlock:
        unlock_page(page);
        put_page(page);
@@ -3280,7 +3280,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        long ret;
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
                return -EROFS;
 
        inode_lock(&inode->v);
@@ -3304,7 +3304,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 err:
        bch2_pagecache_block_put(inode);
        inode_unlock(&inode->v);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
 
        return bch2_err_class(ret);
 }
@@ -3448,9 +3448,9 @@ err:
 
 /* fseek: */
 
-static int page_data_offset(struct page *page, unsigned offset)
+static int folio_data_offset(struct folio *folio, unsigned offset)
 {
-       struct bch_page_state *s = bch2_page_state(page);
+       struct bch_page_state *s = bch2_page_state(&folio->page);
        unsigned i;
 
        if (s)
@@ -3481,8 +3481,7 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
                        struct folio *folio = fbatch.folios[i];
 
                        folio_lock(folio);
-
-                       offset = page_data_offset(&folio->page,
+                       offset = folio_data_offset(folio,
                                        folio->index == start_index
                                        ? start_offset & (PAGE_SIZE - 1)
                                        : 0);
@@ -3494,7 +3493,6 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
                                folio_batch_release(&fbatch);
                                return ret;
                        }
-
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
index cbd9b1e783601a2b6591147473b92f1dd99b60e3..c23309f1a1aeac7226e0565865bc3f482b8a83ed 100644 (file)
@@ -667,10 +667,10 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
        qid = inode->ei_qid;
 
        if (attr->ia_valid & ATTR_UID)
-               qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
+               qid.q[QTYP_USR] = from_kuid(mnt_userns, attr->ia_uid);
 
        if (attr->ia_valid & ATTR_GID)
-               qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
+               qid.q[QTYP_GRP] = from_kgid(mnt_userns, attr->ia_gid);
 
        ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
                                     KEY_TYPE_QUOTA_PREALLOC);
@@ -779,18 +779,19 @@ static int bch2_setattr(struct user_namespace *mnt_userns,
 }
 
 static int bch2_tmpfile(struct user_namespace *mnt_userns,
-                       struct inode *vdir, struct dentry *dentry, umode_t mode)
+                       struct inode *vdir, struct file *file, umode_t mode)
 {
        struct bch_inode_info *inode =
-               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+               __bch2_create(mnt_userns, to_bch_ei(vdir),
+                             file->f_path.dentry, mode, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
        if (IS_ERR(inode))
                return bch2_err_class(PTR_ERR(inode));
 
-       d_mark_tmpfile(dentry, &inode->v);
-       d_instantiate(dentry, &inode->v);
-       return 0;
+       d_mark_tmpfile(file, &inode->v);
+       d_instantiate(file->f_path.dentry, &inode->v);
+       return finish_open_simple(file, 0);
 }
 
 static int bch2_fill_extent(struct bch_fs *c,
index 6b91bbe911162466063d00e4ec41889003d28ced..783e77c321f9ffddbc88a4e0a5a1df2e53e39b63 100644 (file)
@@ -18,7 +18,6 @@ struct bch_inode_info {
        struct mutex            ei_update_lock;
        u64                     ei_quota_reserved;
        unsigned long           ei_last_dirtied;
-
        two_state_lock_t        ei_pagecache_lock;
 
        struct mutex            ei_quota_lock;
index 3b71eedb05a4033fcb6fde81796cb773ed145bff..5887d78190eb971176276ef31dea635ba05baa00 100644 (file)
@@ -817,7 +817,7 @@ static int hash_check_key(struct btree_trans *trans,
                goto bad_hash;
 
        for_each_btree_key_norestart(trans, iter, desc.btree_id,
-                                    POS(hash_k.k->p.inode, hash),
+                                    SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
                                     BTREE_ITER_SLOTS, k, ret) {
                if (bkey_eq(k.k->p, hash_k.k->p))
                        break;
index 585d16ac2da21acd34ebf42cbf5af039ebe56351..560545a7ea0399426b0f80cec5e1ebe977479074 100644 (file)
@@ -433,7 +433,7 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
 }
 
 int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                      int rw, struct printbuf *err)
+                      unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 
@@ -453,7 +453,7 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
 
@@ -473,7 +473,7 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
 
@@ -536,7 +536,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 }
 
 int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                 int rw, struct printbuf *err)
+                                 unsigned flags, struct printbuf *err)
 {
        if (k.k->p.inode) {
                prt_printf(err, "nonzero k.p.inode");
@@ -663,19 +663,8 @@ again:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_lt(k.k->p, POS(0, max))) {
-               while (pos < iter->pos.offset) {
-                       if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
-                               goto found_slot;
-
-                       pos++;
-               }
-
-               if (k.k->p.snapshot == snapshot &&
-                   !bkey_is_inode(k.k) &&
-                   !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
-                       bch2_btree_iter_advance(iter);
-                       continue;
-               }
+               if (pos < iter->pos.offset)
+                       goto found_slot;
 
                /*
                 * We don't need to iterate over keys in every snapshot once
@@ -685,12 +674,8 @@ again:
                bch2_btree_iter_set_pos(iter, POS(0, pos));
        }
 
-       while (!ret && pos < max) {
-               if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
-                       goto found_slot;
-
-               pos++;
-       }
+       if (!ret && pos < max)
+               goto found_slot;
 
        if (!ret && start == min)
                ret = -BCH_ERR_ENOSPC_inode_create;
@@ -713,11 +698,6 @@ found_slot:
                return ret;
        }
 
-       /* We may have raced while the iterator wasn't pointing at pos: */
-       if (bkey_is_inode(k.k) ||
-           bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
-               goto again;
-
        *hint                   = k.k->p.offset;
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
@@ -734,11 +714,11 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
        int ret = 0;
 
        /*
-        * We're never going to be deleting extents, no need to use an extent
-        * iterator:
+        * We're never going to be deleting partial extents, no need to use an
+        * extent iterator:
         */
        bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
-                            BTREE_ITER_INTENT);
+                            BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
 
        while (1) {
                bch2_trans_begin(trans);
@@ -760,14 +740,6 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
                bkey_init(&delete.k);
                delete.k.p = iter.pos;
 
-               if (iter.flags & BTREE_ITER_IS_EXTENTS) {
-                       bch2_key_resize(&delete.k, k.k->p.offset - iter.pos.offset);
-
-                       ret = bch2_extent_trim_atomic(trans, &iter, &delete);
-                       if (ret)
-                               goto err;
-               }
-
                ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
                      bch2_trans_commit(trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
@@ -823,8 +795,8 @@ retry:
 
        if (!bkey_is_inode(k.k)) {
                bch2_fs_inconsistent(trans.c,
-                                    "inode %llu not found when deleting",
-                                    inum.inum);
+                                    "inode %llu:%u not found when deleting",
+                                    inum.inum, snapshot);
                ret = -EIO;
                goto err;
        }
index b753e1b254e4679c932f0f96a0e5f1c039e92b72..f5066afb4886c5e67a3fd426020f74596e3b257f 100644 (file)
@@ -7,9 +7,9 @@
 
 extern const char * const bch2_inode_opts[];
 
-int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
-int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
-int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_inode ((struct bkey_ops) {       \
@@ -41,7 +41,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
 }
 
 int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
-                                 int, struct printbuf *);
+                                 unsigned, struct printbuf *);
 void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_inode_generation ((struct bkey_ops) {    \
index d215973ae73b3d9cdf5353509b1c534de0db9232..c0c33f788d44db01c84094e9346994fe18fbf1dc 100644 (file)
@@ -34,6 +34,7 @@
 #include "super-io.h"
 
 #include <linux/blkdev.h>
+#include <linux/prefetch.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
@@ -46,6 +47,8 @@ const char *bch2_blk_status_to_str(blk_status_t status)
        return blk_status_to_str(status);
 }
 
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
 {
        const struct bch_devs_mask *devs;
@@ -134,6 +137,15 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
        __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
 }
 
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       return false;
+}
+
+#endif
+
 /* Allocate, free from mempool: */
 
 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
@@ -242,6 +254,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
        struct btree_iter iter;
        struct bkey_i *k;
        struct bkey_i_inode_v3 *inode;
+       unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
        int ret;
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
@@ -264,15 +277,24 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
        inode = bkey_i_to_inode_v3(k);
 
        if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
-           new_i_size > le64_to_cpu(inode->v.bi_size))
+           new_i_size > le64_to_cpu(inode->v.bi_size)) {
                inode->v.bi_size = cpu_to_le64(new_i_size);
+               inode_update_flags = 0;
+       }
 
-       le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+       if (i_sectors_delta) {
+               le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+               inode_update_flags = 0;
+       }
 
-       inode->k.p.snapshot = iter.snapshot;
+       if (inode->k.p.snapshot != iter.snapshot) {
+               inode->k.p.snapshot = iter.snapshot;
+               inode_update_flags = 0;
+       }
 
        ret = bch2_trans_update(trans, &iter, &inode->k_i,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                               inode_update_flags);
 err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
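
The inode-update hunk above is worth spelling out: the update starts flagged BTREE_UPDATE_NOJOURNAL, and the flag is cleared the moment any field actually changes, so an extent write that moves neither i_size, i_sectors, nor the snapshot yields a btree update that skips the journal entirely — which is also why NOJOURNAL updates were excluded from the journal_u64s accounting earlier in this commit. In sketch form (the booleans are illustrative stand-ins for the three checks above):

    static unsigned example_inode_update_flags(bool i_size_grew,
                                               bool i_sectors_changed,
                                               bool snapshot_changed)
    {
            /* Assume a no-op update until a field actually changes: */
            unsigned flags = BTREE_UPDATE_NOJOURNAL;

            if (i_size_grew || i_sectors_changed || snapshot_changed)
                    flags = 0;

            return flags;
    }
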
@@ -513,8 +535,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
                if (ret)
                        continue;
 
-               BUG_ON(bkey_ge(iter->pos, end_pos));
-
                bkey_init(&delete.k);
                delete.k.p = iter->pos;
 
@@ -527,8 +547,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
                bch2_disk_reservation_put(c, &disk_res);
        }
 
-       BUG_ON(bkey_gt(iter->pos, end_pos));
-
        return ret ?: ret2;
 }
 
@@ -665,6 +683,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                                     bio_sectors(&n->bio));
 
                        bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+                       if (IS_ENABLED(CONFIG_BCACHEFS_NO_IO) && type != BCH_DATA_btree) {
+                               bio_endio(&n->bio);
+                               continue;
+                       }
+
                        submit_bio(&n->bio);
                } else {
                        n->bio.bi_status        = BLK_STS_REMOVED;
@@ -681,11 +705,12 @@ static void bch2_write_done(struct closure *cl)
        struct bch_fs *c = op->c;
 
        bch2_disk_reservation_put(c, &op->res);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_write);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 
+       EBUG_ON(cl->parent);
        closure_debug_destroy(cl);
        if (op->end_io)
                op->end_io(op);
@@ -780,6 +805,30 @@ err:
        goto out;
 }
 
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+       if (state != wp->state) {
+               u64 now = ktime_get_ns();
+
+               if (wp->last_state_change &&
+                   time_after64(now, wp->last_state_change))
+                       wp->time[wp->state] += now - wp->last_state_change;
+               wp->state = state;
+               wp->last_state_change = now;
+       }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+       enum write_point_state state;
+
+       state = running                  ? WRITE_POINT_running :
+               !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+                                        : WRITE_POINT_stopped;
+
+       __wp_update_state(wp, state);
+}
+
 static void bch2_write_index(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
@@ -787,6 +836,16 @@ static void bch2_write_index(struct closure *cl)
        struct workqueue_struct *wq = index_update_wq(op);
 
        barrier();
+
+       /*
+        * We're not using wp->writes_lock here, so this is racy: that's ok,
+        * because this is just for diagnostic purposes, and we're running out
+        * of interrupt context here, so if we were to take the lock we'd have
+        * to switch to spin_lock_irq()/irqsave(), which is not free:
+        */
+       if (wp->state == WRITE_POINT_waiting_io)
+               __wp_update_state(wp, WRITE_POINT_waiting_work);
+
        op->btree_update_ready = true;
        queue_work(wq, &wp->index_update_work);
 }
@@ -799,16 +858,21 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
 
        while (1) {
                spin_lock(&wp->writes_lock);
-               op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
-               if (op && !op->btree_update_ready)
-                       op = NULL;
-               if (op)
-                       list_del(&op->wp_list);
+               list_for_each_entry(op, &wp->writes, wp_list)
+                       if (op->btree_update_ready) {
+                               list_del(&op->wp_list);
+                               goto unlock;
+                       }
+               op = NULL;
+unlock:
+               wp_update_state(wp, op != NULL);
                spin_unlock(&wp->writes_lock);
 
                if (!op)
                        break;
 
+               op->flags |= BCH_WRITE_IN_WORKER;
+
                __bch2_write_index(op);
 
                if (!(op->flags & BCH_WRITE_DONE))
@@ -850,12 +914,10 @@ static void bch2_write_endio(struct bio *bio)
        if (wbio->put_bio)
                bio_put(bio);
 
-       if (parent) {
+       if (parent)
                bio_endio(&parent->bio);
-               return;
-       }
-
-       closure_put(cl);
+       else
+               closure_put(cl);
 }
 
 static void init_append_extent(struct bch_write_op *op,
@@ -863,7 +925,6 @@ static void init_append_extent(struct bch_write_op *op,
                               struct bversion version,
                               struct bch_extent_crc_unpacked crc)
 {
-       struct bch_fs *c = op->c;
        struct bkey_i_extent *e;
 
        op->pos.offset += crc.uncompressed_size;
@@ -878,7 +939,7 @@ static void init_append_extent(struct bch_write_op *op,
            crc.nonce)
                bch2_extent_crc_append(&e->k_i, crc);
 
-       bch2_alloc_sectors_append_ptrs_inlined(c, wp, &e->k_i, crc.compressed_size,
+       bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
                                       op->flags & BCH_WRITE_CACHED);
 
        bch2_keylist_push(&op->insert_keys);
@@ -1360,8 +1421,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
                                     bkey_start_pos(&orig->k), orig->k.p,
                                     BTREE_ITER_INTENT, k,
                                     NULL, NULL, BTREE_INSERT_NOFAIL, ({
-                       BUG_ON(bkey_ge(bkey_start_pos(k.k), orig->k.p));
-
                        bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
                }));
 
@@ -1641,10 +1700,11 @@ again:
                        goto err;
                }
 
+               EBUG_ON(!wp);
+
+               bch2_open_bucket_get(c, wp, &op->open_buckets);
                ret = bch2_write_extent(op, wp, &bio);
 
-               if (ret >= 0)
-                       bch2_open_bucket_get(c, wp, &op->open_buckets);
                bch2_alloc_sectors_done_inlined(c, wp);
 err:
                if (ret <= 0) {
@@ -1652,6 +1712,8 @@ err:
                                spin_lock(&wp->writes_lock);
                                op->wp = wp;
                                list_add_tail(&op->wp_list, &wp->writes);
+                               if (wp->state == WRITE_POINT_stopped)
+                                       __wp_update_state(wp, WRITE_POINT_waiting_io);
                                spin_unlock(&wp->writes_lock);
                        }
 
@@ -1683,7 +1745,9 @@ err:
         * synchronously here if we weren't able to submit all of the IO at
         * once, as that signals backpressure to the caller.
         */
-       if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
+       if ((op->flags & BCH_WRITE_SYNC) ||
+           (!(op->flags & BCH_WRITE_DONE) &&
+            !(op->flags & BCH_WRITE_IN_WORKER))) {
                closure_sync(&op->cl);
                __bch2_write_index(op);
 
@@ -1705,6 +1769,9 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
        unsigned sectors;
        int ret;
 
+       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+       op->flags |= BCH_WRITE_DONE;
+
        bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
 
        ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
@@ -1732,9 +1799,6 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
        set_bkey_val_bytes(&id->k, data_len);
        bch2_keylist_push(&op->insert_keys);
 
-       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
-       op->flags |= BCH_WRITE_DONE;
-
        __bch2_write_index(op);
 err:
        bch2_write_done(&op->cl);
@@ -1782,7 +1846,7 @@ void bch2_write(struct closure *cl)
        }
 
        if (c->opts.nochanges ||
-           !percpu_ref_tryget_live(&c->writes)) {
+           !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
                op->error = -BCH_ERR_erofs_no_writes;
                goto err;
        }
@@ -1861,10 +1925,12 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
 {
        int ret;
 
+       bch2_data_update_exit(&op->write);
+
        ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
                                     bch_promote_params);
        BUG_ON(ret);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
        kfree_rcu(op, rcu);
 }
 
@@ -1876,8 +1942,6 @@ static void promote_done(struct bch_write_op *wop)
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
                               op->start_time);
-
-       bch2_data_update_exit(&op->write);
        promote_free(c, op);
 }
 
@@ -1898,7 +1962,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
        bch2_data_update_read_done(&op->write, rbio->pick.crc);
 }
 
-static struct promote_op *__promote_alloc(struct bch_fs *c,
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
                                          enum btree_id btree_id,
                                          struct bkey_s_c k,
                                          struct bpos pos,
@@ -1907,12 +1971,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                                          unsigned sectors,
                                          struct bch_read_bio **rbio)
 {
+       struct bch_fs *c = trans->c;
        struct promote_op *op = NULL;
        struct bio *bio;
        unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        int ret;
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
                return NULL;
 
        op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
@@ -1950,7 +2015,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
        bio = &op->write.op.wbio.bio;
        bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
 
-       ret = bch2_data_update_init(c, &op->write,
+       ret = bch2_data_update_init(trans, NULL, &op->write,
                        writepoint_hashed((unsigned long) current),
                        opts,
                        (struct data_update_opts) {
@@ -1959,6 +2024,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                                .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
                        },
                        btree_id, k);
+       if (ret == -BCH_ERR_nocow_lock_blocked) {
+               ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                       bch_promote_params);
+               BUG_ON(ret);
+               goto err;
+       }
+
        BUG_ON(ret);
        op->write.op.end_io = promote_done;
 
@@ -1969,21 +2041,22 @@ err:
        kfree(*rbio);
        *rbio = NULL;
        kfree(op);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
        return NULL;
 }
 
 noinline
-static struct promote_op *promote_alloc(struct bch_fs *c,
-                                              struct bvec_iter iter,
-                                              struct bkey_s_c k,
-                                              struct extent_ptr_decoded *pick,
-                                              struct bch_io_opts opts,
-                                              unsigned flags,
-                                              struct bch_read_bio **rbio,
-                                              bool *bounce,
-                                              bool *read_full)
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+                                       struct bvec_iter iter,
+                                       struct bkey_s_c k,
+                                       struct extent_ptr_decoded *pick,
+                                       struct bch_io_opts opts,
+                                       unsigned flags,
+                                       struct bch_read_bio **rbio,
+                                       bool *bounce,
+                                       bool *read_full)
 {
+       struct bch_fs *c = trans->c;
        bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
        /* data might have to be decompressed in the write path: */
        unsigned sectors = promote_full
@@ -1997,7 +2070,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
        if (!should_promote(c, k, pos, opts, flags))
                return NULL;
 
-       promote = __promote_alloc(c,
+       promote = __promote_alloc(trans,
                                  k.k->type == KEY_TYPE_reflink_v
                                  ? BTREE_ID_reflink
                                  : BTREE_ID_extents,
@@ -2283,7 +2356,8 @@ static void __bch2_read_endio(struct work_struct *work)
        }
 
        csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-       if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) &&
+           !IS_ENABLED(CONFIG_BCACHEFS_NO_IO))
                goto csum_err;
 
        /*
@@ -2604,7 +2678,7 @@ retry_pick:
        }
 
        if (orig->opts.promote_target)
-               promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+               promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
                                        &rbio, &bounce, &read_full);
 
        if (!read_full) {
@@ -2734,10 +2808,21 @@ get_bio:
                             bio_sectors(&rbio->bio));
                bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
 
-               if (likely(!(flags & BCH_READ_IN_RETRY)))
-                       submit_bio(&rbio->bio);
-               else
-                       submit_bio_wait(&rbio->bio);
+               if (IS_ENABLED(CONFIG_BCACHEFS_NO_IO)) {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               bio_endio(&rbio->bio);
+               } else {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               submit_bio(&rbio->bio);
+                       else
+                               submit_bio_wait(&rbio->bio);
+               }
+
+               /*
+                * We just submitted IO that may block; we expect relock fail
+                * events and shouldn't count them:
+                */
+               trans->notrace_relock_fail = true;
        } else {
                /* Attempting reconstruct read: */
                if (bch2_ec_read_extent(c, rbio)) {
index 68e4d7642d4ee38fdca2e35fc0af46c295e29b7e..166ad68177400176b9fa04ae8a23d94934b12941 100644 (file)
 void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
 void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
 
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
 
 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
                               enum bch_data_type, const struct bkey_i *, bool);
@@ -25,23 +29,41 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
 const char *bch2_blk_status_to_str(blk_status_t);
 
 enum bch_write_flags {
-       BCH_WRITE_ALLOC_NOWAIT          = (1 << 0),
-       BCH_WRITE_CACHED                = (1 << 1),
-       BCH_WRITE_DATA_ENCODED          = (1 << 2),
-       BCH_WRITE_PAGES_STABLE          = (1 << 3),
-       BCH_WRITE_PAGES_OWNED           = (1 << 4),
-       BCH_WRITE_ONLY_SPECIFIED_DEVS   = (1 << 5),
-       BCH_WRITE_WROTE_DATA_INLINE     = (1 << 6),
-       BCH_WRITE_CHECK_ENOSPC          = (1 << 7),
-       BCH_WRITE_SYNC                  = (1 << 8),
-       BCH_WRITE_MOVE                  = (1 << 9),
-
-       /* Internal: */
-       BCH_WRITE_DONE                  = (1 << 10),
-       BCH_WRITE_IO_ERROR              = (1 << 11),
-       BCH_WRITE_CONVERT_UNWRITTEN     = (1 << 12),
+       __BCH_WRITE_ALLOC_NOWAIT,
+       __BCH_WRITE_CACHED,
+       __BCH_WRITE_DATA_ENCODED,
+       __BCH_WRITE_PAGES_STABLE,
+       __BCH_WRITE_PAGES_OWNED,
+       __BCH_WRITE_ONLY_SPECIFIED_DEVS,
+       __BCH_WRITE_WROTE_DATA_INLINE,
+       __BCH_WRITE_FROM_INTERNAL,
+       __BCH_WRITE_CHECK_ENOSPC,
+       __BCH_WRITE_SYNC,
+       __BCH_WRITE_MOVE,
+       __BCH_WRITE_IN_WORKER,
+       __BCH_WRITE_DONE,
+       __BCH_WRITE_IO_ERROR,
+       __BCH_WRITE_CONVERT_UNWRITTEN,
 };
 
+#define BCH_WRITE_ALLOC_NOWAIT         (1U << __BCH_WRITE_ALLOC_NOWAIT)
+#define BCH_WRITE_CACHED               (1U << __BCH_WRITE_CACHED)
+#define BCH_WRITE_DATA_ENCODED         (1U << __BCH_WRITE_DATA_ENCODED)
+#define BCH_WRITE_PAGES_STABLE         (1U << __BCH_WRITE_PAGES_STABLE)
+#define BCH_WRITE_PAGES_OWNED          (1U << __BCH_WRITE_PAGES_OWNED)
+#define BCH_WRITE_ONLY_SPECIFIED_DEVS  (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS)
+#define BCH_WRITE_WROTE_DATA_INLINE    (1U << __BCH_WRITE_WROTE_DATA_INLINE)
+#define BCH_WRITE_FROM_INTERNAL                (1U << __BCH_WRITE_FROM_INTERNAL)
+#define BCH_WRITE_CHECK_ENOSPC         (1U << __BCH_WRITE_CHECK_ENOSPC)
+#define BCH_WRITE_SYNC                 (1U << __BCH_WRITE_SYNC)
+#define BCH_WRITE_MOVE                 (1U << __BCH_WRITE_MOVE)
+
+/* Internal: */
+#define BCH_WRITE_IN_WORKER            (1U << __BCH_WRITE_IN_WORKER)
+#define BCH_WRITE_DONE                 (1U << __BCH_WRITE_DONE)
+#define BCH_WRITE_IO_ERROR             (1U << __BCH_WRITE_IO_ERROR)
+#define BCH_WRITE_CONVERT_UNWRITTEN    (1U << __BCH_WRITE_CONVERT_UNWRITTEN)
+
 static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 {
        return op->alloc_reserve == RESERVE_movinggc
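
The io.h hunk above splits each write flag into a bit number (__BCH_WRITE_*) and a derived mask (BCH_WRITE_*): the mask form works with ordinary bit arithmetic on op->flags, while the bit-number form suits bit-number APIs. A minimal sketch of the idiom, with hypothetical names (not identifiers from this diff):

	enum example_flags {
		__EX_FLAG_A,
		__EX_FLAG_B,
	};

	#define EX_FLAG_A	(1U << __EX_FLAG_A)
	#define EX_FLAG_B	(1U << __EX_FLAG_B)

	static void example(unsigned *flags, unsigned long *bitmap)
	{
		*flags |= EX_FLAG_A;		/* mask form, as op->flags uses */
		set_bit(__EX_FLAG_B, bitmap);	/* bit-number form, for set_bit()/test_bit() */
	}
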
index 1cbca187cb15d2829cb1c8e09c694c4131f86c0d..6214a919da8072b7a06f8f92ff63462725b495e8 100644 (file)
@@ -225,7 +225,7 @@ static int journal_entry_open(struct journal *j)
        if (!fifo_free(&j->pin))
                return JOURNAL_ERR_journal_pin_full;
 
-       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
+       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
                return JOURNAL_ERR_max_in_flight;
 
        BUG_ON(!j->cur_entry_sectors);
index ee37f90aa6b369097170d85ea2b3fe35c69d56ea..024cea9f5902e7b0be7475d9b73b48d9b8b76956 100644 (file)
  */
 
 #include <linux/hash.h>
-#include <linux/prefetch.h>
 
 #include "journal_types.h"
 
@@ -305,26 +304,15 @@ static inline int journal_res_get_fast(struct journal *j,
 {
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
-       unsigned u64s, offset;
 
        do {
                old.v = new.v = v;
 
-               /*
-                * Round up the end of the journal reservation to the next
-                * cacheline boundary:
-                */
-               u64s = res->u64s;
-               offset = sizeof(struct jset) / sizeof(u64) +
-                         new.cur_entry_offset + u64s;
-               u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1;
-
-
                /*
                 * Check if there is still room in the current journal
                 * entry:
                 */
-               if (new.cur_entry_offset + u64s > j->cur_entry_u64s)
+               if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
                        return 0;
 
                EBUG_ON(!journal_state_count(new, new.idx));
@@ -332,7 +320,7 @@ static inline int journal_res_get_fast(struct journal *j,
                if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark)
                        return 0;
 
-               new.cur_entry_offset += u64s;
+               new.cur_entry_offset += res->u64s;
                journal_state_inc(&new);
 
                /*
@@ -349,15 +337,8 @@ static inline int journal_res_get_fast(struct journal *j,
 
        res->ref        = true;
        res->idx        = old.idx;
-       res->u64s       = u64s;
        res->offset     = old.cur_entry_offset;
        res->seq        = le64_to_cpu(j->buf[old.idx].data->seq);
-
-       offset = res->offset;
-       while (offset < res->offset + res->u64s) {
-               prefetchw(vstruct_idx(j->buf[res->idx].data, offset));
-               offset += SMP_CACHE_BYTES / sizeof(u64);
-       }
        return 1;
 }
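
The simplified journal_res_get_fast() above drops the cacheline rounding and prefetching but keeps the lock-free reservation loop: read the packed state, check for room, bump the offset, and retry if another CPU raced in. A reduced sketch of that pattern with simplified types (not the actual bcachefs state layout):

	/* Claim @u64s units from a shared entry; return the reservation
	 * offset, or -1 if the current entry is full: */
	static long res_get_fast(atomic64_t *cur_offset, u64 max, u64 u64s)
	{
		u64 v = atomic64_read(cur_offset), old;

		do {
			old = v;
			if (old + u64s > max)	/* no room in current entry */
				return -1;
		} while ((v = atomic64_cmpxchg(cur_offset, old, old + u64s)) != old);

		return old;			/* reservation starts here */
	}
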
 
index d6f259348b3dbb67eec2740a19b93b333765d038..f6374a2bdc83fe4353be990faab786a2c96ac6c8 100644 (file)
@@ -154,7 +154,7 @@ replace:
        i->nr_ptrs      = 0;
        i->csum_good    = entry_ptr.csum_good;
        i->ignore       = false;
-       memcpy(&i->j, j, bytes);
+       unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
        i->ptrs[i->nr_ptrs++] = entry_ptr;
 
        if (dup) {
@@ -341,7 +341,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
                int ret = journal_validate_key(c, jset, entry,
                                               entry->level,
                                               entry->btree_id,
-                                              k, version, big_endian, write);
+                                              k, version, big_endian, write|BKEY_INVALID_FROM_JOURNAL);
                if (ret == FSCK_DELETED_KEY)
                        continue;
 
@@ -662,7 +662,8 @@ static int journal_entry_overwrite_validate(struct bch_fs *c,
                                      struct jset_entry *entry,
                                      unsigned version, int big_endian, int write)
 {
-       return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write);
+       return journal_entry_btree_keys_validate(c, jset, entry,
+                               version, big_endian, READ);
 }
 
 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
@@ -1498,6 +1499,8 @@ static void journal_write_done(struct closure *cl)
 
                        bch2_do_discards(c);
                        closure_wake_up(&c->freelist_wait);
+
+                       bch2_reset_alloc_cursors(c);
                }
        } else if (!j->err_seq || seq < j->err_seq)
                j->err_seq      = seq;
index e873ce2a3f03a5e9c2ba4d4cfc2ff87c30065ad2..8744581dfda7de7bad38aa65ad66b9aca27562a1 100644 (file)
@@ -347,13 +347,13 @@ void bch2_journal_pin_put(struct journal *j, u64 seq)
        }
 }
 
-static inline void __journal_pin_drop(struct journal *j,
+static inline bool __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
 {
        struct journal_entry_pin_list *pin_list;
 
        if (!journal_pin_active(pin))
-               return;
+               return false;
 
        if (j->flush_in_progress == pin)
                j->flush_in_progress_dropped = true;
@@ -363,19 +363,19 @@ static inline void __journal_pin_drop(struct journal *j,
        list_del_init(&pin->list);
 
        /*
-        * Unpinning a journal entry may make journal_next_bucket() succeed if
+        * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
-       if (atomic_dec_and_test(&pin_list->count) &&
-           pin_list == &fifo_peek_front(&j->pin))
-               bch2_journal_reclaim_fast(j);
+       return atomic_dec_and_test(&pin_list->count) &&
+               pin_list == &fifo_peek_front(&j->pin);
 }
 
 void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
 {
        spin_lock(&j->lock);
-       __journal_pin_drop(j, pin);
+       if (__journal_pin_drop(j, pin))
+               bch2_journal_reclaim_fast(j);
        spin_unlock(&j->lock);
 }
 
@@ -384,6 +384,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
                          journal_pin_flush_fn flush_fn)
 {
        struct journal_entry_pin_list *pin_list;
+       bool reclaim;
 
        spin_lock(&j->lock);
 
@@ -400,7 +401,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
        pin_list = journal_seq_pin(j, seq);
 
-       __journal_pin_drop(j, pin);
+       reclaim = __journal_pin_drop(j, pin);
 
        atomic_inc(&pin_list->count);
        pin->seq        = seq;
@@ -412,6 +413,9 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
                list_add(&pin->list, &pin_list->list);
        else
                list_add(&pin->list, &pin_list->flushed);
+
+       if (reclaim)
+               bch2_journal_reclaim_fast(j);
        spin_unlock(&j->lock);
 
        /*
@@ -703,7 +707,7 @@ static int bch2_journal_reclaim_thread(void *arg)
                        j->next_reclaim = now + delay;
 
                while (1) {
-                       set_current_state(TASK_INTERRUPTIBLE);
+                       set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                        if (kthread_should_stop())
                                break;
                        if (j->reclaim_kicked)
@@ -714,9 +718,9 @@ static int bch2_journal_reclaim_thread(void *arg)
                        spin_unlock(&j->lock);
 
                        if (journal_empty)
-                               freezable_schedule();
+                               schedule();
                        else if (time_after(j->next_reclaim, jiffies))
-                               freezable_schedule_timeout(j->next_reclaim - jiffies);
+                               schedule_timeout(j->next_reclaim - jiffies);
                        else
                                break;
                }
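
__journal_pin_drop() now reports whether it emptied the front pin list instead of calling bch2_journal_reclaim_fast() itself, so bch2_journal_pin_set() can drop the old pin, link the new one, and reclaim only once the lists are consistent again. The resulting call shape, roughly (add_new_pin() is a hypothetical stand-in for the re-linking done inline above):

	spin_lock(&j->lock);
	reclaim = __journal_pin_drop(j, pin);	/* may empty the front pin list */
	add_new_pin(j, pin, seq);		/* hypothetical: the re-linking shown above */
	if (reclaim)
		bch2_journal_reclaim_fast(j);	/* safe: pin lists consistent again */
	spin_unlock(&j->lock);
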
index c8729cb37f06808ecec9588ad9a159db124281f2..0e6bde669b3e6577b542a4243a11a061693ffc8c 100644 (file)
@@ -182,29 +182,32 @@ typedef DARRAY(u64)               darray_u64;
 /* Embedded in struct bch_fs */
 struct journal {
        /* Fastpath stuff up front: */
-
-       unsigned long           flags;
+       struct {
 
        union journal_res_state reservations;
        enum journal_watermark  watermark;
 
+       union journal_preres_state prereserved;
+
+       } __aligned(SMP_CACHE_BYTES);
+
+       unsigned long           flags;
+
        /* Max size of current journal entry */
        unsigned                cur_entry_u64s;
        unsigned                cur_entry_sectors;
 
+       /* Reserved space in journal entry to be used just prior to write */
+       unsigned                entry_u64s_reserved;
+
        /*
         * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
         * insufficient devices:
         */
        enum journal_errors     cur_entry_error;
 
-       union journal_preres_state prereserved;
-
-       /* Reserved space in journal entry to be used just prior to write */
-       unsigned                entry_u64s_reserved;
-
        unsigned                buf_size_want;
-
        /*
         * We may queue up some things to be journalled (log messages) before
         * the journal has actually started - stash them here:
@@ -298,15 +301,15 @@ struct journal {
        u64                     nr_flush_writes;
        u64                     nr_noflush_writes;
 
-       struct time_stats       *flush_write_time;
-       struct time_stats       *noflush_write_time;
-       struct time_stats       *blocked_time;
-       struct time_stats       *flush_seq_time;
+       struct bch2_time_stats  *flush_write_time;
+       struct bch2_time_stats  *noflush_write_time;
+       struct bch2_time_stats  *blocked_time;
+       struct bch2_time_stats  *flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map      res_map;
 #endif
-};
+} __aligned(SMP_CACHE_BYTES);
 
 /*
  * Embedded in struct bch_dev. First three fields refer to the array of journal
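
Wrapping the reservation state in an anonymous struct marked __aligned(SMP_CACHE_BYTES) gives the contended fastpath atomics their own cacheline, away from fields written on slower paths. The idiom in isolation, with generic field names:

	struct hot_cold_split {
		struct {
			atomic64_t	hot_counter;	/* contended fastpath state */
		} __aligned(SMP_CACHE_BYTES);

		unsigned long		flags;		/* colder fields start on the
							 * next cacheline */
	} __aligned(SMP_CACHE_BYTES);
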
index e542cd3d4ff280d55fc8f2a20e0235d75c3f555b..07fb41ca8c6b4e2e7d12d8cb4948ba499039fc77 100644 (file)
@@ -10,7 +10,7 @@
 
 /* KEY_TYPE_lru is obsolete: */
 int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                    int rw, struct printbuf *err)
+                    unsigned flags, struct printbuf *err)
 {
        const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
 
@@ -20,6 +20,12 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
+       if (!lru_pos_time(k.k->p)) {
+               prt_printf(err, "lru entry at time=0");
+               return -BCH_ERR_invalid_bkey;
+       }
+
        return 0;
 }
 
@@ -31,6 +37,15 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
        prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
 }
 
+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
+{
+       prt_printf(out, "%llu:%llu -> %llu:%llu",
+                  lru_pos_id(lru),
+                  lru_pos_time(lru),
+                  u64_to_bucket(lru.offset).inode,
+                  u64_to_bucket(lru.offset).offset);
+}
+
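
bch2_lru_pos_to_text() decodes the LRU key layout implied by lru_pos_time() below: the high bits of pos.inode hold the LRU ID, the low LRU_TIME_BITS hold the time, and pos.offset packs the device:bucket the entry points back at. A sketch of the matching encoder (assumed, consistent with those helpers):

	static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
	{
		/* time must fit in LRU_TIME_BITS; lru_id occupies the bits above */
		return POS(((u64) lru_id << LRU_TIME_BITS) | time, dev_bucket);
	}
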
 static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
                        u64 dev_bucket, u64 time, unsigned key_type)
 {
index 2e22f139848ae36a21667f922140a7ad6e84c7d7..b8d9848cdb1acc030a84cc950b92f5504354eac1 100644 (file)
@@ -22,9 +22,11 @@ static inline u64 lru_pos_time(struct bpos pos)
        return pos.inode & ~(~0ULL << LRU_TIME_BITS);
 }
 
-int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
+void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
+
 #define bch2_bkey_ops_lru ((struct bkey_ops) { \
        .key_invalid    = bch2_lru_invalid,     \
        .val_to_text    = bch2_lru_to_text,     \
index b308354aa5e3807971054a5e17b4b603fd8c60d8..8321563d018722b00e35fda24176f40b41e61cc4 100644 (file)
@@ -61,7 +61,7 @@ static void move_free(struct moving_io *io)
 
        bch2_data_update_exit(&io->write);
        wake_up(&ctxt->wait);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_move);
        kfree(io);
 }
 
@@ -74,6 +74,7 @@ static void move_write_done(struct bch_write_op *op)
                ctxt->write_error = true;
 
        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+       atomic_dec(&io->write.ctxt->write_ios);
        move_free(io);
        closure_put(&ctxt->cl);
 }
@@ -87,11 +88,12 @@ static void move_write(struct moving_io *io)
 
        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+       atomic_inc(&io->write.ctxt->write_ios);
 
        bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
 }
 
-static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
 {
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
@@ -105,35 +107,27 @@ static void move_read_endio(struct bio *bio)
        struct moving_context *ctxt = io->write.ctxt;
 
        atomic_sub(io->read_sectors, &ctxt->read_sectors);
+       atomic_dec(&ctxt->read_ios);
        io->read_completed = true;
 
        wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
 }
 
-static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
+                                       struct btree_trans *trans)
 {
        struct moving_io *io;
 
        if (trans)
                bch2_trans_unlock(trans);
 
-       while ((io = next_pending_write(ctxt))) {
+       while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                list_del(&io->list);
                move_write(io);
        }
 }
 
-#define move_ctxt_wait_event(_ctxt, _trans, _cond)             \
-do {                                                           \
-       do_pending_writes(_ctxt, _trans);                       \
-                                                               \
-       if (_cond)                                              \
-               break;                                          \
-       __wait_event((_ctxt)->wait,                             \
-                    next_pending_write(_ctxt) || (_cond));     \
-} while (1)
-
 static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
                                       struct btree_trans *trans)
 {
@@ -148,7 +142,11 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
 {
        move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
        closure_sync(&ctxt->cl);
+
        EBUG_ON(atomic_read(&ctxt->write_sectors));
+       EBUG_ON(atomic_read(&ctxt->write_ios));
+       EBUG_ON(atomic_read(&ctxt->read_sectors));
+       EBUG_ON(atomic_read(&ctxt->read_ios));
 
        if (ctxt->stats) {
                progress_list_del(ctxt->c, ctxt->stats);
@@ -257,7 +255,7 @@ static int bch2_move_extent(struct btree_trans *trans,
                return 0;
        }
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
                return -BCH_ERR_erofs_no_writes;
 
        /*
@@ -299,8 +297,8 @@ static int bch2_move_extent(struct btree_trans *trans,
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
-       ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
-                                   data_opts, btree_id, k);
+       ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
+                                   io_opts, data_opts, btree_id, k);
        if (ret && ret != -BCH_ERR_unwritten_extent_update)
                goto err_free_pages;
 
@@ -323,6 +321,7 @@ static int bch2_move_extent(struct btree_trans *trans,
        trace_move_extent_read(k.k);
 
        atomic_add(io->read_sectors, &ctxt->read_sectors);
+       atomic_inc(&ctxt->read_ios);
        list_add_tail(&io->list, &ctxt->reads);
 
        /*
@@ -341,7 +340,7 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_move);
        trace_and_count(c, move_extent_alloc_mem_fail, k.k);
        return ret;
 }
@@ -412,13 +411,15 @@ static int move_ratelimit(struct btree_trans *trans,
                }
        } while (delay);
 
+       /*
+        * XXX: these limits really ought to be per device; SSDs and hard drives
+        * will want different limits
+        */
        move_ctxt_wait_event(ctxt, trans,
-               atomic_read(&ctxt->write_sectors) <
-               c->opts.move_bytes_in_flight >> 9);
-
-       move_ctxt_wait_event(ctxt, trans,
-               atomic_read(&ctxt->read_sectors) <
-               c->opts.move_bytes_in_flight >> 9);
+               atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+               atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+               atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+               atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
 
        return 0;
 }
index b14f679f6904eb80bf1dd4d5079b515b8e2ccb3a..aef613802935e90958a9e3d4659b7d94e57773df 100644 (file)
@@ -24,10 +24,26 @@ struct moving_context {
        /* in flight sectors: */
        atomic_t                read_sectors;
        atomic_t                write_sectors;
+       atomic_t                read_ios;
+       atomic_t                write_ios;
 
        wait_queue_head_t       wait;
 };
 
+#define move_ctxt_wait_event(_ctxt, _trans, _cond)                     \
+do {                                                                   \
+       bool cond_finished = false;                                     \
+       bch2_moving_ctxt_do_pending_writes(_ctxt, _trans);              \
+                                                                       \
+       if (_cond)                                                      \
+               break;                                                  \
+       __wait_event((_ctxt)->wait,                                     \
+                    bch2_moving_ctxt_next_pending_write(_ctxt) ||      \
+                    (cond_finished = (_cond)));                        \
+       if (cond_finished)                                              \
+               break;                                                  \
+} while (1)
+
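
move_ctxt_wait_event() now latches the caller's condition in cond_finished, so a condition that became true at the same time as a pending-write wakeup isn't re-evaluated and lost. Typical use, as in move_ratelimit() above:

	/* Flush completed writes while waiting for in-flight IOs to drop: */
	move_ctxt_wait_event(ctxt, trans,
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
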
 typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
                             struct bch_io_opts *, struct data_update_opts *);
 
@@ -35,6 +51,9 @@ void bch2_moving_ctxt_exit(struct moving_context *);
 void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
                           struct bch_ratelimit *, struct bch_move_stats *,
                           struct write_point_specifier, bool);
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
+                                       struct btree_trans *);
 
 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
 
index bff6267158cc3f6d30bbb3b1ac141d28b8379294..396357cd8f2fe715c278bfe486799e6fe039e335 100644 (file)
@@ -18,6 +18,8 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos
        return false;
 }
 
+#define sign(v)                ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)
+
 void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
 {
        u64 dev_bucket = bucket_to_u64(bucket);
@@ -27,6 +29,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc
 
        for (i = 0; i < ARRAY_SIZE(l->b); i++)
                if (l->b[i] == dev_bucket) {
+                       BUG_ON(sign(atomic_read(&l->l[i])) != lock_val);
+
                        if (!atomic_sub_return(lock_val, &l->l[i]))
                                closure_wake_up(&l->wait);
                        return;
@@ -35,8 +39,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc
        BUG();
 }
 
-static bool bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
-                                     u64 dev_bucket, int flags)
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
+                                u64 dev_bucket, int flags)
 {
        int v, lock_val = flags ? 1 : -1;
        unsigned i;
@@ -69,11 +73,11 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
                              struct nocow_lock_bucket *l,
                              u64 dev_bucket, int flags)
 {
-       if (!bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
+       if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
                struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
                u64 start_time = local_clock();
 
-               __closure_wait_event(&l->wait, bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+               __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
                bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
        }
 }
index 45258cc34614af8c86e047d3f910489c2484d9fa..ff8e4af52edcd95fefc0b3164870280ff58fd2b7 100644 (file)
@@ -20,6 +20,7 @@ static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lo
 
 bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
 void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
 void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
                              struct nocow_lock_bucket *, u64, int);
 
@@ -32,6 +33,15 @@ static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
        __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
 }
 
+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
+                                         struct bpos bucket, int flags)
+{
+       u64 dev_bucket = bucket_to_u64(bucket);
+       struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+       return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+}
+
 void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
 
 int bch2_fs_nocow_locking_init(struct bch_fs *);
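
The new trylock variant hashes the bucket to its lock-table entry and attempts the lock without sleeping; the promote path above uses it to bail out with -BCH_ERR_nocow_lock_blocked rather than block a read. A hypothetical caller:

	if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, flags))
		return -BCH_ERR_nocow_lock_blocked;	/* skip instead of sleeping */
	/* ... do the nocow write ... */
	bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
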
index 555e63730a968ebfb6431671d1861fb01f26ffe7..04e2989cd6b392703592c134b211eefce3d26220 100644 (file)
@@ -9,8 +9,6 @@
 #include "super-io.h"
 #include "util.h"
 
-#include <linux/pretty-printers.h>
-
 #define x(t, n) [n] = #t,
 
 const char * const bch2_metadata_versions[] = {
@@ -284,7 +282,7 @@ int bch2_opt_parse(struct bch_fs *c,
                if (ret < 0) {
                        if (err)
                                prt_printf(err, "%s: must be a number",
-                                      opt->attr.name);
+                                          opt->attr.name);
                        return ret;
                }
                break;
@@ -293,7 +291,7 @@ int bch2_opt_parse(struct bch_fs *c,
                if (ret < 0) {
                        if (err)
                                prt_printf(err, "%s: invalid selection",
-                                      opt->attr.name);
+                                          opt->attr.name);
                        return ret;
                }
 
@@ -307,7 +305,7 @@ int bch2_opt_parse(struct bch_fs *c,
                if (ret < 0) {
                        if (err)
                                prt_printf(err, "%s: parse error",
-                                      opt->attr.name);
+                                          opt->attr.name);
                        return ret;
                }
        }
index ee3055cf8200d191d726bdbed223ad2d85d6c834..9d4bbec7e6820ec7ea3cd021d766a8c6d53719e5 100644 (file)
@@ -294,7 +294,12 @@ enum opt_type {
          OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,              \
          OPT_UINT(1024, U32_MAX),                                      \
          BCH2_NO_SB_OPT,               1U << 20,                       \
-         NULL,         "Amount of IO in flight to keep in flight by the move path")\
+         NULL,         "Maximum Amount of IO to keep in flight by the move path")\
+       x(move_ios_in_flight,           u32,                            \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
+         OPT_UINT(1, 1024),                                            \
+         BCH2_NO_SB_OPT,               32,                             \
+         NULL,         "Maximum number of IOs to keep in flight by the move path")\
        x(fsck,                         u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
@@ -336,6 +341,11 @@ enum opt_type {
          OPT_BOOL(),                                                   \
          BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Only read the journal, skip the rest of recovery")\
+       x(journal_transaction_names,    u8,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
+         OPT_BOOL(),                                                   \
+         BCH_SB_JOURNAL_TRANSACTION_NAMES, true,                       \
+         NULL,         "Log transaction function names in journal")    \
        x(noexcl,                       u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
similarity index 72%
rename from linux/printbuf.c
rename to libbcachefs/printbuf.c
index 5cf79d43f5a4eee2244c9282ca87fffca5ad904e..c41daa1806821198ad4f640f6dc508841718b51b 100644 (file)
@@ -4,16 +4,17 @@
 #include <linux/err.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
-#include <linux/printbuf.h>
 #include <linux/slab.h>
 #include <linux/string_helpers.h>
 
+#include "printbuf.h"
+
 static inline unsigned printbuf_linelen(struct printbuf *buf)
 {
        return buf->pos - buf->last_newline;
 }
 
-int printbuf_make_room(struct printbuf *out, unsigned extra)
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
 {
        unsigned new_size;
        char *buf;
@@ -44,13 +45,46 @@ int printbuf_make_room(struct printbuf *out, unsigned extra)
        out->size       = new_size;
        return 0;
 }
-EXPORT_SYMBOL(printbuf_make_room);
+
+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+       int len;
+
+       do {
+               va_list args2;
+
+               va_copy(args2, args);
+               len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+               va_end(args2);
+       } while (len + 1 >= printbuf_remaining(out) &&
+                !bch2_printbuf_make_room(out, len + 1));
+
+       len = min_t(size_t, len,
+                 printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+       out->pos += len;
+}
+
+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+       va_list args;
+       int len;
+
+       do {
+               va_start(args, fmt);
+               len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+               va_end(args);
+       } while (len + 1 >= printbuf_remaining(out) &&
+                !bch2_printbuf_make_room(out, len + 1));
+
+       len = min_t(size_t, len,
+                 printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+       out->pos += len;
+}
 
 /**
  * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
  * terminated
  */
-const char *printbuf_str(const struct printbuf *buf)
+const char *bch2_printbuf_str(const struct printbuf *buf)
 {
        /*
         * If we've written to a printbuf then it's guaranteed to be a null
@@ -61,33 +95,29 @@ const char *printbuf_str(const struct printbuf *buf)
                ? buf->buf
                : "";
 }
-EXPORT_SYMBOL(printbuf_str);
 
 /**
  * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
  * against accidental use.
  */
-void printbuf_exit(struct printbuf *buf)
+void bch2_printbuf_exit(struct printbuf *buf)
 {
        if (buf->heap_allocated) {
                kfree(buf->buf);
                buf->buf = ERR_PTR(-EINTR); /* poison value */
        }
 }
-EXPORT_SYMBOL(printbuf_exit);
 
-void printbuf_tabstops_reset(struct printbuf *buf)
+void bch2_printbuf_tabstops_reset(struct printbuf *buf)
 {
        buf->nr_tabstops = 0;
 }
-EXPORT_SYMBOL(printbuf_tabstops_reset);
 
-void printbuf_tabstop_pop(struct printbuf *buf)
+void bch2_printbuf_tabstop_pop(struct printbuf *buf)
 {
        if (buf->nr_tabstops)
                --buf->nr_tabstops;
 }
-EXPORT_SYMBOL(printbuf_tabstop_pop);
 
 /*
  * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop
@@ -99,7 +129,7 @@ EXPORT_SYMBOL(printbuf_tabstop_pop);
  * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
  * of line.
  */
-int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
 {
        unsigned prev_tabstop = buf->nr_tabstops
                ? buf->_tabstops[buf->nr_tabstops - 1]
@@ -112,7 +142,6 @@ int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
        buf->has_indent_or_tabstops = true;
        return 0;
 }
-EXPORT_SYMBOL(printbuf_tabstop_push);
 
 /**
  * printbuf_indent_add - add to the current indent level
@@ -123,7 +152,7 @@ EXPORT_SYMBOL(printbuf_tabstop_push);
  * Subsequent lines, and the current line if the output position is at the start
  * of the current line, will be indented by @spaces more spaces.
  */
-void printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
 {
        if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
                spaces = 0;
@@ -133,7 +162,6 @@ void printbuf_indent_add(struct printbuf *buf, unsigned spaces)
 
        buf->has_indent_or_tabstops = true;
 }
-EXPORT_SYMBOL(printbuf_indent_add);
 
 /**
  * printbuf_indent_sub - subtract from the current indent level
@@ -144,7 +172,7 @@ EXPORT_SYMBOL(printbuf_indent_add);
  * Subsequent lines, and the current line if the output position is at the start
  * of the current line, will be indented by @spaces less spaces.
  */
-void printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
 {
        if (WARN_ON_ONCE(spaces > buf->indent))
                spaces = buf->indent;
@@ -158,13 +186,12 @@ void printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
        if (!buf->indent && !buf->nr_tabstops)
                buf->has_indent_or_tabstops = false;
 }
-EXPORT_SYMBOL(printbuf_indent_sub);
 
-void prt_newline(struct printbuf *buf)
+void bch2_prt_newline(struct printbuf *buf)
 {
        unsigned i;
 
-       printbuf_make_room(buf, 1 + buf->indent);
+       bch2_printbuf_make_room(buf, 1 + buf->indent);
 
        __prt_char(buf, '\n');
 
@@ -178,7 +205,6 @@ void prt_newline(struct printbuf *buf)
        buf->last_field         = buf->pos;
        buf->cur_tabstop        = 0;
 }
-EXPORT_SYMBOL(prt_newline);
 
 /*
  * Returns spaces from start of line, if set, or 0 if unset:
@@ -207,14 +233,13 @@ static void __prt_tab(struct printbuf *out)
  *
  * Advance output to the next tabstop by printing spaces.
  */
-void prt_tab(struct printbuf *out)
+void bch2_prt_tab(struct printbuf *out)
 {
        if (WARN_ON(!cur_tabstop(out)))
                return;
 
        __prt_tab(out);
 }
-EXPORT_SYMBOL(prt_tab);
 
 static void __prt_tab_rjust(struct printbuf *buf)
 {
@@ -222,7 +247,7 @@ static void __prt_tab_rjust(struct printbuf *buf)
        int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
 
        if (pad > 0) {
-               printbuf_make_room(buf, pad);
+               bch2_printbuf_make_room(buf, pad);
 
                if (buf->last_field + pad < buf->size)
                        memmove(buf->buf + buf->last_field + pad,
@@ -250,14 +275,13 @@ static void __prt_tab_rjust(struct printbuf *buf)
  * Advance output to the next tabstop by inserting spaces immediately after the
  * previous tabstop, right justifying previously outputted text.
  */
-void prt_tab_rjust(struct printbuf *buf)
+void bch2_prt_tab_rjust(struct printbuf *buf)
 {
        if (WARN_ON(!cur_tabstop(buf)))
                return;
 
        __prt_tab_rjust(buf);
 }
-EXPORT_SYMBOL(prt_tab_rjust);
 
 /**
  * prt_bytes_indented - Print an array of chars, handling embedded control characters
@@ -271,7 +295,7 @@ EXPORT_SYMBOL(prt_tab_rjust);
  *   \t: prt_tab       advance to next tabstop
  *   \r: prt_tab_rjust advance to next tabstop, with right justification
  */
-void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
 {
        const char *unprinted_start = str;
        const char *end = str + count;
@@ -286,7 +310,7 @@ void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
                case '\n':
                        prt_bytes(out, unprinted_start, str - unprinted_start);
                        unprinted_start = str + 1;
-                       prt_newline(out);
+                       bch2_prt_newline(out);
                        break;
                case '\t':
                        if (likely(cur_tabstop(out))) {
@@ -309,34 +333,31 @@ void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
 
        prt_bytes(out, unprinted_start, str - unprinted_start);
 }
-EXPORT_SYMBOL(prt_bytes_indented);
 
 /**
  * prt_human_readable_u64 - Print out a u64 in human readable units
  *
  * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
  */
-void prt_human_readable_u64(struct printbuf *buf, u64 v)
+void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v)
 {
-       printbuf_make_room(buf, 10);
+       bch2_printbuf_make_room(buf, 10);
        buf->pos += string_get_size(v, 1, !buf->si_units,
                                    buf->buf + buf->pos,
                                    printbuf_remaining_size(buf));
 }
-EXPORT_SYMBOL(prt_human_readable_u64);
 
 /**
  * prt_human_readable_s64 - Print out a s64 in human readable units
  *
  * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
  */
-void prt_human_readable_s64(struct printbuf *buf, s64 v)
+void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v)
 {
        if (v < 0)
                prt_char(buf, '-');
-       prt_human_readable_u64(buf, abs(v));
+       bch2_prt_human_readable_u64(buf, abs(v));
 }
-EXPORT_SYMBOL(prt_human_readable_s64);
 
 /**
  * prt_units_u64 - Print out a u64 according to printbuf unit options
@@ -344,14 +365,13 @@ EXPORT_SYMBOL(prt_human_readable_s64);
  * Units are either raw (default), or human readable units (controlled via
  * @buf->human_readable_units)
  */
-void prt_units_u64(struct printbuf *out, u64 v)
+void bch2_prt_units_u64(struct printbuf *out, u64 v)
 {
        if (out->human_readable_units)
-               prt_human_readable_u64(out, v);
+               bch2_prt_human_readable_u64(out, v);
        else
-               prt_printf(out, "%llu", v);
+               bch2_prt_printf(out, "%llu", v);
 }
-EXPORT_SYMBOL(prt_units_u64);
 
 /**
  * prt_units_s64 - Print out a s64 according to printbuf unit options
@@ -359,10 +379,37 @@ EXPORT_SYMBOL(prt_units_u64);
  * Units are either raw (default), or human readable units (controlled via
  * @buf->human_readable_units)
  */
-void prt_units_s64(struct printbuf *out, s64 v)
+void bch2_prt_units_s64(struct printbuf *out, s64 v)
 {
        if (v < 0)
                prt_char(out, '-');
-       prt_units_u64(out, abs(v));
+       bch2_prt_units_u64(out, abs(v));
+}
+
+void bch2_prt_string_option(struct printbuf *out,
+                           const char * const list[],
+                           size_t selected)
+{
+       size_t i;
+
+       for (i = 0; list[i]; i++)
+               bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
+}
+
+void bch2_prt_bitflags(struct printbuf *out,
+                      const char * const list[], u64 flags)
+{
+       unsigned bit, nr = 0;
+       bool first = true;
+
+       while (list[nr])
+               nr++;
+
+       while (flags && (bit = __ffs(flags)) < nr) {
+               if (!first)
+                       bch2_prt_printf(out, ",");
+               first = false;
+               bch2_prt_printf(out, "%s", list[bit]);
+               flags ^= 1ULL << bit;
+       }
 }
-EXPORT_SYMBOL(prt_units_s64);
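
The two new helpers print a selection list and a set of flag names respectively; both take a NULL-terminated string array. A small usage sketch (the state names are illustrative):

	static const char * const states[] = { "stopped", "waiting_io", "running", NULL };

	struct printbuf buf = PRINTBUF;

	bch2_prt_string_option(&buf, states, 2);	/* "stopped waiting_io [running] " */
	bch2_prt_bitflags(&buf, states, 5);		/* bits 0 and 2: "stopped,running" */
	pr_info("%s\n", bch2_printbuf_str(&buf));
	bch2_printbuf_exit(&buf);
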
similarity index 76%
rename from include/linux/printbuf.h
rename to libbcachefs/printbuf.h
index 24e62e56d18c1c19d7e40f3cb7a871bc208bcf8e..2e99399578336a3692c7f438a71ee48182d20116 100644 (file)
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: LGPL-2.1+ */
 /* Copyright (C) 2022 Kent Overstreet */
 
-#ifndef _LINUX_PRINTBUF_H
-#define _LINUX_PRINTBUF_H
+#ifndef _BCACHEFS_PRINTBUF_H
+#define _BCACHEFS_PRINTBUF_H
 
 /*
  * Printbufs: Simple strings for printing to, with optional heap allocation
@@ -100,26 +100,30 @@ struct printbuf {
        u8                      _tabstops[PRINTBUF_INLINE_TABSTOPS];
 };
 
-int printbuf_make_room(struct printbuf *, unsigned);
-const char *printbuf_str(const struct printbuf *);
-void printbuf_exit(struct printbuf *);
+int bch2_printbuf_make_room(struct printbuf *, unsigned);
+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
+const char *bch2_printbuf_str(const struct printbuf *);
+void bch2_printbuf_exit(struct printbuf *);
 
-void printbuf_tabstops_reset(struct printbuf *);
-void printbuf_tabstop_pop(struct printbuf *);
-int printbuf_tabstop_push(struct printbuf *, unsigned);
+void bch2_printbuf_tabstops_reset(struct printbuf *);
+void bch2_printbuf_tabstop_pop(struct printbuf *);
+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
 
-void printbuf_indent_add(struct printbuf *, unsigned);
-void printbuf_indent_sub(struct printbuf *, unsigned);
+void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
 
-void prt_newline(struct printbuf *);
-void prt_tab(struct printbuf *);
-void prt_tab_rjust(struct printbuf *);
+void bch2_prt_newline(struct printbuf *);
+void bch2_prt_tab(struct printbuf *);
+void bch2_prt_tab_rjust(struct printbuf *);
 
-void prt_bytes_indented(struct printbuf *, const char *, unsigned);
-void prt_human_readable_u64(struct printbuf *, u64);
-void prt_human_readable_s64(struct printbuf *, s64);
-void prt_units_u64(struct printbuf *, u64);
-void prt_units_s64(struct printbuf *, s64);
+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void bch2_prt_human_readable_u64(struct printbuf *, u64);
+void bch2_prt_human_readable_s64(struct printbuf *, s64);
+void bch2_prt_units_u64(struct printbuf *, u64);
+void bch2_prt_units_s64(struct printbuf *, s64);
+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
 
 /* Initializer for a heap allocated printbuf: */
 #define PRINTBUF ((struct printbuf) { .heap_allocated = true })
@@ -163,7 +167,7 @@ static inline bool printbuf_overflowed(struct printbuf *out)
 
 static inline void printbuf_nul_terminate(struct printbuf *out)
 {
-       printbuf_make_room(out, 1);
+       bch2_printbuf_make_room(out, 1);
 
        if (out->pos < out->size)
                out->buf[out->pos] = 0;
@@ -171,7 +175,7 @@ static inline void printbuf_nul_terminate(struct printbuf *out)
                out->buf[out->size - 1] = 0;
 }
 
-/* Doesn't call printbuf_make_room(), doesn't nul terminate: */
+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
 static inline void __prt_char_reserved(struct printbuf *out, char c)
 {
        if (printbuf_remaining(out))
@@ -182,7 +186,7 @@ static inline void __prt_char_reserved(struct printbuf *out, char c)
 /* Doesn't nul terminate: */
 static inline void __prt_char(struct printbuf *out, char c)
 {
-       printbuf_make_room(out, 1);
+       bch2_printbuf_make_room(out, 1);
        __prt_char_reserved(out, c);
 }
 
@@ -203,7 +207,7 @@ static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n
 
 static inline void prt_chars(struct printbuf *out, char c, unsigned n)
 {
-       printbuf_make_room(out, n);
+       bch2_printbuf_make_room(out, n);
        __prt_chars_reserved(out, c, n);
        printbuf_nul_terminate(out);
 }
@@ -212,7 +216,7 @@ static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
 {
        unsigned i, can_print;
 
-       printbuf_make_room(out, n);
+       bch2_printbuf_make_room(out, n);
 
        can_print = min(n, printbuf_remaining(out));
 
@@ -230,12 +234,12 @@ static inline void prt_str(struct printbuf *out, const char *str)
 
 static inline void prt_str_indented(struct printbuf *out, const char *str)
 {
-       prt_bytes_indented(out, str, strlen(str));
+       bch2_prt_bytes_indented(out, str, strlen(str));
 }
 
 static inline void prt_hex_byte(struct printbuf *out, u8 byte)
 {
-       printbuf_make_room(out, 2);
+       bch2_printbuf_make_room(out, 2);
        __prt_char_reserved(out, hex_asc_hi(byte));
        __prt_char_reserved(out, hex_asc_lo(byte));
        printbuf_nul_terminate(out);
@@ -243,7 +247,7 @@ static inline void prt_hex_byte(struct printbuf *out, u8 byte)
 
 static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
 {
-       printbuf_make_room(out, 2);
+       bch2_printbuf_make_room(out, 2);
        __prt_char_reserved(out, hex_asc_upper_hi(byte));
        __prt_char_reserved(out, hex_asc_upper_lo(byte));
        printbuf_nul_terminate(out);
@@ -277,30 +281,4 @@ static inline void printbuf_atomic_dec(struct printbuf *buf)
        buf->atomic--;
 }
 
-/*
- * This is used for the %pf(%p) sprintf format extension, where we pass a pretty
- * printer and arguments to the pretty-printer to sprintf
- *
- * Instead of passing a pretty-printer function to sprintf directly, we pass it
- * a pointer to a struct call_pp, so that sprintf can check that the magic
- * number is present, which in turn ensures that the CALL_PP() macro has been
- * used in order to typecheck the arguments to the pretty printer function
- *
- * Example usage:
- *   sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev));
- */
-struct call_pp {
-       unsigned long   magic;
-       void            *fn;
-};
-
-#define PP_TYPECHECK(fn, ...)                                  \
-       ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); })
-
-#define CALL_PP_MAGIC          (unsigned long) 0xce0b92d22f6b6be4
-
-#define CALL_PP(fn, ...)                                       \
-       (PP_TYPECHECK(fn, ##__VA_ARGS__),                       \
-        &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__
-
-#endif /* _LINUX_PRINTBUF_H */
+#endif /* _BCACHEFS_PRINTBUF_H */
index 4b663f320bfc904fbab9578b7ff208f3f871d473..331f22835d1859574cd073784242d557e130d7cd 100644 (file)
@@ -59,7 +59,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
 };
 
 int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                      int rw, struct printbuf *err)
+                      unsigned flags, struct printbuf *err)
 {
        if (k.k->p.inode >= QTYP_NR) {
                prt_printf(err, "invalid quota type (%llu >= %u)",
index 59bed1148201c3dd73ea33672ec60af692880588..146264fd16ce0bf24a5dafe285b4921b0355d79f 100644 (file)
@@ -7,7 +7,7 @@
 
 extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
 
-int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_quota ((struct bkey_ops) {       \
index 8df94ad5d505594f46b9e132e6bc874df65194e5..8a78377bf9c5991846c7e4202107cac7d1b00656 100644 (file)
@@ -969,7 +969,7 @@ static int read_btree_roots(struct bch_fs *c)
                                   ? FSCK_CAN_IGNORE : 0,
                                   "error reading btree root %s",
                                   bch2_btree_ids[i]);
-                       if (i == BTREE_ID_alloc)
+                       if (btree_id_is_alloc(i))
                                c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                }
        }
@@ -1217,6 +1217,9 @@ use_clean:
        if (ret)
                goto err;
 
+       if (c->opts.reconstruct_alloc)
+               bch2_fs_log_msg(c, "dropping alloc info");
+
        /*
         * Skip past versions that might have possibly been used (as nonces),
         * but hadn't had their pointers written:
@@ -1250,6 +1253,20 @@ use_clean:
 
        bch2_stripes_heap_start(c);
 
+       if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+               err = "error creating root snapshot node";
+               ret = bch2_fs_initialize_subvolumes(c);
+               if (ret)
+                       goto err;
+       }
+
+       bch_verbose(c, "reading snapshots table");
+       err = "error reading snapshots table";
+       ret = bch2_fs_snapshots_start(c);
+       if (ret)
+               goto err;
+       bch_verbose(c, "reading snapshots done");
+
        if (c->opts.fsck) {
                bool metadata_only = c->opts.norecovery;
 
@@ -1262,20 +1279,6 @@ use_clean:
 
                set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
 
-               if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
-                       err = "error creating root snapshot node";
-                       ret = bch2_fs_initialize_subvolumes(c);
-                       if (ret)
-                               goto err;
-               }
-
-               bch_verbose(c, "reading snapshots table");
-               err = "error reading snapshots table";
-               ret = bch2_fs_snapshots_start(c);
-               if (ret)
-                       goto err;
-               bch_verbose(c, "reading snapshots done");
-
                set_bit(BCH_FS_MAY_GO_RW, &c->flags);
 
                bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr);
@@ -1343,20 +1346,6 @@ use_clean:
                if (c->opts.norecovery)
                        goto out;
 
-               if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
-                       err = "error creating root snapshot node";
-                       ret = bch2_fs_initialize_subvolumes(c);
-                       if (ret)
-                               goto err;
-               }
-
-               bch_verbose(c, "reading snapshots table");
-               err = "error reading snapshots table";
-               ret = bch2_fs_snapshots_start(c);
-               if (ret)
-                       goto err;
-               bch_verbose(c, "reading snapshots done");
-
                set_bit(BCH_FS_MAY_GO_RW, &c->flags);
 
                bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
@@ -1632,6 +1621,6 @@ int bch2_fs_initialize(struct bch_fs *c)
 
        return 0;
 err:
-       pr_err("Error initializing new filesystem: %s (%i)", err, ret);
+       pr_err("Error initializing new filesystem: %s (%s)", err, bch2_err_str(ret));
        return ret;
 }
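
The read_btree_roots() change above widens the old `i == BTREE_ID_alloc` test to btree_id_is_alloc(), so a missing or unreadable root on any alloc-related btree clears BCH_COMPAT_alloc_info and forces allocation info to be rebuilt. The helper itself lives in bcachefs.h rather than in this diff; a sketch of its likely shape (the exact set of btrees is an assumption):

    static inline bool btree_id_is_alloc(enum btree_id id)
    {
            /* illustrative -- see bcachefs.h for the authoritative list */
            switch (id) {
            case BTREE_ID_alloc:
            case BTREE_ID_backpointers:
            case BTREE_ID_need_discard:
            case BTREE_ID_freespace:
                    return true;
            default:
                    return false;
            }
    }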
index e89a9a1abe9f723d65eb4b52b6974b79fb33d165..87446f7bad4f81d2b0a71c60e32e20bf35dfb4a6 100644 (file)
@@ -26,7 +26,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
 /* reflink pointers */
 
 int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          int rw, struct printbuf *err)
+                          unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
 
@@ -78,7 +78,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 /* indirect extents */
 
 int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          int rw, struct printbuf *err)
+                          unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
 
@@ -88,7 +88,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
-       return bch2_bkey_ptrs_invalid(c, k, rw, err);
+       return bch2_bkey_ptrs_invalid(c, k, flags, err);
 }
 
 void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
@@ -131,7 +131,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans,
 /* indirect inline data */
 
 int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                     int rw, struct printbuf *err)
+                                     unsigned flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) {
                prt_printf(err, "incorrect value size (%zu < %zu)",
@@ -282,7 +282,7 @@ s64 bch2_remap_range(struct bch_fs *c,
        u32 dst_snapshot, src_snapshot;
        int ret = 0, ret2 = 0;
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
                return -BCH_ERR_erofs_no_writes;
 
        bch2_check_set_feature(c, BCH_FEATURE_reflink);
@@ -416,7 +416,7 @@ s64 bch2_remap_range(struct bch_fs *c,
        bch2_bkey_buf_exit(&new_src, c);
        bch2_bkey_buf_exit(&new_dst, c);
 
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
 
        return dst_done ?: ret ?: ret2;
 }
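
These reflink hunks are one instance of a pattern repeated throughout this update: bare percpu_ref_tryget_live()/percpu_ref_put() on c->writes become bch2_write_ref_tryget()/bch2_write_ref_put() with a named ref (BCH_WRITE_REF_reflink here), so that a BCH_WRITE_REF_DEBUG build can count each subsystem's outstanding write references separately. The helpers are defined in bcachefs.h, outside this diff; a minimal sketch of the tryget side, assuming the debug build keeps one atomic_long per named ref as the super.c hunks below suggest:

    static inline bool bch2_write_ref_tryget(struct bch_fs *c,
                                             enum bch_write_ref ref)
    {
    #ifdef BCH_WRITE_REF_DEBUG
            /* per-ref counters; zero while the fs is read-only */
            return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
                    atomic_long_inc_not_zero(&c->writes[ref]);
    #else
            return percpu_ref_tryget_live(&c->writes);
    #endif
    }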
index ce0012aa99c6a506a76aeb3132ceff05b6106e32..2391037c2ece6ca5ad2ec1ac4c3f69bf7f352037 100644 (file)
@@ -3,7 +3,7 @@
 #define _BCACHEFS_REFLINK_H
 
 int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
-                          int, struct printbuf *);
+                          unsigned, struct printbuf *);
 void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -17,7 +17,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 })
 
 int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
-                          int, struct printbuf *);
+                          unsigned, struct printbuf *);
 void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
@@ -32,7 +32,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
 })
 
 int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
-                                     int, struct printbuf *);
+                                     unsigned, struct printbuf *);
 void bch2_indirect_inline_data_to_text(struct printbuf *,
                                struct bch_fs *, struct bkey_s_c);
 int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
index 3bff21959d986d388043e978d1525888702b2be8..482bedf4be8ba2cd696dd5afae09230281ae8aa0 100644 (file)
@@ -299,6 +299,13 @@ static int replicas_table_update(struct bch_fs *c,
 
        memset(new_usage, 0, sizeof(new_usage));
 
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+                                       sizeof(u64), GFP_KERNEL)))
+                       goto err;
+
+       memset(new_usage, 0, sizeof(new_usage));
+
        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
                                        sizeof(u64), GFP_KERNEL)))
index cc34b3809206fb1f5666ba41ad42b7958a39e307..4887675a86f09c7a3942f3eae33d76179fe3c7bc 100644 (file)
@@ -27,22 +27,6 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
 int bch2_mark_replicas(struct bch_fs *,
                       struct bch_replicas_entry *);
 
-struct replicas_delta {
-       s64                     delta;
-       struct bch_replicas_entry r;
-} __packed;
-
-struct replicas_delta_list {
-       unsigned                size;
-       unsigned                used;
-
-       struct                  {} memset_start;
-       u64                     nr_inodes;
-       u64                     persistent_reserved[BCH_REPLICAS_MAX];
-       struct                  {} memset_end;
-       struct replicas_delta   d[0];
-};
-
 static inline struct replicas_delta *
 replicas_delta_next(struct replicas_delta *d)
 {
index f12a35b3dbcf3b170dd66760e5b7944fad7fb88f..5cfff489bbc34860e9e2a833617f9298653b255a 100644 (file)
@@ -8,4 +8,20 @@ struct bch_replicas_cpu {
        struct bch_replicas_entry *entries;
 };
 
+struct replicas_delta {
+       s64                     delta;
+       struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+       unsigned                size;
+       unsigned                used;
+
+       struct                  {} memset_start;
+       u64                     nr_inodes;
+       u64                     persistent_reserved[BCH_REPLICAS_MAX];
+       struct                  {} memset_end;
+       struct replicas_delta   d[0];
+};
+
 #endif /* _BCACHEFS_REPLICAS_TYPES_H */
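
Moving replicas_delta_list here keeps its zero-size memset_start/memset_end markers with the type they describe: they bracket the accounting fields that are wiped when a delta list is reset, so new fields can be added without touching the reset code. A sketch of the idiom (the helper name is illustrative, not part of this patch):

    static void replicas_delta_list_reset(struct replicas_delta_list *l)
    {
            /* clear everything between the two empty marker members: */
            memset(&l->memset_start, 0,
                   (void *) &l->memset_end - (void *) &l->memset_start);
            l->used = 0;
    }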
index d090a74bd05286c467af5f063cacdac8e77d7598..1805c8542d65381605a5506a5c82554102524587 100644 (file)
@@ -25,7 +25,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         int rw, struct printbuf *err)
+                         unsigned flags, struct printbuf *err)
 {
        struct bkey_s_c_snapshot s;
        u32 i, id;
@@ -706,16 +706,14 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
        struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
 
        bch2_delete_dead_snapshots(c);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
 }
 
 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
 {
-       if (!percpu_ref_tryget_live(&c->writes))
-               return;
-
-       if (!queue_work(system_long_wq, &c->snapshot_delete_work))
-               percpu_ref_put(&c->writes);
+       if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
+           !queue_work(system_long_wq, &c->snapshot_delete_work))
+               bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
 }
 
 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
@@ -735,7 +733,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
 /* Subvolumes: */
 
 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          int rw, struct printbuf *err)
+                          unsigned flags, struct printbuf *err)
 {
        if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
            bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
@@ -900,7 +898,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
                darray_exit(&s);
        }
 
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
 }
 
 struct subvolume_unlink_hook {
@@ -923,11 +921,11 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       if (unlikely(!percpu_ref_tryget_live(&c->writes)))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
                return -EROFS;
 
        if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
-               percpu_ref_put(&c->writes);
+               bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
        return 0;
 }
 
index c694c1c24483beeebb6f5534be511a21d7a3ba3e..b6740eab78d3d588f0813e9a2433d45196d7271c 100644 (file)
@@ -7,7 +7,7 @@
 
 void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
-                         int rw, struct printbuf *);
+                         unsigned, struct printbuf *);
 
 #define bch2_bkey_ops_snapshot ((struct bkey_ops) {            \
        .key_invalid    = bch2_snapshot_invalid,                \
@@ -106,7 +106,7 @@ void bch2_fs_snapshots_exit(struct bch_fs *);
 int bch2_fs_snapshots_start(struct bch_fs *);
 
 int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
-                          int rw, struct printbuf *);
+                          unsigned, struct printbuf *);
 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_subvolume ((struct bkey_ops) {           \
index 738b68b5d35cbfad6460d4bf78f914b0cb383ffd..00c1f69bbe346e0b135b720671d93b91ad421b21 100644 (file)
@@ -20,7 +20,6 @@
 #include "counters.h"
 
 #include <linux/backing-dev.h>
-#include <linux/pretty-printers.h>
 #include <linux/sort.h>
 
 #include <trace/events/bcachefs.h>
@@ -1261,7 +1260,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 
                u->entry.type   = BCH_JSET_ENTRY_data_usage;
                u->v            = cpu_to_le64(c->usage_base->replicas[i]);
-               memcpy(&u->r, e, replicas_entry_bytes(e));
+               unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+                             "embedded variable length struct");
        }
 
        for_each_member_device(ca, c, dev) {
index 95c16f70512f1c06bb849ce65b3dac7144eb5000..08bfed1bbf88e4c15dbbb48b885c3b23af9fefe3 100644 (file)
@@ -55,7 +55,6 @@
 #include <linux/idr.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
-#include <linux/pretty-printers.h>
 #include <linux/random.h>
 #include <linux/sysfs.h>
 #include <crypto/hash.h>
@@ -110,7 +109,7 @@ static struct kset *bcachefs_kset;
 static LIST_HEAD(bch_fs_list);
 static DEFINE_MUTEX(bch_fs_list_lock);
 
-static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
 
 static void bch2_dev_free(struct bch_dev *);
 static int bch2_dev_alloc(struct bch_fs *, unsigned);
@@ -238,13 +237,15 @@ static void __bch2_fs_read_only(struct bch_fs *c)
                bch2_dev_allocator_remove(c, ca);
 }
 
+#ifndef BCH_WRITE_REF_DEBUG
 static void bch2_writes_disabled(struct percpu_ref *writes)
 {
        struct bch_fs *c = container_of(writes, struct bch_fs, writes);
 
        set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
-       wake_up(&bch_read_only_wait);
+       wake_up(&bch2_read_only_wait);
 }
+#endif
 
 void bch2_fs_read_only(struct bch_fs *c)
 {
@@ -259,9 +260,13 @@ void bch2_fs_read_only(struct bch_fs *c)
         * Block new foreground-end write operations from starting - any new
         * writes will return -EROFS:
         */
+       set_bit(BCH_FS_GOING_RO, &c->flags);
+#ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_kill(&c->writes);
-
-       cancel_work_sync(&c->ec_stripe_delete_work);
+#else
+       for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+               bch2_write_ref_put(c, i);
+#endif
 
        /*
         * If we're not doing an emergency shutdown, we want to wait on
@@ -274,16 +279,17 @@ void bch2_fs_read_only(struct bch_fs *c)
         * we do need to wait on them before returning and signalling
         * that going RO is complete:
         */
-       wait_event(bch_read_only_wait,
+       wait_event(bch2_read_only_wait,
                   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
                   test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
 
        __bch2_fs_read_only(c);
 
-       wait_event(bch_read_only_wait,
+       wait_event(bch2_read_only_wait,
                   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
 
        clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+       clear_bit(BCH_FS_GOING_RO, &c->flags);
 
        if (!bch2_journal_error(&c->journal) &&
            !test_bit(BCH_FS_ERROR, &c->flags) &&
@@ -320,7 +326,7 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
        bch2_journal_halt(&c->journal);
        bch2_fs_read_only_async(c);
 
-       wake_up(&bch_read_only_wait);
+       wake_up(&bch2_read_only_wait);
        return ret;
 }
 
@@ -392,20 +398,26 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                return ret;
        }
 
-       schedule_work(&c->ec_stripe_delete_work);
-
-       bch2_do_discards(c);
-       bch2_do_invalidates(c);
-
        if (!early) {
                ret = bch2_fs_read_write_late(c);
                if (ret)
                        goto err;
        }
 
+#ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_reinit(&c->writes);
+#else
+       for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+               BUG_ON(atomic_long_read(&c->writes[i]));
+               atomic_long_inc(&c->writes[i]);
+       }
+#endif
        set_bit(BCH_FS_RW, &c->flags);
        set_bit(BCH_FS_WAS_RW, &c->flags);
+
+       bch2_do_discards(c);
+       bch2_do_invalidates(c);
+       bch2_do_stripe_deletes(c);
        return 0;
 err:
        __bch2_fs_read_only(c);
@@ -454,19 +466,21 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_journal_keys_free(&c->journal_keys);
        bch2_journal_entries_free(c);
        percpu_free_rwsem(&c->mark_lock);
+       free_percpu(c->online_reserved);
 
        if (c->btree_paths_bufs)
                for_each_possible_cpu(cpu)
                        kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
 
-       free_percpu(c->online_reserved);
        free_percpu(c->btree_paths_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
        mempool_exit(&c->fill_iter);
+#ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_exit(&c->writes);
+#endif
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
        kfree(c->unused_inode_hints);
@@ -695,6 +709,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        seqcount_init(&c->usage_lock);
 
+       sema_init(&c->io_in_flight, 128);
+
        c->copy_gc_enabled              = 1;
        c->rebalance.enabled            = 1;
        c->promote_whole_extents        = true;
@@ -743,9 +759,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        bch2_opts_apply(&c->opts, opts);
 
-       /* key cache currently disabled for inodes, because of snapshots: */
-       c->opts.inodes_use_key_cache = 0;
-
        c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
        if (c->opts.inodes_use_key_cache)
                c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
@@ -766,23 +779,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
 
        if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+                               WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
            !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+                               WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
            !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
                                WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+#ifndef BCH_WRITE_REF_DEBUG
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+#endif
            mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
            bioset_init(&c->btree_bio, 1,
                        max(offsetof(struct btree_read_bio, bio),
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
-           !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
+           !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
@@ -850,9 +865,12 @@ static void print_mount_opts(struct bch_fs *c)
        struct printbuf p = PRINTBUF;
        bool first = true;
 
+       prt_printf(&p, "mounted version=%s", bch2_metadata_versions[c->sb.version]);
+
        if (c->opts.read_only) {
-               prt_printf(&p, "ro");
+               prt_str(&p, " opts=");
                first = false;
+               prt_printf(&p, "ro");
        }
 
        for (i = 0; i < bch2_opts_nr; i++) {
@@ -865,16 +883,12 @@ static void print_mount_opts(struct bch_fs *c)
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;
 
-               if (!first)
-                       prt_printf(&p, ",");
+               prt_str(&p, first ? " opts=" : ",");
                first = false;
                bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
        }
 
-       if (!p.pos)
-               prt_printf(&p, "(null)");
-
-       bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf);
+       bch_info(c, "%s", p.buf);
        printbuf_exit(&p);
 }
 
@@ -1955,5 +1969,8 @@ err:
 BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
+unsigned bch2_metadata_version = bcachefs_metadata_version_current;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
 module_exit(bcachefs_exit);
 module_init(bcachefs_init);
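
The new `version` parameter is registered with mode 0400: the current metadata version is exposed root-readable at /sys/module/bcachefs/parameters/version and cannot be changed through sysfs, though like any module_param it can still be supplied at module load time.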
index 3c83e9b9cb7b6b50248a441945aee545d683cf4d..d4e939c808faf5adb95a4edf24da4b271880b0d2 100644 (file)
@@ -251,7 +251,8 @@ int bch2_fs_read_write_early(struct bch_fs *);
  */
 static inline void bch2_fs_lazy_rw(struct bch_fs *c)
 {
-       if (percpu_ref_is_zero(&c->writes))
+       if (!test_bit(BCH_FS_RW, &c->flags) &&
+           !test_bit(BCH_FS_WAS_RW, &c->flags))
                bch2_fs_read_write_early(c);
 }
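
bch2_fs_lazy_rw() can no longer test percpu_ref_is_zero(&c->writes), since under BCH_WRITE_REF_DEBUG c->writes is an array of plain atomic_longs rather than a percpu ref; checking the BCH_FS_RW/BCH_FS_WAS_RW flags covers both build variants.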
 
index 6e49cf98f60d5da80b7626e95079f0121a5d0aca..ebd10cd52b9ba91dbd3f2ce76418be2aef0053e5 100644 (file)
@@ -35,7 +35,6 @@
 #include "tests.h"
 
 #include <linux/blkdev.h>
-#include <linux/pretty-printers.h>
 #include <linux/sort.h>
 #include <linux/sched/clock.h>
 
@@ -195,8 +194,32 @@ read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
+read_attribute(write_points);
 read_attribute(nocow_lock_table);
 
+#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(write_refs);
+
+const char * const bch2_write_refs[] = {
+#define x(n)   #n,
+       BCH_WRITE_REFS()
+#undef x
+       NULL
+};
+
+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       bch2_printbuf_tabstop_push(out, 24);
+
+       for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
+               prt_str(out, bch2_write_refs[i]);
+               prt_tab(out);
+               prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
+               prt_newline(out);
+       }
+}
+#endif
+
 read_attribute(internal_uuid);
 
 read_attribute(has_data);
@@ -432,6 +455,9 @@ SHOW(bch2_fs)
        if (attr == &sysfs_open_buckets)
                bch2_open_buckets_to_text(out, c);
 
+       if (attr == &sysfs_write_points)
+               bch2_write_points_to_text(out, c);
+
        if (attr == &sysfs_compression_stats)
                bch2_compression_stats_to_text(out, c);
 
@@ -450,6 +476,11 @@ SHOW(bch2_fs)
        if (attr == &sysfs_nocow_lock_table)
                bch2_nocow_locks_to_text(out, &c->nocow_locks);
 
+#ifdef BCH_WRITE_REF_DEBUG
+       if (attr == &sysfs_write_refs)
+               bch2_write_refs_to_text(out, c);
+#endif
+
        return 0;
 }
 
@@ -632,7 +663,11 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_new_stripes,
        &sysfs_stripes_heap,
        &sysfs_open_buckets,
+       &sysfs_write_points,
        &sysfs_nocow_lock_table,
+#ifdef BCH_WRITE_REF_DEBUG
+       &sysfs_write_refs,
+#endif
        &sysfs_io_timers_read,
        &sysfs_io_timers_write,
 
@@ -684,7 +719,7 @@ STORE(bch2_fs_opts_dir)
         * We don't need to take c->writes for correctness, but it eliminates an
         * unsightly error message in the dmesg log when we're RO:
         */
-       if (unlikely(!percpu_ref_tryget_live(&c->writes)))
+       if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
                return -EROFS;
 
        tmp = kstrdup(buf, GFP_KERNEL);
@@ -714,7 +749,7 @@ STORE(bch2_fs_opts_dir)
 
        ret = size;
 err:
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
        return ret;
 }
 SYSFS_OPS(bch2_fs_opts_dir);
index b99a9e421f98b8cb1a61ca12b0cfbcc34a6fcb40..80fce1c954709fb45fd2139442c210902ae72258 100644 (file)
@@ -573,7 +573,7 @@ static u64 test_rand(void)
 {
        u64 v;
 
-       prandom_bytes(&v, sizeof(v));
+       get_random_bytes(&v, sizeof(v));
        return v;
 }
 
index bb8a495e2290a847d222e9cc0f91d87830bc7bea..9939bf2a8f6caf06a3874e5690871a7b9728b298 100644 (file)
@@ -240,12 +240,12 @@ bool bch2_is_zero(const void *_p, size_t n)
        return true;
 }
 
-static void bch2_quantiles_update(struct quantiles *q, u64 v)
+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
 {
        unsigned i = 0;
 
        while (i < ARRAY_SIZE(q->entries)) {
-               struct quantile_entry *e = q->entries + i;
+               struct bch2_quantile_entry *e = q->entries + i;
 
                if (unlikely(!e->step)) {
                        e->m = v;
@@ -292,7 +292,6 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
                if (!*p)
                        break;
                lines = p + 1;
-               prefix = KERN_CONT;
        }
        console_unlock();
 }
@@ -301,11 +300,9 @@ int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task)
 {
        unsigned long entries[32];
        unsigned i, nr_entries;
-       int ret;
 
-       ret = down_read_killable(&task->signal->exec_update_lock);
-       if (ret)
-               return ret;
+       if (!down_read_trylock(&task->signal->exec_update_lock))
+               return 0;
 
        nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
        for (i = 0; i < nr_entries; i++) {
@@ -319,7 +316,8 @@ int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task)
 
 /* time stats: */
 
-static inline void bch2_time_stats_update_one(struct time_stats *stats,
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
                                              u64 start, u64 end)
 {
        u64 duration, freq;
@@ -348,10 +346,10 @@ static inline void bch2_time_stats_update_one(struct time_stats *stats,
        }
 }
 
-static noinline void bch2_time_stats_clear_buffer(struct time_stats *stats,
-                                                 struct time_stat_buffer *b)
+static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+                                                 struct bch2_time_stat_buffer *b)
 {
-       struct time_stat_buffer_entry *i;
+       struct bch2_time_stat_buffer_entry *i;
        unsigned long flags;
 
        spin_lock_irqsave(&stats->lock, flags);
@@ -364,7 +362,7 @@ static noinline void bch2_time_stats_clear_buffer(struct time_stats *stats,
        b->nr = 0;
 }
 
-void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
 {
        unsigned long flags;
 
@@ -379,17 +377,17 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
                if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
                    stats->duration_stats.n > 1024)
                        stats->buffer =
-                               alloc_percpu_gfp(struct time_stat_buffer,
+                               alloc_percpu_gfp(struct bch2_time_stat_buffer,
                                                 GFP_ATOMIC);
                spin_unlock_irqrestore(&stats->lock, flags);
        } else {
-               struct time_stat_buffer *b;
+               struct bch2_time_stat_buffer *b;
 
                preempt_disable();
                b = this_cpu_ptr(stats->buffer);
 
                BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-               b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+               b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
                        .start = start,
                        .end = end
                };
@@ -399,6 +397,7 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
                preempt_enable();
        }
 }
+#endif
 
 static const struct time_unit {
        const char      *name;
@@ -426,7 +425,14 @@ static const struct time_unit *pick_time_units(u64 ns)
        return u;
 }
 
-static void pr_time_units(struct printbuf *out, u64 ns)
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+       const struct time_unit *u = pick_time_units(ns);
+
+       prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
        const struct time_unit *u = pick_time_units(ns);
 
@@ -441,11 +447,11 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
 {
        prt_str(out, name);
        prt_tab(out);
-       pr_time_units(out, ns);
+       bch2_pr_time_units_aligned(out, ns);
        prt_newline(out);
 }
 
-void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
 {
        const struct time_unit *u;
        s64 f_mean = 0, d_mean = 0;
@@ -499,16 +505,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 
        prt_printf(out, "mean:");
        prt_tab(out);
-       pr_time_units(out, d_mean);
+       bch2_pr_time_units_aligned(out, d_mean);
        prt_tab(out);
-       pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
        prt_newline(out);
 
        prt_printf(out, "stddev:");
        prt_tab(out);
-       pr_time_units(out, d_stddev);
+       bch2_pr_time_units_aligned(out, d_stddev);
        prt_tab(out);
-       pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
 
        printbuf_indent_sub(out, 2);
        prt_newline(out);
@@ -522,16 +528,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 
        prt_printf(out, "mean:");
        prt_tab(out);
-       pr_time_units(out, f_mean);
+       bch2_pr_time_units_aligned(out, f_mean);
        prt_tab(out);
-       pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
        prt_newline(out);
 
        prt_printf(out, "stddev:");
        prt_tab(out);
-       pr_time_units(out, f_stddev);
+       bch2_pr_time_units_aligned(out, f_stddev);
        prt_tab(out);
-       pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
 
        printbuf_indent_sub(out, 2);
        prt_newline(out);
@@ -554,12 +560,12 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
        }
 }
 
-void bch2_time_stats_exit(struct time_stats *stats)
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
 {
        free_percpu(stats->buffer);
 }
 
-void bch2_time_stats_init(struct time_stats *stats)
+void bch2_time_stats_init(struct bch2_time_stats *stats)
 {
        memset(stats, 0, sizeof(*stats));
        stats->duration_stats_weighted.w = 8;
index 473c96968121b3c13f6990755b7b25778a786bcd..09e272932dff388b2962f853898f6f3786a34338 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
-#include <linux/printbuf.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/ratelimit.h>
@@ -215,6 +214,34 @@ do {                                                                       \
 #define ANYSINT_MAX(t)                                                 \
        ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
 
+#include "printbuf.h"
+
+#define prt_vprintf(_out, ...)         bch2_prt_vprintf(_out, __VA_ARGS__)
+#define prt_printf(_out, ...)          bch2_prt_printf(_out, __VA_ARGS__)
+#define printbuf_str(_buf)             bch2_printbuf_str(_buf)
+#define printbuf_exit(_buf)            bch2_printbuf_exit(_buf)
+
+#define printbuf_tabstops_reset(_buf)  bch2_printbuf_tabstops_reset(_buf)
+#define printbuf_tabstop_pop(_buf)     bch2_printbuf_tabstop_pop(_buf)
+#define printbuf_tabstop_push(_buf, _n)        bch2_printbuf_tabstop_push(_buf, _n)
+
+#define printbuf_indent_add(_out, _n)  bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_sub(_out, _n)  bch2_printbuf_indent_sub(_out, _n)
+
+#define prt_newline(_out)              bch2_prt_newline(_out)
+#define prt_tab(_out)                  bch2_prt_tab(_out)
+#define prt_tab_rjust(_out)            bch2_prt_tab_rjust(_out)
+
+#define prt_bytes_indented(...)                bch2_prt_bytes_indented(__VA_ARGS__)
+#define prt_u64(_out, _v)              prt_printf(_out, "%llu", _v)
+#define prt_human_readable_u64(...)    bch2_prt_human_readable_u64(__VA_ARGS__)
+#define prt_human_readable_s64(...)    bch2_prt_human_readable_s64(__VA_ARGS__)
+#define prt_units_u64(...)             bch2_prt_units_u64(__VA_ARGS__)
+#define prt_units_s64(...)             bch2_prt_units_s64(__VA_ARGS__)
+#define prt_string_option(...)         bch2_prt_string_option(__VA_ARGS__)
+#define prt_bitflags(...)              bch2_prt_bitflags(__VA_ARGS__)
+
+void bch2_pr_time_units(struct printbuf *, u64);
 
 #ifdef __KERNEL__
 static inline void pr_time(struct printbuf *out, u64 time)
@@ -340,22 +367,22 @@ int bch2_prt_backtrace(struct printbuf *, struct task_struct *);
 #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
 #define QUANTILE_LAST  eytzinger0_last(NR_QUANTILES)
 
-struct quantiles {
-       struct quantile_entry {
+struct bch2_quantiles {
+       struct bch2_quantile_entry {
                u64     m;
                u64     step;
        }               entries[NR_QUANTILES];
 };
 
-struct time_stat_buffer {
+struct bch2_time_stat_buffer {
        unsigned        nr;
-       struct time_stat_buffer_entry {
+       struct bch2_time_stat_buffer_entry {
                u64     start;
                u64     end;
        }               entries[32];
 };
 
-struct time_stats {
+struct bch2_time_stats {
        spinlock_t      lock;
        /* all fields are in nanoseconds */
        u64             max_duration;
@@ -363,26 +390,30 @@ struct time_stats {
        u64             max_freq;
        u64             min_freq;
        u64             last_event;
-       struct quantiles quantiles;
+       struct bch2_quantiles quantiles;
 
        struct mean_and_variance          duration_stats;
        struct mean_and_variance_weighted duration_stats_weighted;
        struct mean_and_variance          freq_stats;
        struct mean_and_variance_weighted freq_stats_weighted;
-       struct time_stat_buffer __percpu *buffer;
+       struct bch2_time_stat_buffer __percpu *buffer;
 };
 
-void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+#endif
 
-static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
 {
        __bch2_time_stats_update(stats, start, local_clock());
 }
 
-void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
 
-void bch2_time_stats_exit(struct time_stats *);
-void bch2_time_stats_init(struct time_stats *);
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
 
 #define ewma_add(ewma, val, weight)                                    \
 ({                                                                     \
@@ -582,6 +613,20 @@ static inline void memmove_u64s_down(void *dst, const void *src,
        __memmove_u64s_down(dst, src, u64s);
 }
 
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+                                      unsigned u64s)
+{
+       memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+                                    unsigned u64s)
+{
+       EBUG_ON(dst > src);
+
+       __memmove_u64s_down_small(dst, src, u64s);
+}
+
 static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
                                           unsigned u64s)
 {
index b5022a8b38c79bab1d9cf8b28ff9c7e94b32ede5..9f77bb2ecf5fe741aae306f3dee90efc94dc17b1 100644 (file)
@@ -70,7 +70,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
 };
 
 int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                      int rw, struct printbuf *err)
+                      unsigned flags, struct printbuf *err)
 {
        const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
index 03f1b73fc926290a90fe9b8af13274c3b88c1cde..1a4cff3a9d962aaa994a0ce554a3c2fdb4e4fab0 100644 (file)
@@ -6,7 +6,7 @@
 
 extern const struct bch_hash_desc bch2_xattr_hash_desc;
 
-int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_xattr ((struct bkey_ops) {       \
index 54cd6e9cc2d34ab5b088ad1c613a694deaf638a4..0a5cedfea3c037140fcf4040dc66cf435c1cee73 100644 (file)
@@ -184,7 +184,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
        if (buffered_fd < 0)
                return ERR_PTR(-errno);
 
-       fd = open(path, flags|O_DIRECT);
+       fd = open(path, flags);
        if (fd < 0)
                fd = dup(buffered_fd);
        if (fd < 0) {
@@ -192,7 +192,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                return ERR_PTR(-errno);
        }
 
-       sync_fd = open(path, flags|O_DIRECT|O_SYNC);
+       sync_fd = open(path, flags|O_SYNC);
        if (sync_fd < 0)
                sync_fd = open(path, flags|O_SYNC);
        if (sync_fd < 0) {
index 55d46c9dc0382519af07bb6c70bc7c73c6d0d645..bd08da5f9e70c0c108fcb99932b82fb82f52df5d 100644 (file)
@@ -42,8 +42,6 @@
 #include <linux/math64.h>
 #include <linux/mean_and_variance.h>
 #include <linux/module.h>
-#include <linux/printbuf.h>
-
 
 /**
  * fast_divpow2() - fast approximation for n / (1 << d)
diff --git a/linux/pretty-printers.c b/linux/pretty-printers.c
deleted file mode 100644 (file)
index addbac9..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1+
-/* Copyright (C) 2022 Kent Overstreet */
-
-#include <linux/bitops.h>
-#include <linux/kernel.h>
-#include <linux/printbuf.h>
-#include <linux/pretty-printers.h>
-
-/**
- * prt_string_option - Given a list of strings, print out the list and indicate
- * which option is selected, with square brackets (sysfs style)
- *
- * @out: The printbuf to output to
- * @list: List of strings to choose from
- * @selected: The option to highlight, with square brackets
- */
-void prt_string_option(struct printbuf *out,
-                      const char * const list[],
-                      size_t selected)
-{
-       size_t i;
-
-       for (i = 0; list[i]; i++) {
-               if (i)
-                       prt_char(out, ' ');
-               if (i == selected)
-                       prt_char(out, '[');
-               prt_str(out, list[i]);
-               if (i == selected)
-                       prt_char(out, ']');
-       }
-}
-EXPORT_SYMBOL(prt_string_option);
-
-/**
- * prt_bitflags: Given a bitmap and a list of names for each bit, print out which
- * bits are on, comma separated
- *
- * @out: The printbuf to output to
- * @list: List of names for each bit
- * @flags: Bits to print
- */
-void prt_bitflags(struct printbuf *out,
-                 const char * const list[], u64 flags)
-{
-       unsigned bit, nr = 0;
-       bool first = true;
-
-       while (list[nr])
-               nr++;
-
-       while (flags && (bit = __ffs(flags)) < nr) {
-               if (!first)
-                       prt_char(out, ',');
-               first = false;
-               prt_str(out, list[bit]);
-               flags ^= 1 << bit;
-       }
-}
-EXPORT_SYMBOL(prt_bitflags);
diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c
deleted file mode 100644 (file)
index 0ae56ee..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-
-#include <stdio.h>
-#include <linux/printbuf.h>
-
-void prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
-{
-       int len;
-
-       do {
-               va_list args2;
-
-               va_copy(args2, args);
-               len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
-       } while (len + 1 >= printbuf_remaining(out) &&
-                !printbuf_make_room(out, len + 1));
-
-       len = min_t(size_t, len,
-                 printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
-       out->pos += len;
-}
-
-void prt_printf(struct printbuf *out, const char *fmt, ...)
-{
-       va_list args;
-
-       va_start(args, fmt);
-       prt_vprintf(out, fmt, args);
-       va_end(args);
-}
-
-void prt_u64(struct printbuf *out, u64 v)
-{
-       prt_printf(out, "%llu", v);
-}
diff --git a/linux/seq_buf.c b/linux/seq_buf.c
new file mode 100644 (file)
index 0000000..cf8709a
--- /dev/null
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * seq_buf.c
+ *
+ * Copyright (C) 2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The seq_buf is a handy tool that allows you to pass a descriptor around
+ * to a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the seq_buf must be initialized with seq_buf_init().
+ * This will set up the counters within the descriptor. You can call
+ * seq_buf_init() more than once to reset the seq_buf to start
+ * from scratch.
+ */
+#include <linux/seq_buf.h>
+#include <stdio.h>
+
+/**
+ * seq_buf_can_fit - can the new data fit in the current buffer?
+ * @s: the seq_buf descriptor
+ * @len: The length to see if it can fit in the current buffer
+ *
+ * Returns true if there's enough unused space in the seq_buf buffer
+ * to fit the amount of new data according to @len.
+ */
+static bool seq_buf_can_fit(struct seq_buf *s, size_t len)
+{
+       return s->len + len <= s->size;
+}
+
+/**
+ * seq_buf_vprintf - sequence printing of information.
+ * @s: seq_buf descriptor
+ * @fmt: printf format string
+ * @args: va_list of arguments from a printf() type function
+ *
+ * Writes a vsnprintf() format into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow.
+ */
+int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args)
+{
+       int len;
+
+       WARN_ON(s->size == 0);
+
+       if (s->len < s->size) {
+               len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args);
+               if (s->len + len < s->size) {
+                       s->len += len;
+                       return 0;
+               }
+       }
+       seq_buf_set_overflow(s);
+       return -1;
+}
+
+/**
+ * seq_buf_printf - sequence printing of information
+ * @s: seq_buf descriptor
+ * @fmt: printf format string
+ *
+ * Writes a printf() format into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow.
+ */
+int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+
+       va_start(ap, fmt);
+       ret = seq_buf_vprintf(s, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
+
+/**
+ * seq_buf_puts - sequence printing of simple string
+ * @s: seq_buf descriptor
+ * @str: simple string to record
+ *
+ * Copy a simple string into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow
+ */
+int seq_buf_puts(struct seq_buf *s, const char *str)
+{
+       size_t len = strlen(str);
+
+       WARN_ON(s->size == 0);
+
+       /* Add 1 to len for the trailing null byte which must be there */
+       len += 1;
+
+       if (seq_buf_can_fit(s, len)) {
+               memcpy(s->buffer + s->len, str, len);
+               /* Don't count the trailing null byte against the capacity */
+               s->len += len - 1;
+               return 0;
+       }
+       seq_buf_set_overflow(s);
+       return -1;
+}
+
+/**
+ * seq_buf_putc - sequence printing of simple character
+ * @s: seq_buf descriptor
+ * @c: simple character to record
+ *
+ * Copy a single character into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow
+ */
+int seq_buf_putc(struct seq_buf *s, unsigned char c)
+{
+       WARN_ON(s->size == 0);
+
+       if (seq_buf_can_fit(s, 1)) {
+               s->buffer[s->len++] = c;
+               return 0;
+       }
+       seq_buf_set_overflow(s);
+       return -1;
+}
+
+/**
+ * seq_buf_putmem - write raw data into the sequence buffer
+ * @s: seq_buf descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ *
+ * Returns zero on success, -1 on overflow
+ */
+int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len)
+{
+       WARN_ON(s->size == 0);
+
+       if (seq_buf_can_fit(s, len)) {
+               memcpy(s->buffer + s->len, mem, len);
+               s->len += len;
+               return 0;
+       }
+       seq_buf_set_overflow(s);
+       return -1;
+}
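
A seq_buf is just a caller-supplied buffer plus size/len counters, so the helpers above need no allocation; a minimal usage sketch, assuming the usual seq_buf_init(s, buf, size) declaration from the new include/linux/seq_buf.h:

    char buf[64];
    struct seq_buf s;

    seq_buf_init(&s, buf, sizeof(buf));
    seq_buf_printf(&s, "%u entries", 3);
    seq_buf_puts(&s, " (ok)");      /* copies the NUL, counts only the text */
    /* all helpers return 0 on success, -1 once the buffer has overflowed */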
index 39a9bd6ecd78fdd966a278bf75bc12665eebef21..41337a7faeb9710b95cac7a97a9805fac278b9c6 100644 (file)
 #include <linux/six.h>
 #include <linux/slab.h>
 
+#include <trace/events/lock.h>
+
 #ifdef DEBUG
 #define EBUG_ON(cond)          BUG_ON(cond)
 #else
 #define EBUG_ON(cond)          do {} while (0)
 #endif
 
-#define six_acquire(l, t, r)   lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_)
-#define six_release(l)         lock_release(l, _RET_IP_)
+#define six_acquire(l, t, r, ip)       lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip)             lock_release(l, ip)
 
 static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
 
@@ -278,19 +280,20 @@ static bool do_six_trylock_type(struct six_lock *lock,
 }
 
 __always_inline __flatten
-static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type,
+                              unsigned long ip)
 {
        if (!do_six_trylock_type(lock, type, true))
                return false;
 
        if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
+               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
        return true;
 }
 
 __always_inline __flatten
 static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
-                             unsigned seq)
+                             unsigned seq, unsigned long ip)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old;
@@ -321,7 +324,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
                        six_lock_wakeup(lock, old, SIX_LOCK_write);
 
                if (ret)
-                       six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
+                       six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
 
                return ret;
        }
@@ -338,36 +341,48 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
 
        six_set_owner(lock, type, old, current);
        if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
+               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
        return true;
 }
 
-/*
- * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's
- * off for now:
- */
-#ifdef SIX_LOCK_SPIN_ON_OWNER
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
 
-static inline bool six_optimistic_spin(struct six_lock *lock,
-                                      struct six_lock_waiter *wait)
+static inline bool six_can_spin_on_owner(struct six_lock *lock)
 {
-       struct task_struct *owner, *task = current;
+       struct task_struct *owner;
+       bool ret;
 
-       switch (wait->lock_want) {
-       case SIX_LOCK_read:
-               break;
-       case SIX_LOCK_intent:
-               if (lock->wait_list.next != &wait->list)
-                       return false;
-               break;
-       case SIX_LOCK_write:
+       if (need_resched())
                return false;
-       }
 
        rcu_read_lock();
        owner = READ_ONCE(lock->owner);
+       ret = !owner || owner_on_cpu(owner);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static inline void six_set_nospin(struct six_lock *lock)
+{
+       union six_lock_state old, new;
+       u64 v = READ_ONCE(lock->state.v);
+
+       do {
+               new.v = old.v = v;
+               new.nospin = true;
+       } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v);
+}
 
-       while (owner && lock->owner == owner) {
+static inline bool six_spin_on_owner(struct six_lock *lock,
+                                    struct task_struct *owner,
+                                    u64 end_time)
+{
+       bool ret = true;
+       unsigned loop = 0;
+
+       rcu_read_lock();
+       while (lock->owner == owner) {
                /*
                 * Ensure we emit the owner->on_cpu, dereference _after_
                 * checking lock->owner still matches owner. If that fails,
@@ -376,27 +391,94 @@ static inline bool six_optimistic_spin(struct six_lock *lock,
                 */
                barrier();
 
+               if (!owner_on_cpu(owner) || need_resched()) {
+                       ret = false;
+                       break;
+               }
+
+               if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+                       six_set_nospin(lock);
+                       ret = false;
+                       break;
+               }
+
+               cpu_relax();
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+       struct task_struct *task = current;
+       u64 end_time;
+
+       if (type == SIX_LOCK_write)
+               return false;
+
+       preempt_disable();
+       if (!six_can_spin_on_owner(lock))
+               goto fail;
+
+       if (!osq_lock(&lock->osq))
+               goto fail;
+
+       end_time = sched_clock() + 10 * NSEC_PER_USEC;
+
+       while (1) {
+               struct task_struct *owner;
+
+               /*
+                * If there's an owner, wait for it to either
+                * release the lock or go to sleep.
+                */
+               owner = READ_ONCE(lock->owner);
+               if (owner && !six_spin_on_owner(lock, owner, end_time))
+                       break;
+
+               if (do_six_trylock_type(lock, type, false)) {
+                       osq_unlock(&lock->osq);
+                       preempt_enable();
+                       return true;
+               }
+
                /*
-                * If we're an RT task that will live-lock because we won't let
+                * When there's no owner, we might have preempted between the
+                * owner acquiring the lock and setting the owner field. If
+                * owner acquiring the lock and setting the owner field; if
+                * we're an RT task, we could live-lock because we won't let
                 */
-               if (wait->lock_acquired ||
-                   !owner->on_cpu ||
-                   rt_task(task) ||
-                   need_resched())
+               if (!owner && (need_resched() || rt_task(task)))
                        break;
 
+               /*
+                * The cpu_relax() call is a compiler barrier which forces
+                * everything in this loop to be re-loaded. We don't need
+                * memory barriers as we'll eventually observe the right
+                * values at the cost of a few extra spins.
+                */
                cpu_relax();
        }
-       rcu_read_unlock();
 
-       return wait->lock_acquired;
+       osq_unlock(&lock->osq);
+fail:
+       preempt_enable();
+
+       /*
+        * If we fell out of the spin path because of need_resched(),
+        * reschedule now, before we try-lock again. This avoids getting
+        * scheduled out right after we obtained the lock.
+        */
+       if (need_resched())
+               schedule();
+
+       return false;
 }
 
 #else /* CONFIG_LOCK_SPIN_ON_OWNER */
 
-static inline bool six_optimistic_spin(struct six_lock *lock,
-                                      struct six_lock_waiter *wait)
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
 {
        return false;
 }
@@ -406,7 +488,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock,
 noinline
 static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
                                    struct six_lock_waiter *wait,
-                                   six_lock_should_sleep_fn should_sleep_fn, void *p)
+                                   six_lock_should_sleep_fn should_sleep_fn, void *p,
+                                   unsigned long ip)
 {
        union six_lock_state old;
        int ret = 0;
@@ -417,7 +500,11 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                smp_mb__after_atomic();
        }
 
-       lock_contended(&lock->dep_map, _RET_IP_);
+       trace_contention_begin(lock, 0);
+       lock_contended(&lock->dep_map, ip);
+
+       if (six_optimistic_spin(lock, type))
+               goto out;
 
        wait->task              = current;
        wait->lock_want         = type;
@@ -457,9 +544,6 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                ret = 0;
        }
 
-       if (six_optimistic_spin(lock, wait))
-               goto out;
-
        while (1) {
                set_current_state(TASK_UNINTERRUPTIBLE);
 
@@ -488,6 +572,7 @@ out:
                                            &lock->state.counter);
                six_lock_wakeup(lock, old, SIX_LOCK_read);
        }
+       trace_contention_end(lock, 0);
 
        return ret;
 }
@@ -495,33 +580,35 @@ out:
 __always_inline __flatten
 static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
                         struct six_lock_waiter *wait,
-                        six_lock_should_sleep_fn should_sleep_fn, void *p)
+                        six_lock_should_sleep_fn should_sleep_fn, void *p,
+                        unsigned long ip)
 {
        int ret;
 
        wait->start_time = 0;
 
        if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read);
+               six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
 
        ret = do_six_trylock_type(lock, type, true) ? 0
-               : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p);
+               : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip);
 
        if (ret && type != SIX_LOCK_write)
-               six_release(&lock->dep_map);
+               six_release(&lock->dep_map, ip);
        if (!ret)
-               lock_acquired(&lock->dep_map, _RET_IP_);
+               lock_acquired(&lock->dep_map, ip);
 
        return ret;
 }
 
 __always_inline
 static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
-                          six_lock_should_sleep_fn should_sleep_fn, void *p)
+                          six_lock_should_sleep_fn should_sleep_fn, void *p,
+                          unsigned long ip)
 {
        struct six_lock_waiter wait;
 
-       return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p);
+       return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip);
 }
 
 __always_inline __flatten
@@ -540,16 +627,21 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
                smp_mb(); /* between unlocking and checking for waiters */
                state.v = READ_ONCE(lock->state.v);
        } else {
+               u64 v = l[type].unlock_val;
+
+               if (type != SIX_LOCK_read)
+                       v -= lock->state.v & __SIX_VAL(nospin, 1);
+
                EBUG_ON(!(lock->state.v & l[type].held_mask));
-               state.v = atomic64_add_return_release(l[type].unlock_val,
-                                                     &lock->state.counter);
+               state.v = atomic64_add_return_release(v, &lock->state.counter);
        }
 
        six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
 
 __always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
+                             unsigned long ip)
 {
        EBUG_ON(type == SIX_LOCK_write &&
                !(lock->state.v & __SIX_LOCK_HELD_intent));
@@ -558,7 +650,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
                lock->owner != current);
 
        if (type != SIX_LOCK_write)
-               six_release(&lock->dep_map);
+               six_release(&lock->dep_map, ip);
 
        if (type == SIX_LOCK_intent &&
            lock->intent_lock_recurse) {
@@ -570,38 +662,40 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 }
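
A subtlety in do_six_unlock_type() above: for intent and write unlocks, the value added to the state word also subtracts whatever currently sits in the nospin bit (a hint, introduced with the optimistic-spin rework, that tells would-be spinners to go straight to sleep), so the hint is cleared by the same atomic operation that releases the lock. __SIX_VAL() builds these state-word constants from the six_lock_state bitfields; in six.h (not part of this diff) it is the usual union trick:

        #define __SIX_VAL(field, _v)    (((union six_lock_state) { .field = _v }).v)

Since nospin is a single bit, v -= lock->state.v & __SIX_VAL(nospin, 1) clears it when set and is a no-op otherwise.
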
 
 #define __SIX_LOCK(type)                                               \
-bool six_trylock_##type(struct six_lock *lock)                         \
+bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)    \
 {                                                                      \
-       return __six_trylock_type(lock, SIX_LOCK_##type);               \
+       return __six_trylock_type(lock, SIX_LOCK_##type, ip);           \
 }                                                                      \
-EXPORT_SYMBOL_GPL(six_trylock_##type);                                 \
+EXPORT_SYMBOL_GPL(six_trylock_ip_##type);                              \
                                                                        \
-bool six_relock_##type(struct six_lock *lock, u32 seq)                 \
+bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
 {                                                                      \
-       return __six_relock_type(lock, SIX_LOCK_##type, seq);           \
+       return __six_relock_type(lock, SIX_LOCK_##type, seq, ip);       \
 }                                                                      \
-EXPORT_SYMBOL_GPL(six_relock_##type);                                  \
+EXPORT_SYMBOL_GPL(six_relock_ip_##type);                               \
                                                                        \
-int six_lock_##type(struct six_lock *lock,                             \
-                   six_lock_should_sleep_fn should_sleep_fn, void *p)  \
+int six_lock_ip_##type(struct six_lock *lock,                          \
+                   six_lock_should_sleep_fn should_sleep_fn, void *p,  \
+                   unsigned long ip)                                   \
 {                                                                      \
-       return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
+       return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
 }                                                                      \
-EXPORT_SYMBOL_GPL(six_lock_##type);                                    \
+EXPORT_SYMBOL_GPL(six_lock_ip_##type);                                 \
                                                                        \
-int six_lock_waiter_##type(struct six_lock *lock,                      \
+int six_lock_ip_waiter_##type(struct six_lock *lock,                   \
                           struct six_lock_waiter *wait,                \
-                          six_lock_should_sleep_fn should_sleep_fn, void *p)\
+                          six_lock_should_sleep_fn should_sleep_fn, void *p,\
+                          unsigned long ip)                            \
 {                                                                      \
-       return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\
+       return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
 }                                                                      \
-EXPORT_SYMBOL_GPL(six_lock_waiter_##type);                             \
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type);                          \
                                                                        \
-void six_unlock_##type(struct six_lock *lock)                          \
+void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)     \
 {                                                                      \
-       __six_unlock_type(lock, SIX_LOCK_##type);                       \
+       __six_unlock_type(lock, SIX_LOCK_##type, ip);                   \
 }                                                                      \
-EXPORT_SYMBOL_GPL(six_unlock_##type);
+EXPORT_SYMBOL_GPL(six_unlock_ip_##type);
 
 __SIX_LOCK(read)
 __SIX_LOCK(intent)
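
The __SIX_LOCK() template renames each exported entry point from six_*_<type>() to six_*_ip_<type>(), threading an explicit instruction pointer through to lockdep and the tracepoints. The old names presumably survive as inline wrappers in six.h that capture the call site with _THIS_IP_, along these lines:

        static inline bool six_trylock_read(struct six_lock *lock)
        {
                return six_trylock_ip_read(lock, _THIS_IP_);
        }

so existing callers compile unchanged while reported lock events point at the real call sites instead of six.c.
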
@@ -672,7 +766,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
 
-       six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read);
+       six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
 
        /* XXX: assert already locked, and that we don't overflow: */
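
six_lock_increment(), which takes an extra recursive reference on an already-held lock, has no _ip variant, so it keeps recording its direct caller via _RET_IP_. For reference, the two helpers are defined in include/linux/kernel.h as:

        #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
        #define _RET_IP_   (unsigned long)__builtin_return_address(0)
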
 
index 29c498ad9c9c6ca6dcf258dcea0f3f8e14eec0a2..0810ca132bd5c3ab65a918dac5a596b207f41245 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/limits.h>
-#include <linux/printbuf.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/string_helpers.h>