]> git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 50847e296b34 bcachefs: Check subvol <-> inode pointers...
authorKent Overstreet <kent.overstreet@linux.dev>
Tue, 6 Feb 2024 04:09:25 +0000 (23:09 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Tue, 6 Feb 2024 06:07:16 +0000 (01:07 -0500)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
85 files changed:
.bcachefs_revision
Makefile
c_src/cmd_format.c
c_src/cmd_fs.c
c_src/tools-util.h
include/linux/darray.h [moved from libbcachefs/darray.h with 66% similarity]
include/linux/darray_types.h [new file with mode: 0644]
include/linux/eytzinger.h [moved from libbcachefs/eytzinger.h with 78% similarity]
include/linux/mean_and_variance.h [moved from libbcachefs/mean_and_variance.h with 94% similarity]
include/linux/mempool.h
include/linux/spinlock.h
include/linux/spinlock_types.h [new file with mode: 0644]
include/linux/thread_with_file.h [new file with mode: 0644]
include/linux/thread_with_file_types.h [new file with mode: 0644]
include/linux/time.h [new file with mode: 0644]
include/linux/time_stats.h [new file with mode: 0644]
include/linux/types.h
libbcachefs/alloc_foreground.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bset.c
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_journal_iter.c
libbcachefs/btree_journal_iter.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.c
libbcachefs/btree_update_interior.c
libbcachefs/btree_write_buffer_types.h
libbcachefs/buckets.c
libbcachefs/chardev.c
libbcachefs/compress.c
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/ec.c
libbcachefs/errcode.h
libbcachefs/error.c
libbcachefs/fifo.h
libbcachefs/fs-common.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io_read.c
libbcachefs/io_write.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_sb.c
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/nocow_locking.c
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/sb-clean.c
libbcachefs/sb-downgrade.c
libbcachefs/sb-errors_types.h
libbcachefs/sb-members.h
libbcachefs/str_hash.h
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/subvolume_types.h
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/thread_with_file.c [deleted file]
libbcachefs/thread_with_file.h [deleted file]
libbcachefs/thread_with_file_types.h [deleted file]
libbcachefs/util.c
libbcachefs/util.h
linux/darray.c [moved from libbcachefs/darray.c with 68% similarity]
linux/mean_and_variance.c [moved from libbcachefs/mean_and_variance.c with 93% similarity]
linux/mempool.c
linux/sort.c [new file with mode: 0644]
linux/time_stats.c [new file with mode: 0644]
src/wrappers/handle.rs

index 797e13368b786a0eb6037554f2e68b80d2f68c9e..d3c500bc50343ac2b9dce45aa7b5f7a9c93ede57 100644 (file)
@@ -1 +1 @@
-481b5f34324809f47a58ed798d038fb17e5b7b0a
+50847e296b34efabe199e408ec4d72f10a866c39
index 5cdb437cc7f1e2f4dbdbccf929386707890eb183..876efbb7bc32f6dedc0d7804297dd976a7d3e0f1 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -273,11 +273,20 @@ update-bcachefs-sources:
        git add include/linux/kmemleak.h
        cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
        git add linux/int_sqrt.c
-       git rm -f libbcachefs/mean_and_variance_test.c
-#      cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
-#      git add linux/mean_and_variance.c
-#      cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
-#      git add include/linux/mean_and_variance.h
+       cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
+       git add linux/mean_and_variance.c
+       cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
+       git add include/linux/mean_and_variance.h
+       cp $(LINUX_DIR)/lib/time_stats.c linux/
+       git add linux/time_stats.c
+       cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/
+       git add include/linux/time_stats.h
+       cp $(LINUX_DIR)/include/linux/darray.h include/linux/
+       git add include/linux/darray.h
+       cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/
+       git add include/linux/darray_types.h
+       cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/
+       git add include/linux/eytzinger.h
        cp $(LINUX_DIR)/scripts/Makefile.compiler ./
        git add Makefile.compiler
        $(RM) libbcachefs/*.mod.c
index 6b77763e2ff921a8f4d57d8992fb57c281f741f5..3d29f41322f13d9a523be56e290e949062391144 100644 (file)
 #include "cmds.h"
 #include "libbcachefs.h"
 #include "crypto.h"
-#include "libbcachefs/darray.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super-io.h"
 #include "libbcachefs/util.h"
 
+#include "linux/darray.h"
+
 #define OPTS                                           \
 x(0,   replicas,               required_argument)      \
 x(0,   encrypted,              no_argument)            \
index 1a5d144b6d8c3e4927a1aa572e097d160cf8aa8b..d8542d61ae53d37ffa3d58aa343b90b5549c2d0a 100644 (file)
@@ -9,13 +9,14 @@
 
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/buckets.h"
-#include "libbcachefs/darray.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super-io.h"
 
 #include "cmds.h"
 #include "libbcachefs.h"
 
+#include "linux/darray.h"
+
 static void __dev_usage_type_to_text(struct printbuf *out,
                                     enum bch_data_type type,
                                     unsigned bucket_size,
index bff3bc65eb7eca8c8cf2837d6700a9630751052c..4682406ee96f2b4b01c9e916a4b8254011ae748d 100644 (file)
@@ -20,7 +20,7 @@
 #include <linux/uuid.h>
 #include "libbcachefs/bcachefs.h"
 #include "libbcachefs/bbpos.h"
-#include "libbcachefs/darray.h"
+#include "linux/darray.h"
 
 #define noreturn __attribute__((noreturn))
 
similarity index 66%
rename from libbcachefs/darray.h
rename to include/linux/darray.h
index 4b340d13caace03b12f75e788316ad5af7e08d1c..ff167eb795f22e119c5586ef28e7017753d2bf69 100644 (file)
@@ -1,34 +1,26 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_DARRAY_H
+#define _LINUX_DARRAY_H
 
 /*
- * Dynamic arrays:
+ * Dynamic arrays
  *
  * Inspired by CCAN's darray
  */
 
+#include <linux/darray_types.h>
 #include <linux/slab.h>
 
-#define DARRAY_PREALLOCATED(_type, _nr)                                        \
-struct {                                                               \
-       size_t nr, size;                                                \
-       _type *data;                                                    \
-       _type preallocated[_nr];                                        \
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char)   darray_char;
-typedef DARRAY(char *) darray_str;
-
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
+int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t);
 
 static inline int __darray_resize(darray_char *d, size_t element_size,
                                  size_t new_size, gfp_t gfp)
 {
        return unlikely(new_size > d->size)
-               ? __bch2_darray_resize(d, element_size, new_size, gfp)
+               ? __darray_resize_slowpath(d, element_size, new_size, gfp)
                : 0;
 }
 
@@ -69,6 +61,28 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
 #define darray_first(_d)       ((_d).data[0])
 #define darray_last(_d)                ((_d).data[(_d).nr - 1])
 
+/* Insert/remove items into the middle of a darray: */
+
+#define array_insert_item(_array, _nr, _pos, _new_item)                        \
+do {                                                                   \
+       memmove(&(_array)[(_pos) + 1],                                  \
+               &(_array)[(_pos)],                                      \
+               sizeof((_array)[0]) * ((_nr) - (_pos)));                \
+       (_nr)++;                                                        \
+       (_array)[(_pos)] = (_new_item);                                 \
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove)           \
+do {                                                                   \
+       (_nr) -= (_nr_to_remove);                                       \
+       memmove(&(_array)[(_pos)],                                      \
+               &(_array)[(_pos) + (_nr_to_remove)],                    \
+               sizeof((_array)[0]) * ((_nr) - (_pos)));                \
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos)                           \
+       array_remove_items(_array, _nr, _pos, 1)
+
 #define darray_insert_item(_d, pos, _item)                             \
 ({                                                                     \
        size_t _pos = (pos);                                            \
@@ -79,10 +93,15 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
        _ret;                                                           \
 })
 
+#define darray_remove_items(_d, _pos, _nr_to_remove)                   \
+       array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove)
+
 #define darray_remove_item(_d, _pos)                                   \
-       array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+       darray_remove_items(_d, _pos, 1)
+
+/* Iteration: */
 
-#define __darray_for_each(_d, _i)                                              \
+#define __darray_for_each(_d, _i)                                      \
        for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
 
 #define darray_for_each(_d, _i)                                                \
@@ -106,4 +125,4 @@ do {                                                                        \
        darray_init(_d);                                                \
 } while (0)
 
-#endif /* _BCACHEFS_DARRAY_H */
+#endif /* _LINUX_DARRAY_H */
diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h
new file mode 100644 (file)
index 0000000..a400a0c
--- /dev/null
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_DARRAY_TYPES_H
+#define _LINUX_DARRAY_TYPES_H
+
+#include <linux/types.h>
+
+#define DARRAY_PREALLOCATED(_type, _nr)                                        \
+struct {                                                               \
+       size_t nr, size;                                                \
+       _type *data;                                                    \
+       _type preallocated[_nr];                                        \
+}
+
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
+
+typedef DARRAY(char)   darray_char;
+typedef DARRAY(char *) darray_str;
+
+#endif /* _LINUX_DARRAY_TYPES_H */
similarity index 78%
rename from libbcachefs/eytzinger.h
rename to include/linux/eytzinger.h
index b04750dbf870bc78c95ece35d363e3a4c0936b50..9565a5c26cd505584b3872d2b8898dfd2998e982 100644 (file)
@@ -1,27 +1,37 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
+#ifndef _LINUX_EYTZINGER_H
+#define _LINUX_EYTZINGER_H
 
 #include <linux/bitops.h>
 #include <linux/log2.h>
 
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond)         BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
 
 /*
  * Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
  *
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
+ *
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
  */
 
 static inline unsigned eytzinger1_child(unsigned i, unsigned child)
 {
-       EBUG_ON(child > 1);
+       EYTZINGER_BUG_ON(child > 1);
 
        return (i << 1) + child;
 }
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
 
 static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 {
-       EBUG_ON(i > size);
+       EYTZINGER_BUG_ON(i > size);
 
        if (eytzinger1_right_child(i) <= size) {
                i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 
 static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
 {
-       EBUG_ON(i > size);
+       EYTZINGER_BUG_ON(i > size);
 
        if (eytzinger1_left_child(i) <= size) {
                i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
        unsigned shift = __fls(size) - b;
        int s;
 
-       EBUG_ON(!i || i > size);
+       EYTZINGER_BUG_ON(!i || i > size);
 
        i  ^= 1U << b;
        i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
        unsigned shift;
        int s;
 
-       EBUG_ON(!i || i > size);
+       EYTZINGER_BUG_ON(!i || i > size);
 
        /*
         * sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
 
 static inline unsigned eytzinger0_child(unsigned i, unsigned child)
 {
-       EBUG_ON(child > 1);
+       EYTZINGER_BUG_ON(child > 1);
 
        return (i << 1) + 1 + child;
 }
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
             (_i) != -1;                                \
             (_i) = eytzinger0_next((_i), (_size)))
 
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
 /* return greatest node <= @search, or -1 if not found */
 static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
-                                        eytzinger_cmp_fn cmp, const void *search)
+                                        cmp_func_t cmp, const void *search)
 {
        unsigned i, n = 0;
 
@@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 
        do {
                i = n;
-               n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+               n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
        } while (n < nr);
 
        if (n & 1) {
@@ -274,8 +282,8 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
        _i;                                                             \
 })
 
-void eytzinger0_sort(void *, size_t, size_t,
-                   int (*cmp_func)(const void *, const void *, size_t),
-                   void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+                      cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
 
-#endif /* _EYTZINGER_H */
+#endif /* _LINUX_EYTZINGER_H */
similarity index 94%
rename from libbcachefs/mean_and_variance.h
rename to include/linux/mean_and_variance.h
index b2be565bb8f214bc2ac4ebd6efac324ac20b7241..4fcf062dd22c71efd338a9ab37a8e1af5ca22ce5 100644 (file)
@@ -17,7 +17,7 @@
  * Rust and rustc has issues with u128.
  */
 
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
 
 typedef struct {
        unsigned __int128 v;
@@ -154,8 +154,6 @@ struct mean_and_variance {
 
 /* expontentially weighted variant */
 struct mean_and_variance_weighted {
-       bool    init;
-       u8      weight; /* base 2 logarithim */
        s64     mean;
        u64     variance;
 };
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
 u64 mean_and_variance_get_variance(struct mean_and_variance s1);
 u32 mean_and_variance_get_stddev(struct mean_and_variance s);
 
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+               s64 v, bool initted, u8 weight);
 
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+               u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+               u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+               u8 weight);
 
 #endif // MEAN_AND_VAIRANCE_H_
index 506da24d6d594f8d6dbd0f66052d2bfd497f3d09..373251708474e2aecea1342dea065f93cc11ff93 100644 (file)
@@ -90,6 +90,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
                              (void *) size);
 }
 
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
+void mempool_kvfree(void *element, void *pool_data);
+
+static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+       return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
+}
+
+static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
+{
+       return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
+}
+
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
  * allocates pages of the order specified by pool_data
index 6c4a623c267182d6362ce6a9a1843c230a67fdad..28ce667b56789ae35f90ffc226a64ab6f9d6def4 100644 (file)
@@ -1,65 +1 @@
-#ifndef __TOOLS_LINUX_SPINLOCK_H
-#define __TOOLS_LINUX_SPINLOCK_H
-
-#include <linux/atomic.h>
-#include <pthread.h>
-
-typedef struct {
-       pthread_mutex_t lock;
-} raw_spinlock_t;
-
-#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER  }
-
-static inline void raw_spin_lock_init(raw_spinlock_t *lock)
-{
-       pthread_mutex_init(&lock->lock, NULL);
-}
-
-static inline bool raw_spin_trylock(raw_spinlock_t *lock)
-{
-       return !pthread_mutex_trylock(&lock->lock);
-}
-
-static inline void raw_spin_lock(raw_spinlock_t *lock)
-{
-       pthread_mutex_lock(&lock->lock);
-}
-
-static inline void raw_spin_unlock(raw_spinlock_t *lock)
-{
-       pthread_mutex_unlock(&lock->lock);
-}
-
-#define raw_spin_lock_irq(lock)                raw_spin_lock(lock)
-#define raw_spin_unlock_irq(lock)      raw_spin_unlock(lock)
-
-#define raw_spin_lock_irqsave(lock, flags)             \
-do {                                                   \
-       flags = 0;                                      \
-       raw_spin_lock(lock);                            \
-} while (0)
-
-#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
-
-typedef raw_spinlock_t spinlock_t;
-
-#define __SPIN_LOCK_UNLOCKED(name)     __RAW_SPIN_LOCK_UNLOCKED(name)
-
-#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
-
-#define spin_lock_init(lock)           raw_spin_lock_init(lock)
-#define spin_lock(lock)                        raw_spin_lock(lock)
-#define spin_unlock(lock)              raw_spin_unlock(lock)
-
-#define spin_lock_nested(lock, n)      spin_lock(lock)
-
-#define spin_lock_bh(lock)             raw_spin_lock(lock)
-#define spin_unlock_bh(lock)           raw_spin_unlock(lock)
-
-#define spin_lock_irq(lock)            raw_spin_lock(lock)
-#define spin_unlock_irq(lock)          raw_spin_unlock(lock)
-
-#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
-#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
-
-#endif /* __TOOLS_LINUX_SPINLOCK_H */
+#include "linux/spinlock_types.h"
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
new file mode 100644 (file)
index 0000000..6c4a623
--- /dev/null
@@ -0,0 +1,65 @@
+#ifndef __TOOLS_LINUX_SPINLOCK_H
+#define __TOOLS_LINUX_SPINLOCK_H
+
+#include <linux/atomic.h>
+#include <pthread.h>
+
+typedef struct {
+       pthread_mutex_t lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER  }
+
+static inline void raw_spin_lock_init(raw_spinlock_t *lock)
+{
+       pthread_mutex_init(&lock->lock, NULL);
+}
+
+static inline bool raw_spin_trylock(raw_spinlock_t *lock)
+{
+       return !pthread_mutex_trylock(&lock->lock);
+}
+
+static inline void raw_spin_lock(raw_spinlock_t *lock)
+{
+       pthread_mutex_lock(&lock->lock);
+}
+
+static inline void raw_spin_unlock(raw_spinlock_t *lock)
+{
+       pthread_mutex_unlock(&lock->lock);
+}
+
+#define raw_spin_lock_irq(lock)                raw_spin_lock(lock)
+#define raw_spin_unlock_irq(lock)      raw_spin_unlock(lock)
+
+#define raw_spin_lock_irqsave(lock, flags)             \
+do {                                                   \
+       flags = 0;                                      \
+       raw_spin_lock(lock);                            \
+} while (0)
+
+#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
+
+typedef raw_spinlock_t spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED(name)     __RAW_SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+
+#define spin_lock_init(lock)           raw_spin_lock_init(lock)
+#define spin_lock(lock)                        raw_spin_lock(lock)
+#define spin_unlock(lock)              raw_spin_unlock(lock)
+
+#define spin_lock_nested(lock, n)      spin_lock(lock)
+
+#define spin_lock_bh(lock)             raw_spin_lock(lock)
+#define spin_unlock_bh(lock)           raw_spin_unlock(lock)
+
+#define spin_lock_irq(lock)            raw_spin_lock(lock)
+#define spin_unlock_irq(lock)          raw_spin_unlock(lock)
+
+#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
+#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
+
+#endif /* __TOOLS_LINUX_SPINLOCK_H */
diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h
new file mode 100644 (file)
index 0000000..2a66e76
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_THREAD_WITH_FILE_H
+#define _LINUX_THREAD_WITH_FILE_H
+
+struct stdio_redirect;
+
+__printf(3, 0)
+static inline void stdio_redirect_vprintf(struct stdio_redirect *s, bool nonblocking, const char *msg, va_list args) {}
+__printf(3, 4)
+static inline void stdio_redirect_printf(struct stdio_redirect *s, bool nonblocking, const char *msg, ...) {}
+
+#endif /* _LINUX_THREAD_WITH_FILE_H */
diff --git a/include/linux/thread_with_file_types.h b/include/linux/thread_with_file_types.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/include/linux/time.h b/include/linux/time.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h
new file mode 100644 (file)
index 0000000..6df2b34
--- /dev/null
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ *   everywhere without worrying about overhead
+ *
+ * tracks:
+ *  - number of events
+ *  - maximum event duration ever seen
+ *  - sum of all event durations
+ *  - average event duration, standard and weighted
+ *  - standard deviation of event durations, standard and weighted
+ * and analogous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _LINUX_TIME_STATS_H
+#define _LINUX_TIME_STATS_H
+
+#include <linux/mean_and_variance.h>
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+struct time_unit {
+       const char      *name;
+       u64             nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if time_stats->quantiles_enabled has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES   15
+#define QUANTILE_IDX(i)        inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST  eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+       struct quantile_entry {
+               u64     m;
+               u64     step;
+       }               entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+       unsigned        nr;
+       struct time_stat_buffer_entry {
+               u64     start;
+               u64     end;
+       }               entries[31];
+};
+
+struct time_stats {
+       spinlock_t      lock;
+       bool            have_quantiles;
+       /* all fields are in nanoseconds */
+       u64             min_duration;
+       u64             max_duration;
+       u64             total_duration;
+       u64             max_freq;
+       u64             min_freq;
+       u64             last_event;
+       u64             last_event_start;
+
+       struct mean_and_variance          duration_stats;
+       struct mean_and_variance          freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT   8
+
+       struct mean_and_variance_weighted duration_stats_weighted;
+       struct mean_and_variance_weighted freq_stats_weighted;
+       struct time_stat_buffer __percpu *buffer;
+
+       u64             start_time;
+};
+
+struct time_stats_quantiles {
+       struct time_stats       stats;
+       struct quantiles        quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats)
+{
+       return stats->have_quantiles
+               ? &container_of(stats, struct time_stats_quantiles, stats)->quantiles
+               : NULL;
+}
+
+void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
+void __time_stats_update(struct time_stats *stats, u64, u64);
+
+/**
+ * time_stats_update - collect a new event being tracked
+ *
+ * @stats      - time_stats to update
+ * @start      - start time of event, recorded with local_clock()
+ *
+ * The end duration of the event will be the current time
+ */
+static inline void time_stats_update(struct time_stats *stats, u64 start)
+{
+       __time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats      - time_stats to update
+ * @v          - new state, true or false
+ *
+ * Use this when tracking time stats for state changes, i.e. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct time_stats *stats, bool v)
+{
+       if (v != !!stats->last_event_start) {
+               if (!v) {
+                       time_stats_update(stats, stats->last_event_start);
+                       stats->last_event_start = 0;
+               } else {
+                       stats->last_event_start = local_clock() ?: 1;
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+#define TIME_STATS_PRINT_NO_ZEROES     (1U << 0)       /* print nothing if zero count */
+struct seq_buf;
+void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *,
+               const char *epoch_name, unsigned int flags);
+void time_stats_to_json(struct seq_buf *, struct time_stats *,
+               const char *epoch_name, unsigned int flags);
+
+void time_stats_exit(struct time_stats *);
+void time_stats_init(struct time_stats *);
+
+static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq)
+{
+       time_stats_exit(&statq->stats);
+}
+static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq)
+{
+       time_stats_init(&statq->stats);
+       statq->stats.have_quantiles = true;
+       memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _LINUX_TIME_STATS_H */
index ce454e2661e9f70a1a5e53135f762751d454ab71..6ae97c421c3b870b6f6e48bf77b0a6ee68ae2ac8 100644 (file)
@@ -8,6 +8,7 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <linux/posix_types.h>
 
 #define __SANE_USERSPACE_TYPES__       /* For PPC64, to get LL64 types */
 #include <asm/types.h>
@@ -77,6 +78,10 @@ typedef __u64 __bitwise __be64;
 
 typedef u64 sector_t;
 
+typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
 typedef int (*cmp_func_t)(const void *a, const void *b);
 
 typedef unsigned int __bitwise slab_flags_t;
index 633d3223b353f83e83501601024dd262952236c6..ca58193dd90279b6d6081f06954690f214ba3a42 100644 (file)
@@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
                if (cl)
                        closure_wait(&c->open_buckets_wait, cl);
 
-               track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
-                                  &c->blocked_allocate_open_bucket, true);
+               track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
                spin_unlock(&c->freelist_lock);
                return ERR_PTR(-BCH_ERR_open_buckets_empty);
        }
@@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
        ca->nr_open_buckets++;
        bch2_open_bucket_hash_add(c, ob);
 
-       track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
-                          &c->blocked_allocate_open_bucket, false);
-
-       track_event_change(&c->times[BCH_TIME_blocked_allocate],
-                          &c->blocked_allocate, false);
+       track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+       track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
 
        spin_unlock(&c->freelist_lock);
        return ob;
@@ -555,8 +551,7 @@ again:
                        goto again;
                }
 
-               track_event_change(&c->times[BCH_TIME_blocked_allocate],
-                                  &c->blocked_allocate, true);
+               track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
 
                ob = ERR_PTR(-BCH_ERR_freelist_empty);
                goto err;
index b80c6c9efd8cef95b46b5b45b21f639e18373755..70369495be335f14c677d14fb05d8d6bb0e22a84 100644 (file)
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
 #include <linux/srcu.h>
+#include <linux/thread_with_file_types.h>
+#include <linux/time_stats.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
@@ -465,7 +467,6 @@ enum bch_time_stats {
 #include "replicas_types.h"
 #include "subvolume_types.h"
 #include "super_types.h"
-#include "thread_with_file_types.h"
 
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES         4U
@@ -593,7 +594,7 @@ struct bch_dev {
 
        /* The rest of this all shows up in sysfs */
        atomic64_t              cur_latency[2];
-       struct bch2_time_stats  io_latency[2];
+       struct time_stats_quantiles     io_latency[2];
 
 #define CONGESTED_MAX          1024
        atomic_t                congested;
@@ -640,8 +641,8 @@ struct btree_debug {
 #define BCH_TRANSACTIONS_NR 128
 
 struct btree_transaction_stats {
-       struct bch2_time_stats  duration;
-       struct bch2_time_stats  lock_hold_times;
+       struct time_stats       duration;
+       struct time_stats       lock_hold_times;
        struct mutex            lock;
        unsigned                nr_max_paths;
        unsigned                journal_entries_size;
@@ -919,8 +920,6 @@ struct bch_fs {
        /* ALLOCATOR */
        spinlock_t              freelist_lock;
        struct closure_waitlist freelist_wait;
-       u64                     blocked_allocate;
-       u64                     blocked_allocate_open_bucket;
 
        open_bucket_idx_t       open_buckets_freelist;
        open_bucket_idx_t       open_buckets_nr_free;
@@ -1104,7 +1103,7 @@ struct bch_fs {
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
 
-       struct bch2_time_stats  times[BCH_TIME_STAT_NR];
+       struct time_stats       times[BCH_TIME_STAT_NR];
 
        struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
 
index 0668b682a21ca8e035cae73f73e6774c99eaeb94..14f613617913e1a3ef0e93c51caa041041f822c2 100644 (file)
@@ -1275,7 +1275,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(dev_usage,            8)              \
        x(log,                  9)              \
        x(overwrite,            10)             \
-       x(write_buffer_keys,    11)
+       x(write_buffer_keys,    11)             \
+       x(datetime,             12)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1376,6 +1377,11 @@ struct jset_entry_log {
        u8                      d[];
 } __packed __aligned(8);
 
+struct jset_entry_datetime {
+       struct jset_entry       entry;
+       __le64                  seconds;
+} __packed __aligned(8);
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
index cbad7a9e455156c5dfc08197971b857114854658..4b8fba754b1c13f069bb6d1a199d94e591a62668 100644 (file)
@@ -379,7 +379,7 @@ struct bch_ioctl_disk_resize_journal {
 
 struct bch_ioctl_subvolume {
        __u32                   flags;
-       __s32                   dirfd;
+       __u32                   dirfd;
        __u16                   mode;
        __u16                   pad[3];
        __u64                   dst_ptr;
index 3fd1085b6c61ee72e7e814cf722306ebdba057c4..1d77aa55d641c66d86e29ca19162acb1afb44a70 100644 (file)
@@ -9,12 +9,12 @@
 #include "bcachefs.h"
 #include "btree_cache.h"
 #include "bset.h"
-#include "eytzinger.h"
 #include "trace.h"
 #include "util.h"
 
 #include <asm/unaligned.h>
 #include <linux/console.h>
+#include <linux/eytzinger.h>
 #include <linux/random.h>
 #include <linux/prefetch.h>
 
index d7c81beac14afae7ee44f11f28eb424f1b54a063..9b7ea1227069e6d73d53ef15fa0d1ee3afaadd5e 100644 (file)
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 
        clear_btree_node_just_written(b);
 
-       kvpfree(b->data, btree_buf_bytes(b));
+       kvfree(b->data);
        b->data = NULL;
 #ifdef __KERNEL__
        kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
        BUG_ON(b->data || b->aux_data);
 
-       b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+       b->data = kvmalloc(btree_buf_bytes(b), gfp);
        if (!b->data)
                return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 #ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
                b->aux_data = NULL;
 #endif
        if (!b->aux_data) {
-               kvpfree(b->data, btree_buf_bytes(b));
+               kvfree(b->data);
                b->data = NULL;
                return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
        }
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);
 
-       kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+       kvfree(c->verify_ondisk);
 
        for (i = 0; i < btree_id_nr_alive(c); i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
@@ -648,7 +648,7 @@ out:
        bch2_btree_keys_init(b);
        set_btree_node_accessed(b);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+       time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);
 
        memalloc_nofs_restore(flags);
@@ -711,6 +711,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
        b = bch2_btree_node_mem_alloc(trans, level != 0);
 
        if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+               if (!path)
+                       return b;
+
                trans->memory_allocation_failure = true;
                trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -760,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
        }
 
        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-               if (path)
-                       trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+               BUG_ON(!path);
+
+               trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
        }
 
@@ -1096,7 +1100,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
 
-       BUG_ON(trans && !btree_node_locked(path, level + 1));
+       BUG_ON(path && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);
 
        b = btree_cache_find(bc, k);
index 1102995643b137c3a8a9fe5f12f0cce95edfafeb..eb92526bb9b64cee6468f3b35a908e9807d85403 100644 (file)
@@ -389,7 +389,8 @@ again:
        have_child = dropped_children = false;
        bch2_bkey_buf_init(&prev_k);
        bch2_bkey_buf_init(&cur_k);
-       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+       bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+       iter.prefetch = true;
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -478,7 +479,8 @@ again:
                goto err;
 
        bch2_btree_and_journal_iter_exit(&iter);
-       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+       bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+       iter.prefetch = true;
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -931,7 +933,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
-       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+       bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
        bch2_bkey_buf_init(&prev);
        bch2_bkey_buf_init(&cur);
        bkey_init(&prev.k->k);
@@ -963,7 +965,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 
        if (b->c.level > target_depth) {
                bch2_btree_and_journal_iter_exit(&iter);
-               bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+               bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+               iter.prefetch = true;
 
                while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                        struct btree *child;
@@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c)
        genradix_free(&c->gc_stripes);
 
        for_each_member_device(c, ca) {
-               kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
-                       sizeof(struct bucket_array) +
-                       ca->mi.nbuckets * sizeof(struct bucket));
+               kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
                ca->buckets_gc = NULL;
 
                free_percpu(ca->usage_gc);
@@ -1491,7 +1492,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
        for_each_member_device(c, ca) {
-               struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+               struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
                                ca->mi.nbuckets * sizeof(struct bucket),
                                GFP_KERNEL|__GFP_ZERO);
                if (!buckets) {
@@ -1970,7 +1971,7 @@ int bch2_gc_gens(struct bch_fs *c)
 
        c->gc_count++;
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+       time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
        trace_and_count(c, gc_gens_end, c);
 err:
        for_each_member_device(c, ca) {
index aa9b6cbe3226909626411b886731a8bb8648a558..61b6093805eaf2fc433e49dec5d4ad7228b352d7 100644 (file)
@@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
        if (used_mempool)
                mempool_free(p, &c->btree_bounce_pool);
        else
-               vpfree(p, size);
+               kvfree(p);
 }
 
 static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
        BUG_ON(size > c->opts.btree_node_size);
 
        *used_mempool = false;
-       p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+       p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
        if (!p) {
                *used_mempool = true;
                p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
 
        if (sorting_entire_node)
-               bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+               time_stats_update(&c->times[BCH_TIME_btree_node_sort],
                                       start_time);
 
        /* Make sure we preserve bset journal_seq: */
@@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
                        &dst->format,
                        true);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+       time_stats_update(&c->times[BCH_TIME_btree_node_sort],
                               start_time);
 
        set_btree_bset_end(dst, dst->set);
@@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 out:
        mempool_free(iter, &c->fill_iter);
        printbuf_exit(&buf);
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
+       time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
        return retry_read;
 fsck_err:
        if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1323,7 +1323,7 @@ start:
                }
        }
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+       time_stats_update(&c->times[BCH_TIME_btree_node_read],
                               rb->start_time);
        bio_put(&rb->bio);
 
index 5467a8635be113102c56bb6f02986209533c35ac..3aac6ed5446ebd8d322d37d67276b41215150a36 100644 (file)
@@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret = 0;
 
-       __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+       __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
 
        k = bch2_btree_and_journal_iter_peek(&jiter);
 
@@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
        path = &trans->paths[path_idx];
 
        if (unlikely(path->level >= BTREE_MAX_DEPTH))
-               goto out;
+               goto out_uptodate;
 
        path->level = btree_path_up_until_good_node(trans, path, 0);
 
@@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
                        goto out;
                }
        }
-
+out_uptodate:
        path->uptodate = BTREE_ITER_UPTODATE;
 out:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@@ -2899,7 +2899,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 
        if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
            time_after64(now, trans->last_begin_time + 10))
-               __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+               __time_stats_update(&btree_trans_stats(trans)->duration,
                                         trans->last_begin_time, now);
 
        if (!trans->restarted &&
@@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
             s++) {
                kfree(s->max_paths_text);
-               bch2_time_stats_exit(&s->lock_hold_times);
+               time_stats_exit(&s->lock_hold_times);
        }
 
        if (c->btree_trans_barrier_initialized)
@@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
        for (s = c->btree_transaction_stats;
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
             s++) {
-               bch2_time_stats_init(&s->duration);
-               bch2_time_stats_init(&s->lock_hold_times);
+               time_stats_init(&s->duration);
+               time_stats_init(&s->lock_hold_times);
                mutex_init(&s->lock);
        }
 
index 719a94a84950b7fe2d179b4860c2eed727044417..3da65562fdb0423ab0cfcba0fdf3d5cff40b74fa 100644 (file)
@@ -1,7 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "bset.h"
+#include "btree_cache.h"
 #include "btree_journal_iter.h"
 #include "journal_io.h"
 
@@ -334,9 +336,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
                iter->pos = bpos_successor(iter->pos);
 }
 
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+       struct btree_and_journal_iter iter = *_iter;
+       struct bch_fs *c = iter.trans->c;
+       unsigned level = iter.journal.level;
+       struct bkey_buf tmp;
+       unsigned nr = test_bit(BCH_FS_started, &c->flags)
+               ? (level > 1 ? 0 :  2)
+               : (level > 1 ? 1 : 16);
+
+       iter.prefetch = false;
+       bch2_bkey_buf_init(&tmp);
+
+       while (nr--) {
+               bch2_btree_and_journal_iter_advance(&iter);
+               struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+               if (!k.k)
+                       break;
+
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+               bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+       }
+
+       bch2_bkey_buf_exit(&tmp, c);
+}
+
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
 {
        struct bkey_s_c btree_k, journal_k, ret;
+
+       if (iter->prefetch && iter->journal.level)
+               btree_and_journal_iter_prefetch(iter);
 again:
        if (iter->at_end)
                return bkey_s_c_null;
@@ -376,17 +407,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
        bch2_journal_iter_exit(&iter->journal);
 }
 
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                                 struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+                                                 struct btree_and_journal_iter *iter,
                                                  struct btree *b,
                                                  struct btree_node_iter node_iter,
                                                  struct bpos pos)
 {
        memset(iter, 0, sizeof(*iter));
 
+       iter->trans = trans;
        iter->b = b;
        iter->node_iter = node_iter;
-       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+       bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
        INIT_LIST_HEAD(&iter->journal.list);
        iter->pos = b->data->min_key;
        iter->at_end = false;
@@ -396,15 +428,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
  * this version is used by btree_gc before filesystem has gone RW and
  * multithreaded, so uses the journal_iters list:
  */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+                                               struct btree_and_journal_iter *iter,
                                                struct btree *b)
 {
        struct btree_node_iter node_iter;
 
        bch2_btree_node_iter_init_from_start(&node_iter, b);
-       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
-       list_add(&iter->journal.list, &c->journal_iters);
+       __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
+       list_add(&iter->journal.list, &trans->c->journal_iters);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -415,9 +447,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
        struct genradix_iter iter;
 
        genradix_for_each(&c->journal_entries, iter, i)
-               if (*i)
-                       kvpfree(*i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&(*i)->j));
+               kvfree(*i);
        genradix_free(&c->journal_entries);
 }
 
index 8ca4c100b2e3e413d7adbb8dd5599d9f42de6d30..c9d19da3ea04803a360a683fa0e01a2838f2433f 100644 (file)
@@ -15,6 +15,7 @@ struct journal_iter {
  */
 
 struct btree_and_journal_iter {
+       struct btree_trans      *trans;
        struct btree            *b;
        struct btree_node_iter  node_iter;
        struct bkey             unpacked;
@@ -22,6 +23,7 @@ struct btree_and_journal_iter {
        struct journal_iter     journal;
        struct bpos             pos;
        bool                    at_end;
+       bool                    prefetch;
 };
 
 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
 struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
                                           unsigned, struct bpos);
 
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+                                        struct btree_and_journal_iter *);
+
 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
                                 unsigned, struct bkey_i *);
 int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                               struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+                               struct btree_and_journal_iter *, struct btree *,
                                struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                                               struct bch_fs *,
-                                               struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+                               struct btree_and_journal_iter *, struct btree *);
 
 void bch2_journal_keys_put(struct bch_fs *);
 
index 4bd72c855da1a4028106b70e10727ad07d578614..f2e2c5881b7e4bd2551018708d7bcadee1d53083 100644 (file)
@@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
                                              struct btree_path *path, unsigned level)
 {
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-       __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+       __time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
                                 path->l[level].lock_taken_time,
                                 local_clock());
 #endif
index 4a5a64499eb76698743ae7f20b4e47eaca09b868..0d5eecbd3e9cfb92d95ff9215ad0710bae7fb54b 100644 (file)
@@ -2,12 +2,12 @@
 #ifndef _BCACHEFS_BTREE_TYPES_H
 #define _BCACHEFS_BTREE_TYPES_H
 
+#include <linux/darray_types.h>
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
 #include "btree_key_cache_types.h"
 #include "buckets_types.h"
-#include "darray.h"
 #include "errcode.h"
 #include "journal_types.h"
 #include "replicas_types.h"
index c3ff365acce9afeae894c69003d247bef9c8e955..e5193116b092f6b7120ac2f7e3c16f09846f2f59 100644 (file)
@@ -14,6 +14,8 @@
 #include "snapshot.h"
 #include "trace.h"
 
+#include <linux/darray.h>
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                                         const struct btree_insert_entry *r)
 {
index 17a5938aa71a6b43b45c12383e4690df146ee2a3..030291cc8f97dd57a0f9c9f3c9aaf9b94ac57213 100644 (file)
@@ -516,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
        bch2_disk_reservation_put(c, &as->disk_res);
        bch2_btree_reserve_put(as, trans);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+       time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
                               as->start_time);
 
        mutex_lock(&c->btree_interior_update_lock);
@@ -1038,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
        continue_at(&as->cl, btree_update_set_nodes_written,
                    as->c->btree_interior_update_worker);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+       time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
                               start_time);
 }
 
@@ -1629,7 +1629,7 @@ out:
 
        bch2_trans_verify_locks(trans);
 
-       bch2_time_stats_update(&c->times[n2
+       time_stats_update(&c->times[n2
                               ? BCH_TIME_btree_node_split
                               : BCH_TIME_btree_node_compact],
                               start_time);
@@ -1935,7 +1935,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        bch2_btree_update_done(as, trans);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+       time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
 out:
 err:
        if (new_path)
@@ -2484,7 +2484,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
 {
        c->btree_interior_update_worker =
-               alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+               alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
        if (!c->btree_interior_update_worker)
                return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
 
index 9b9433de9c3686aa59255858e44411384219bafc..5f248873087c304eba435a263049cbf6af407ce5 100644 (file)
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 
-#include "darray.h"
+#include <linux/darray_types.h>
 #include "journal_types.h"
 
 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX       4
index 54f7826ac49874d46b08330678ea0b2565ecc491..7dca10ba70d253fe1e0619e738ea7826d1ea1ca1 100644 (file)
@@ -1335,7 +1335,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
        struct bucket_gens *buckets =
                container_of(rcu, struct bucket_gens, rcu);
 
-       kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+       kvfree(buckets);
 }
 
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -1345,16 +1345,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        bool resize = ca->bucket_gens != NULL;
        int ret;
 
-       if (!(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
-                                           GFP_KERNEL|__GFP_ZERO))) {
+       if (!(bucket_gens       = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
+                                          GFP_KERNEL|__GFP_ZERO))) {
                ret = -BCH_ERR_ENOMEM_bucket_gens;
                goto err;
        }
 
        if ((c->opts.buckets_nouse &&
-            !(buckets_nouse    = kvpmalloc(BITS_TO_LONGS(nbuckets) *
-                                           sizeof(unsigned long),
-                                           GFP_KERNEL|__GFP_ZERO)))) {
+            !(buckets_nouse    = kvmalloc(BITS_TO_LONGS(nbuckets) *
+                                          sizeof(unsigned long),
+                                          GFP_KERNEL|__GFP_ZERO)))) {
                ret = -BCH_ERR_ENOMEM_buckets_nouse;
                goto err;
        }
@@ -1397,8 +1397,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        ret = 0;
 err:
-       kvpfree(buckets_nouse,
-               BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+       kvfree(buckets_nouse);
        if (bucket_gens)
                call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
 
@@ -1407,27 +1406,21 @@ err:
 
 void bch2_dev_buckets_free(struct bch_dev *ca)
 {
-       unsigned i;
-
-       kvpfree(ca->buckets_nouse,
-               BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
-       kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
-               sizeof(struct bucket_gens) + ca->mi.nbuckets);
+       kvfree(ca->buckets_nouse);
+       kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
 
-       for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+       for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
                free_percpu(ca->usage[i]);
        kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       unsigned i;
-
        ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
        if (!ca->usage_base)
                return -BCH_ERR_ENOMEM_usage_init;
 
-       for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+       for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
                ca->usage[i] = alloc_percpu(struct bch_dev_usage);
                if (!ca->usage[i])
                        return -BCH_ERR_ENOMEM_usage_init;
index 226b39c176673a374f50ab06ad5f6d3e0a4858d8..4cbda66bb6e0fafe578d5d078d3bec5a73e98f24 100644 (file)
@@ -11,7 +11,6 @@
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
-#include "thread_with_file.h"
 
 #include <linux/cdev.h>
 #include <linux/device.h>
@@ -20,6 +19,7 @@
 #include <linux/major.h>
 #include <linux/sched/task.h>
 #include <linux/slab.h>
+#include <linux/thread_with_file.h>
 #include <linux/uaccess.h>
 
 __must_check
@@ -155,17 +155,14 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
        kfree(thr);
 }
 
-static int bch2_fsck_offline_thread_fn(void *arg)
+static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
 {
-       struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+       struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
        struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
 
        thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
        if (!thr->thr.thr.ret)
                bch2_fs_stop(c);
-
-       thread_with_stdio_done(&thr->thr);
-       return 0;
 }
 
 static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
@@ -220,7 +217,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
 
        opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
 
-       ret = bch2_run_thread_with_stdio(&thr->thr,
+       ret = run_thread_with_stdio(&thr->thr,
                        bch2_fsck_thread_exit,
                        bch2_fsck_offline_thread_fn);
 err:
@@ -425,7 +422,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
 {
        struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
 
-       bch2_thread_with_file_exit(&ctx->thr);
+       thread_with_file_exit(&ctx->thr);
        kfree(ctx);
        return 0;
 }
@@ -475,7 +472,7 @@ static long bch2_ioctl_data(struct bch_fs *c,
        ctx->c = c;
        ctx->arg = arg;
 
-       ret = bch2_run_thread_with_file(&ctx->thr,
+       ret = run_thread_with_file(&ctx->thr,
                        &bcachefs_data_ops,
                        bch2_data_thread);
        if (ret < 0)
@@ -763,9 +760,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
        return ret;
 }
 
-static int bch2_fsck_online_thread_fn(void *arg)
+static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
 {
-       struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+       struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
        struct bch_fs *c = thr->c;
 
        c->stdio_filter = current;
@@ -793,11 +790,8 @@ static int bch2_fsck_online_thread_fn(void *arg)
        c->stdio_filter = NULL;
        c->opts.fix_errors = old_fix_errors;
 
-       thread_with_stdio_done(&thr->thr);
-
        up(&c->online_fsck_mutex);
        bch2_ro_ref_put(c);
-       return 0;
 }
 
 static long bch2_ioctl_fsck_online(struct bch_fs *c,
@@ -840,7 +834,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
                        goto err;
        }
 
-       ret = bch2_run_thread_with_stdio(&thr->thr,
+       ret = run_thread_with_stdio(&thr->thr,
                        bch2_fsck_thread_exit,
                        bch2_fsck_online_thread_fn);
 err:
index 33df8cf86bd8f83bbf42d45944d0632da404fd71..1410365a889156450c78da9165bdb146872370ed 100644 (file)
@@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                return 0;
 
        if (!mempool_initialized(&c->compression_bounce[READ]) &&
-           mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
-                                       1, c->opts.encoded_extent_max))
+           mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+                                      1, c->opts.encoded_extent_max))
                return -BCH_ERR_ENOMEM_compression_bounce_read_init;
 
        if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
-           mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
-                                       1, c->opts.encoded_extent_max))
+           mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+                                      1, c->opts.encoded_extent_max))
                return -BCH_ERR_ENOMEM_compression_bounce_write_init;
 
        for (i = compression_types;
@@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                if (mempool_initialized(&c->compress_workspace[i->type]))
                        continue;
 
-               if (mempool_init_kvpmalloc_pool(
+               if (mempool_init_kvmalloc_pool(
                                &c->compress_workspace[i->type],
                                1, i->compress_workspace))
                        return -BCH_ERR_ENOMEM_compression_workspace_init;
        }
 
        if (!mempool_initialized(&c->decompress_workspace) &&
-           mempool_init_kvpmalloc_pool(&c->decompress_workspace,
-                                       1, decompress_workspace_size))
+           mempool_init_kvmalloc_pool(&c->decompress_workspace,
+                                      1, decompress_workspace_size))
                return -BCH_ERR_ENOMEM_decompression_workspace_init;
 
        return 0;
index 7bdba8507fc93cdfdecc29de3e70e5589cf8177b..b1f147e6be4d5cdd0ab491932db9c625b763e29e 100644 (file)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        mutex_lock(&c->verify_lock);
 
        if (!c->verify_ondisk) {
-               c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+               c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
                if (!c->verify_ondisk)
                        goto out;
        }
@@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
                return;
        }
 
-       n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+       n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
        if (!n_ondisk) {
                prt_printf(out, "memory allocation failure\n");
                goto out;
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 out:
        if (bio)
                bio_put(bio);
-       kvpfree(n_ondisk, btree_buf_bytes(b));
+       kvfree(n_ondisk);
        percpu_ref_put(&ca->io_ref);
 }
 
index ae29ad0c63e57466dc4cb5eb75728cb589f43eec..97773cffccae8da7fd67c58c144463af479fc48d 100644 (file)
@@ -219,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
        dirent->k.p.inode       = dir;
        dirent->k.p.snapshot    = snapshot;
 
-       ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
-                                    zero_inum, snapshot,
-                                    &dirent->k_i, str_hash_flags,
-                                    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+                                       zero_inum, snapshot,
+                                       &dirent->k_i, str_hash_flags,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
        *dir_offset = dirent->k.p.offset;
 
        return ret;
@@ -293,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
        struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
        struct bpos dst_pos =
                POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
-       unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+       unsigned src_update_flags = 0;
+       bool delete_src, delete_dst;
        int ret = 0;
 
-       if (src_dir.subvol != dst_dir.subvol)
-               return -EXDEV;
-
        memset(src_inum, 0, sizeof(*src_inum));
        memset(dst_inum, 0, sizeof(*dst_inum));
 
@@ -319,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
        if (ret)
                goto out;
 
-       src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
-       if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
-               return -EOPNOTSUPP;
-
-
        /* Lookup dst: */
        if (mode == BCH_RENAME) {
                /*
@@ -352,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                bkey_s_c_to_dirent(old_dst), dst_inum);
                if (ret)
                        goto out;
-
-               dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
-               if (dst_type == DT_SUBVOL)
-                       return -EOPNOTSUPP;
        }
 
        if (mode != BCH_RENAME_EXCHANGE)
@@ -426,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
                }
        }
 
+       if (new_dst->v.d_type == DT_SUBVOL)
+               new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+       if ((mode == BCH_RENAME_EXCHANGE) &&
+           new_src->v.d_type == DT_SUBVOL)
+               new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
        ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
        if (ret)
                goto out;
 out_set_src:
-
        /*
-        * If we're deleting a subvolume, we need to really delete the dirent,
-        * not just emit a whiteout in the current snapshot:
+        * If we're deleting a subvolume we need to really delete the dirent,
+        * not just emit a whiteout in the current snapshot - there can only be
+        * single dirent that points to a given subvolume.
+        *
+        * IOW, we don't maintain multiple versions in different snapshots of
+        * dirents that point to subvolumes - dirents that point to subvolumes
+        * are only visible in one particular subvolume so it's not necessary,
+        * and it would be particularly confusing for fsck to have to deal with.
         */
-       if (src_type == DT_SUBVOL) {
-               bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
-               ret = bch2_btree_iter_traverse(&src_iter);
+       delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+               new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+       delete_dst = old_dst.k &&
+               bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+               new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+       if (!delete_src || !bkey_deleted(&new_src->k)) {
+               ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
                if (ret)
                        goto out;
+       }
 
-               new_src->k.p = src_iter.pos;
-               src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+       if (delete_src) {
+               bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+               ret =   bch2_btree_iter_traverse(&src_iter) ?:
+                       bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               if (ret)
+                       goto out;
        }
 
-       ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
-       if (ret)
-               goto out;
+       if (delete_dst) {
+               bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+               ret =   bch2_btree_iter_traverse(&dst_iter) ?:
+                       bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               if (ret)
+                       goto out;
+       }
 
        if (mode == BCH_RENAME_EXCHANGE)
                *src_offset = new_src->k.p.offset;
@@ -458,41 +472,29 @@ out:
        return ret;
 }
 
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              subvol_inum dir,
-                              const struct bch_hash_info *hash_info,
-                              const struct qstr *name, subvol_inum *inum,
-                              unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+                            struct btree_iter *iter,
+                            subvol_inum dir,
+                            const struct bch_hash_info *hash_info,
+                            const struct qstr *name, subvol_inum *inum,
+                            unsigned flags)
 {
-       struct bkey_s_c k;
-       struct bkey_s_c_dirent d;
-       u32 snapshot;
-       int ret;
-
-       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+       int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+                                  hash_info, dir, name, flags);
        if (ret)
                return ret;
 
-       ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-                              hash_info, dir, name, flags);
-       if (ret)
-               return ret;
-
-       k = bch2_btree_iter_peek_slot(iter);
+       struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       d = bkey_s_c_to_dirent(k);
-
-       ret = bch2_dirent_read_target(trans, dir, d, inum);
+       ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
        if (ret > 0)
                ret = -ENOENT;
 err:
        if (ret)
                bch2_trans_iter_exit(trans, iter);
-
        return ret;
 }
 
@@ -504,7 +506,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
        struct btree_iter iter = { NULL };
 
        int ret = lockrestart_do(trans,
-               __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+               bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
        bch2_trans_iter_exit(trans, &iter);
        bch2_trans_put(trans);
        return ret;
index 21ffeb78f02ee3a750a39512f2fb353b594567b5..f1dd7208a58e05e8acf481e00ab5bc93731d2f74 100644 (file)
@@ -62,7 +62,7 @@ int bch2_dirent_rename(struct btree_trans *,
                       const struct qstr *, subvol_inum *, u64 *,
                       enum bch_rename_mode);
 
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
                               subvol_inum, const struct bch_hash_info *,
                               const struct qstr *, subvol_inum *, unsigned);
 u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
index d503af2700247d8aa1257962c37df9b042ee55ec..b98e2c2b8bf06f59fa70cfe23873e51529a917b8 100644 (file)
@@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
                unsigned i;
 
                for (i = 0; i < s->v.nr_blocks; i++) {
-                       kvpfree(buf->data[i], buf->size << 9);
+                       kvfree(buf->data[i]);
                        buf->data[i] = NULL;
                }
        }
@@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
        memset(buf->valid, 0xFF, sizeof(buf->valid));
 
        for (i = 0; i < v->nr_blocks; i++) {
-               buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+               buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
                if (!buf->data[i])
                        goto err;
        }
index 8c40c2067a0471e2dde6c3dcbcdeb709565732a7..3fd33b307a77f943bea59882de951995d27987c5 100644 (file)
        x(EINVAL,                       invalid)                                \
        x(EINVAL,                       internal_fsck_err)                      \
        x(EINVAL,                       opt_parse_error)                        \
+       x(EINVAL,                       remove_with_metadata_missing_unimplemented)\
+       x(EINVAL,                       remove_would_lose_data)                 \
        x(EROFS,                        erofs_trans_commit)                     \
        x(EROFS,                        erofs_no_writes)                        \
        x(EROFS,                        erofs_journal_err)                      \
index d32c8bebe46c32f7abc1a11ad49ee80752f2a623..70a125395974076c9e7c7be95cfe9ab1c9980585 100644 (file)
@@ -2,7 +2,7 @@
 #include "bcachefs.h"
 #include "error.h"
 #include "super.h"
-#include "thread_with_file.h"
+#include <linux/thread_with_file.h>
 
 #define FSCK_ERR_RATELIMIT_NR  10
 
@@ -105,7 +105,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
        do {
                bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
 
-               int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
+               int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
                if (r < 0)
                        return YN_NO;
                buf[r] = '\0';
index 66b945be10c2309a9e758b228b146047b20674e2..d8153fe27037ef46d1b2b220430f78fae78f2e35 100644 (file)
@@ -24,12 +24,12 @@ struct {                                                            \
        (fifo)->mask    = (fifo)->size                                  \
                ? roundup_pow_of_two((fifo)->size) - 1                  \
                : 0;                                                    \
-       (fifo)->data    = kvpmalloc(fifo_buf_size(fifo), (_gfp));       \
+       (fifo)->data    = kvmalloc(fifo_buf_size(fifo), (_gfp));        \
 })
 
 #define free_fifo(fifo)                                                        \
 do {                                                                   \
-       kvpfree((fifo)->data, fifo_buf_size(fifo));                     \
+       kvfree((fifo)->data);                                           \
        (fifo)->data = NULL;                                            \
 } while (0)
 
index 1c1ea0f0c692a6fdd4c262ef184bbcdda32d154f..523507e38887bf9fd4aaacf6ece326d04e6edd16 100644 (file)
@@ -260,8 +260,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
 
        dir_hash = bch2_hash_info_init(c, dir_u);
 
-       ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
-                                        name, &inum, BTREE_ITER_INTENT);
+       ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+                                      name, &inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -410,6 +410,21 @@ int bch2_rename_trans(struct btree_trans *trans,
                        goto err;
        }
 
+       /* Can't move across subvolumes, unless it's a subvolume root: */
+       if (src_dir.subvol != dst_dir.subvol &&
+           (!src_inode_u->bi_subvol ||
+            (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+               ret = -EXDEV;
+               goto err;
+       }
+
+       if (src_inode_u->bi_parent_subvol)
+               src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+       if ((mode == BCH_RENAME_EXCHANGE) &&
+           dst_inode_u->bi_parent_subvol)
+               dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
        src_inode_u->bi_dir             = dst_dir_u->bi_inum;
        src_inode_u->bi_dir_offset      = dst_offset;
 
index 3a4c24c28e7fa06deff38f6bb0b240a5daacda8c..3dc8630ff9fe139bd44317d72502ed9bf1f73751 100644 (file)
@@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
        if (IS_ERR(victim))
                return PTR_ERR(victim);
 
+       dir = d_inode(path.dentry);
        if (victim->d_sb->s_fs_info != c) {
                ret = -EXDEV;
                goto err;
@@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
                ret = -ENOENT;
                goto err;
        }
-       dir = d_inode(path.dentry);
        ret = __bch2_unlink(dir, victim, true);
        if (!ret) {
                fsnotify_rmdir(dir, victim);
                d_delete(victim);
        }
-       inode_unlock(dir);
 err:
+       inode_unlock(dir);
        dput(victim);
        path_put(&path);
        return ret;
index ec419b8e2c43123b42e0d84c837611fc5f6e2314..77ea61090e913555624c2a093b5d851475887797 100644 (file)
@@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
        return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
 {
-       struct bch_inode_unpacked inode_u;
-       struct bch_inode_info *inode;
-       struct btree_trans *trans;
-       struct bch_subvolume subvol;
-       int ret;
+       subvol_inum inum = inode_inum(inode);
+       struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
+                                     bch2_inode_hash(inum),
+                                     bch2_iget5_test,
+                                     bch2_iget5_set,
+                                     &inum));
+       BUG_ON(!old);
 
-       inode = to_bch_ei(iget5_locked(c->vfs_sb,
-                                      bch2_inode_hash(inum),
-                                      bch2_iget5_test,
-                                      bch2_iget5_set,
-                                      &inum));
-       if (unlikely(!inode))
-               return ERR_PTR(-ENOMEM);
-       if (!(inode->v.i_state & I_NEW))
-               return &inode->v;
+       if (unlikely(old != inode)) {
+               discard_new_inode(&inode->v);
+               inode = old;
+       } else {
+               mutex_lock(&c->vfs_inodes_lock);
+               list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+               mutex_unlock(&c->vfs_inodes_lock);
+               /*
+                * we really don't want insert_inode_locked2() to be setting
+                * I_NEW...
+                */
+               unlock_new_inode(&inode->v);
+       }
 
-       trans = bch2_trans_get(c);
-       ret = lockrestart_do(trans,
-               bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
-               bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+       return inode;
+}
 
-       if (!ret)
-               bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-       bch2_trans_put(trans);
+#define memalloc_flags_do(_flags, _do)                                         \
+({                                                                             \
+       unsigned _saved_flags = memalloc_flags_save(_flags);                    \
+       typeof(_do) _ret = _do;                                                 \
+       memalloc_noreclaim_restore(_saved_flags);                               \
+       _ret;                                                                   \
+})
 
-       if (ret) {
-               iget_failed(&inode->v);
-               return ERR_PTR(bch2_err_class(ret));
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+
+       struct bch_inode_info *inode =
+               memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
+                                 to_bch_ei(new_inode(c->vfs_sb)));
+
+       if (unlikely(!inode)) {
+               int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+               if (ret && inode)
+                       discard_new_inode(&inode->v);
+               if (ret)
+                       return ERR_PTR(ret);
        }
 
-       mutex_lock(&c->vfs_inodes_lock);
-       list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-       mutex_unlock(&c->vfs_inodes_lock);
+       return inode;
+}
 
-       unlock_new_inode(&inode->v);
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+       struct bch_inode_info *inode =
+               to_bch_ei(ilookup5_nowait(c->vfs_sb,
+                                         bch2_inode_hash(inum),
+                                         bch2_iget5_test,
+                                         &inum));
+       if (inode)
+               return &inode->v;
 
-       return &inode->v;
+       struct btree_trans *trans = bch2_trans_get(c);
+
+       struct bch_inode_unpacked inode_u;
+       struct bch_subvolume subvol;
+       int ret = lockrestart_do(trans,
+               bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+               bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+               PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+       if (!ret) {
+               bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+               inode = bch2_inode_insert(c, inode);
+       }
+       bch2_trans_put(trans);
+
+       return ret ? ERR_PTR(ret) : &inode->v;
 }
 
 struct bch_inode_info *
@@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap,
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct btree_trans *trans;
        struct bch_inode_unpacked dir_u;
-       struct bch_inode_info *inode, *old;
+       struct bch_inode_info *inode;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
        subvol_inum inum;
@@ -293,7 +336,6 @@ err_before_quota:
                mutex_unlock(&dir->ei_update_lock);
        }
 
-       bch2_iget5_set(&inode->v, &inum);
        bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -304,36 +346,7 @@ err_before_quota:
         * bch2_trans_exit() and dropping locks, else we could race with another
         * thread pulling the inode in and modifying it:
         */
-
-       inode->v.i_state |= I_CREATING;
-
-       old = to_bch_ei(inode_insert5(&inode->v,
-                                     bch2_inode_hash(inum),
-                                     bch2_iget5_test,
-                                     bch2_iget5_set,
-                                     &inum));
-       BUG_ON(!old);
-
-       if (unlikely(old != inode)) {
-               /*
-                * We raced, another process pulled the new inode into cache
-                * before us:
-                */
-               make_bad_inode(&inode->v);
-               iput(&inode->v);
-
-               inode = old;
-       } else {
-               mutex_lock(&c->vfs_inodes_lock);
-               list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-               mutex_unlock(&c->vfs_inodes_lock);
-               /*
-                * we really don't want insert_inode_locked2() to be setting
-                * I_NEW...
-                */
-               unlock_new_inode(&inode->v);
-       }
-
+       inode = bch2_inode_insert(c, inode);
        bch2_trans_put(trans);
 err:
        posix_acl_release(default_acl);
@@ -352,23 +365,78 @@ err_trans:
 
 /* methods */
 
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+                       subvol_inum dir, struct bch_hash_info *dir_hash_info,
+                       const struct qstr *name)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter dirent_iter = {};
+       subvol_inum inum = {};
+
+       int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+                                  dir_hash_info, dir, name, 0);
+       if (ret)
+               return ERR_PTR(ret);
+
+       struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+       if (ret > 0)
+               ret = -ENOENT;
+       if (ret)
+               goto err;
+
+       struct bch_inode_info *inode =
+               to_bch_ei(ilookup5_nowait(c->vfs_sb,
+                                         bch2_inode_hash(inum),
+                                         bch2_iget5_test,
+                                         &inum));
+       if (inode)
+               goto out;
+
+       struct bch_subvolume subvol;
+       struct bch_inode_unpacked inode_u;
+       ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+               bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+               PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+       if (bch2_err_matches(ret, ENOENT)) {
+               struct printbuf buf = PRINTBUF;
+
+               bch2_bkey_val_to_text(&buf, c, k);
+               bch_err(c, "%s points to missing inode", buf.buf);
+               printbuf_exit(&buf);
+       }
+       if (ret)
+               goto err;
+
+       bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+       inode = bch2_inode_insert(c, inode);
+out:
+       bch2_trans_iter_exit(trans, &dirent_iter);
+       return inode;
+err:
+       inode = ERR_PTR(ret);
+       goto out;
+}
+
 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
                                  unsigned int flags)
 {
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-       struct inode *vinode = NULL;
-       subvol_inum inum = { .subvol = 1 };
-       int ret;
 
-       ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
-                                &dentry->d_name, &inum);
-
-       if (!ret)
-               vinode = bch2_vfs_inode_get(c, inum);
+       struct bch_inode_info *inode;
+       bch2_trans_do(c, NULL, NULL, 0,
+               PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+                                                         &hash, &dentry->d_name)));
+       if (IS_ERR(inode))
+               inode = NULL;
 
-       return d_splice_alias(vinode, dentry);
+       return d_splice_alias(&inode->v, dentry);
 }
 
 static int bch2_mknod(struct mnt_idmap *idmap,
@@ -1371,6 +1439,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
                                struct bch_inode_unpacked *bi,
                                struct bch_subvolume *subvol)
 {
+       bch2_iget5_set(&inode->v, &inum);
        bch2_inode_update_after_write(trans, inode, bi, ~0);
 
        if (BCH_SUBVOLUME_SNAP(subvol))
@@ -1571,7 +1640,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
         * number:
         */
        u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-       u64 fsid;
 
        buf->f_type     = BCACHEFS_STATFS_MAGIC;
        buf->f_bsize    = sb->s_blocksize;
@@ -1582,10 +1650,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_files    = usage.nr_inodes + avail_inodes;
        buf->f_ffree    = avail_inodes;
 
-       fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
-              le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
-       buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
-       buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+       buf->f_fsid     = uuid_to_fsid(c->sb.user_uuid.b);
        buf->f_namelen  = BCH_NAME_MAX;
 
        return 0;
@@ -1881,6 +1946,7 @@ got_sb:
        sb->s_time_gran         = c->sb.nsec_per_time_unit;
        sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
        sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
+       sb->s_uuid              = c->sb.user_uuid;
        c->vfs_sb               = sb;
        strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 
index 7e82c7bc0ca82289b36e4f32be5e0ce343874748..e4a8a14c46bc922983e91edcdc9ece6fe717d3eb 100644 (file)
@@ -5,7 +5,6 @@
 #include "btree_cache.h"
 #include "btree_update.h"
 #include "buckets.h"
-#include "darray.h"
 #include "dirent.h"
 #include "error.h"
 #include "fs-common.h"
@@ -18,6 +17,7 @@
 #include "xattr.h"
 
 #include <linux/bsearch.h>
+#include <linux/darray.h>
 #include <linux/dcache.h> /* struct qstr */
 
 /*
@@ -100,8 +100,8 @@ err:
 }
 
 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
-                         struct bch_inode_unpacked *inode,
-                         u32 *snapshot)
+                       struct bch_inode_unpacked *inode,
+                       u32 *snapshot)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -123,17 +123,15 @@ err:
        return ret;
 }
 
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
                           struct bch_hash_info hash_info,
                           subvol_inum dir, struct qstr *name,
-                          u64 *target, unsigned *type)
+                          u64 *target, unsigned *type, u32 snapshot)
 {
        struct btree_iter iter;
        struct bkey_s_c_dirent d;
-       int ret;
-
-       ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
-                              &hash_info, dir, name, 0);
+       int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+                              &hash_info, dir, name, 0, snapshot);
        if (ret)
                return ret;
 
@@ -144,34 +142,6 @@ static int __lookup_dirent(struct btree_trans *trans,
        return 0;
 }
 
-static int __write_inode(struct btree_trans *trans,
-                        struct bch_inode_unpacked *inode,
-                        u32 snapshot)
-{
-       struct bkey_inode_buf *inode_p =
-               bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
-       if (IS_ERR(inode_p))
-               return PTR_ERR(inode_p);
-
-       bch2_inode_pack(inode_p, inode);
-       inode_p->inode.k.p.snapshot = snapshot;
-
-       return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
-                               &inode_p->inode.k_i,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
-                           struct bch_inode_unpacked *inode,
-                           u32 snapshot)
-{
-       int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                           __write_inode(trans, inode, snapshot));
-       bch_err_fn(trans->c, ret);
-       return ret;
-}
-
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 {
        struct bch_fs *c = trans->c;
@@ -224,15 +194,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 
        struct bch_inode_unpacked root_inode;
        struct bch_hash_info root_hash_info;
-       ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot);
+       u32 root_inode_snapshot = snapshot;
+       ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
        bch_err_msg(c, ret, "looking up root inode");
        if (ret)
                return ret;
 
        root_hash_info = bch2_hash_info_init(c, &root_inode);
 
-       ret = __lookup_dirent(trans, root_hash_info, root_inum,
-                             &lostfound_str, &inum, &d_type);
+       ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+                             &lostfound_str, &inum, &d_type, snapshot);
        if (bch2_err_matches(ret, ENOENT))
                goto create_lostfound;
 
@@ -250,7 +221,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
         * shouldn't exist here:
         */
        ret = lookup_inode(trans, inum, lostfound, &snapshot);
-       bch_err_msg(c, ret, "looking up lost+found");
+       bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+                   inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
        return ret;
 
 create_lostfound:
@@ -312,7 +284,7 @@ static int reattach_inode(struct btree_trans *trans,
        if (S_ISDIR(inode->bi_mode)) {
                lostfound.bi_nlink++;
 
-               ret = __write_inode(trans, &lostfound, U32_MAX);
+               ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
                if (ret)
                        return ret;
        }
@@ -334,7 +306,7 @@ static int reattach_inode(struct btree_trans *trans,
        inode->bi_dir           = lostfound.bi_inum;
        inode->bi_dir_offset    = dir_offset;
 
-       return __write_inode(trans, inode, inode_snapshot);
+       return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -722,7 +694,7 @@ static int hash_redo_key(struct btree_trans *trans,
        delete->k.p = k_iter->pos;
        return  bch2_btree_iter_traverse(k_iter) ?:
                bch2_trans_update(trans, k_iter, delete, 0) ?:
-               bch2_hash_set_snapshot(trans, desc, hash_info,
+               bch2_hash_set_in_snapshot(trans, desc, hash_info,
                                       (subvol_inum) { 0, k.k->p.inode },
                                       k.k->p.snapshot, tmp,
                                       BCH_HASH_SET_MUST_CREATE,
@@ -861,7 +833,8 @@ static int check_inode(struct btree_trans *trans,
 
                u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
 
-               ret = __write_inode(trans, &u, iter->pos.snapshot);
+               ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+
                bch_err_msg(c, ret, "in fsck updating inode");
                if (ret)
                        return ret;
@@ -950,8 +923,33 @@ static int check_inode(struct btree_trans *trans,
                do_update = true;
        }
 
+       if (u.bi_subvol) {
+               struct bch_subvolume s;
+
+               ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+               if (ret && !bch2_err_matches(ret, ENOENT))
+                       goto err;
+
+               if (fsck_err_on(ret,
+                               c, inode_bi_subvol_missing,
+                               "inode %llu:%u bi_subvol points to missing subvolume %u",
+                               u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+                   fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+                               !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+                                                          k.k->p.snapshot),
+                               c, inode_bi_subvol_wrong,
+                               "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+                               u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+                               le64_to_cpu(s.inode),
+                               le32_to_cpu(s.snapshot))) {
+                       u.bi_subvol = 0;
+                       u.bi_parent_subvol = 0;
+                       do_update = true;
+               }
+       }
+
        if (do_update) {
-               ret = __write_inode(trans, &u, iter->pos.snapshot);
+               ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
                bch_err_msg(c, ret, "in fsck updating inode");
                if (ret)
                        return ret;
@@ -1032,7 +1030,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
                                w->last_pos.inode, i->snapshot,
                                i->inode.bi_sectors, i->count)) {
                        i->inode.bi_sectors = i->count;
-                       ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+                       ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
                }
@@ -1481,7 +1479,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
                                "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
                                w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
                        i->inode.bi_nlink = i->count;
-                       ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+                       ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
                }
@@ -1491,16 +1489,15 @@ fsck_err:
        return ret ?: trans_was_restarted(trans, restart_count);
 }
 
-static int check_dirent_target(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bkey_s_c_dirent d,
-                              struct bch_inode_unpacked *target,
-                              u32 target_snapshot)
+static int check_inode_backpointer(struct btree_trans *trans,
+                                  struct btree_iter *iter,
+                                  struct bkey_s_c_dirent d,
+                                  struct bch_inode_unpacked *target,
+                                  u32 target_snapshot)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_i_dirent *n;
-       struct printbuf buf = PRINTBUF;
        struct btree_iter bp_iter = { NULL };
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        if (!target->bi_dir &&
@@ -1508,7 +1505,7 @@ static int check_dirent_target(struct btree_trans *trans,
                target->bi_dir          = d.k->p.inode;
                target->bi_dir_offset   = d.k->p.offset;
 
-               ret = __write_inode(trans, target, target_snapshot);
+               ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
                if (ret)
                        goto err;
        }
@@ -1548,7 +1545,7 @@ static int check_dirent_target(struct btree_trans *trans,
                        target->bi_nlink++;
                        target->bi_flags &= ~BCH_INODE_unlinked;
 
-                       ret = __write_inode(trans, target, target_snapshot);
+                       ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
                        if (ret)
                                goto err;
                }
@@ -1566,11 +1563,34 @@ static int check_dirent_target(struct btree_trans *trans,
                        target->bi_dir          = d.k->p.inode;
                        target->bi_dir_offset   = d.k->p.offset;
 
-                       ret = __write_inode(trans, target, target_snapshot);
+                       ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
                        if (ret)
                                goto err;
                }
        }
+out:
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       printbuf_exit(&buf);
+       bch_err_fn(c, ret);
+       return ret;
+}
+
+static int check_dirent_target(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c_dirent d,
+                              struct bch_inode_unpacked *target,
+                              u32 target_snapshot)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i_dirent *n;
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       ret = check_inode_backpointer(trans, iter, d, target, target_snapshot);
+       if (ret)
+               goto err;
 
        if (fsck_err_on(d.v->d_type != inode_d_type(target),
                        c, dirent_d_type_wrong,
@@ -1614,15 +1634,65 @@ static int check_dirent_target(struct btree_trans *trans,
 
                d = dirent_i_to_s_c(n);
        }
-out:
 err:
 fsck_err:
-       bch2_trans_iter_exit(trans, &bp_iter);
        printbuf_exit(&buf);
        bch_err_fn(c, ret);
        return ret;
 }
 
+static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter,
+                              struct bkey_s_c_dirent d)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_inode_unpacked subvol_root;
+       u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+       u32 target_snapshot;
+       u64 target_inum;
+       int ret = 0;
+
+       ret = subvol_lookup(trans, target_subvol,
+                             &target_snapshot, &target_inum);
+       if (ret && !bch2_err_matches(ret, ENOENT))
+               return ret;
+
+       if (fsck_err_on(ret, c, dirent_to_missing_subvol,
+                       "dirent points to missing subvolume %u",
+                       le32_to_cpu(d.v->d_child_subvol)))
+               return __remove_dirent(trans, d.k->p);
+
+       ret = lookup_inode(trans, target_inum,
+                          &subvol_root, &target_snapshot);
+       if (ret && !bch2_err_matches(ret, ENOENT))
+               return ret;
+
+       if (fsck_err_on(ret, c, subvol_to_missing_root,
+                       "subvolume %u points to missing subvolume root %llu",
+                       target_subvol,
+                       target_inum)) {
+               bch_err(c, "repair not implemented yet");
+               return -EINVAL;
+       }
+
+       if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+                       c, subvol_root_wrong_bi_subvol,
+                       "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+                       target_inum,
+                       subvol_root.bi_subvol, target_subvol)) {
+               subvol_root.bi_subvol = target_subvol;
+               ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+               if (ret)
+                       return ret;
+       }
+
+       ret = check_dirent_target(trans, iter, d, &subvol_root,
+                                 target_snapshot);
+       if (ret)
+               return ret;
+fsck_err:
+       return ret;
+}
+
 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                        struct bkey_s_c k,
                        struct bch_hash_info *hash_info,
@@ -1707,50 +1777,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
        d = bkey_s_c_to_dirent(k);
 
        if (d.v->d_type == DT_SUBVOL) {
-               struct bch_inode_unpacked subvol_root;
-               u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
-               u32 target_snapshot;
-               u64 target_inum;
-
-               ret = subvol_lookup(trans, target_subvol,
-                                     &target_snapshot, &target_inum);
-               if (ret && !bch2_err_matches(ret, ENOENT))
-                       goto err;
-
-               if (fsck_err_on(ret, c, dirent_to_missing_subvol,
-                               "dirent points to missing subvolume %u",
-                               le32_to_cpu(d.v->d_child_subvol))) {
-                       ret = __remove_dirent(trans, d.k->p);
-                       goto err;
-               }
-
-               ret = lookup_inode(trans, target_inum,
-                                  &subvol_root, &target_snapshot);
-               if (ret && !bch2_err_matches(ret, ENOENT))
-                       goto err;
-
-               if (fsck_err_on(ret, c, subvol_to_missing_root,
-                               "subvolume %u points to missing subvolume root %llu",
-                               target_subvol,
-                               target_inum)) {
-                       bch_err(c, "repair not implemented yet");
-                       ret = -EINVAL;
-                       goto err;
-               }
-
-               if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
-                               c, subvol_root_wrong_bi_subvol,
-                               "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
-                               target_inum,
-                               subvol_root.bi_subvol, target_subvol)) {
-                       subvol_root.bi_subvol = target_subvol;
-                       ret = __write_inode(trans, &subvol_root, target_snapshot);
-                       if (ret)
-                               goto err;
-               }
-
-               ret = check_dirent_target(trans, iter, d, &subvol_root,
-                                         target_snapshot);
+               ret = check_subvol_dirent(trans, iter, d);
                if (ret)
                        goto err;
        } else {
@@ -1776,12 +1803,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                        if (ret)
                                goto err;
                }
-       }
-
-       if (d.v->d_type == DT_DIR)
-               for_each_visible_inode(c, s, dir, equiv.snapshot, i)
-                       i->count++;
 
+               if (d.v->d_type == DT_DIR)
+                       for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+                               i->count++;
+       }
 out:
 err:
 fsck_err:
@@ -1919,7 +1945,7 @@ static int check_root_trans(struct btree_trans *trans)
                                0, NULL);
                root_inode.bi_inum = inum;
 
-               ret = __write_inode(trans, &root_inode, snapshot);
+               ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
                bch_err_msg(c, ret, "writing root inode");
        }
 err:
@@ -2291,7 +2317,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
                        u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
                        bch2_inode_nlink_get(&u), link->count)) {
                bch2_inode_nlink_set(&u, link->count);
-               ret = __write_inode(trans, &u, k.k->p.snapshot);
+               ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
        }
 fsck_err:
        return ret;
index 086f0090b03a4015388dce49388ba5951940cb0a..dbe37ccc751958d351d622bcb145b56150fb9629 100644 (file)
@@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
        return bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+int bch2_inode_peek_nowarn(struct btree_trans *trans,
                    struct btree_iter *iter,
                    struct bch_inode_unpacked *inode,
                    subvol_inum inum, unsigned flags)
@@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
        return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
 }
 
+int __bch2_fsck_write_inode(struct btree_trans *trans,
+                        struct bch_inode_unpacked *inode,
+                        u32 snapshot)
+{
+       struct bkey_inode_buf *inode_p =
+               bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+       if (IS_ERR(inode_p))
+               return PTR_ERR(inode_p);
+
+       bch2_inode_pack(inode_p, inode);
+       inode_p->inode.k.p.snapshot = snapshot;
+
+       return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+                               &inode_p->inode.k_i,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans,
+                           struct bch_inode_unpacked *inode,
+                           u32 snapshot)
+{
+       int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                           __bch2_fsck_write_inode(trans, inode, snapshot));
+       bch_err_fn(trans->c, ret);
+       return ret;
+}
+
 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
 {
        struct bch_inode_unpacked u;
index b63f312581cfa5ea9975fae6fdcd2d1518d13d54..9a9353c001c2a5fa62e80dc1e2b2705cc8534ab5 100644 (file)
@@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
+int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
+                   struct bch_inode_unpacked *, subvol_inum, unsigned);
 int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
                    struct bch_inode_unpacked *, subvol_inum, unsigned);
 
@@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
        return bch2_inode_write_flags(trans, iter, inode, 0);
 }
 
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+
 void bch2_inode_init_early(struct bch_fs *,
                           struct bch_inode_unpacked *);
 void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
index 3c574d8873a1e209dc7f7f48faacf9928f8a1272..dce136cd227132b68a802c76b173dc3eae95e552 100644 (file)
@@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
                container_of(wop, struct promote_op, write.op);
        struct bch_fs *c = op->write.op.c;
 
-       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+       time_stats_update(&c->times[BCH_TIME_data_promote],
                               op->start_time);
        promote_free(c, op);
 }
@@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 static void bch2_rbio_done(struct bch_read_bio *rbio)
 {
        if (rbio->start_time)
-               bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+               time_stats_update(&rbio->c->times[BCH_TIME_data_read],
                                       rbio->start_time);
        bio_endio(&rbio->bio);
 }
index ef3a53f9045af2591ab1f9e272dd9d6151250444..13b3514d86511db827370d0d0020a7aa07cd7c27 100644 (file)
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 
        bch2_congested_acct(ca, io_latency, now, rw);
 
-       __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+       __time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
 }
 
 #endif
@@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)
 
        EBUG_ON(op->open_buckets.nr);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+       time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
        bch2_disk_reservation_put(c, &op->res);
 
        if (!(op->flags & BCH_WRITE_MOVE))
index bc890776eb57933a5931edd2a2f07570f52b7ab3..214c8030048292430b07721bd04bac8ea3c44f50 100644 (file)
@@ -27,6 +27,26 @@ static const char * const bch2_journal_errors[] = {
        NULL
 };
 
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+       return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+       return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+       return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+       return __journal_entry_is_open(j->reservations);
+}
+
 static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
 {
        union journal_res_state s = READ_ONCE(j->reservations);
@@ -54,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
        prt_printf(out, "%li jiffies", buf->expires - jiffies);
        prt_newline(out);
 
+       if (buf->write_done)
+               prt_printf(out, "write done\n");
+       else if (buf->write_allocated)
+               prt_printf(out, "write allocated\n");
+       else if (buf->write_started)
+               prt_printf(out, "write started\n");
+
        printbuf_indent_sub(out, 2);
 }
 
@@ -66,26 +93,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
             seq <= journal_cur_seq(j);
             seq++)
                bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
-       return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
-       return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
-       return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
-       return __journal_entry_is_open(j->reservations);
+       prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
 }
 
 static inline struct journal_buf *
@@ -174,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
        return stuck;
 }
 
+void bch2_journal_do_writes(struct journal *j)
+{
+       for (u64 seq = journal_last_unwritten_seq(j);
+            seq <= journal_cur_seq(j);
+            seq++) {
+               unsigned idx = seq & JOURNAL_BUF_MASK;
+               struct journal_buf *w = j->buf + idx;
+
+               if (w->write_started && !w->write_allocated)
+                       break;
+               if (w->write_started)
+                       continue;
+
+               if (!journal_state_count(j->reservations, idx)) {
+                       w->write_started = true;
+                       closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+               }
+
+               break;
+       }
+}
+
 /*
  * Final processing when the last reference of a journal buffer has been
  * dropped. Drop the pin list reference acquired at journal entry open and write
  * the buffer, if requested.
  */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
 {
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
        lockdep_assert_held(&j->lock);
 
        if (__bch2_journal_pin_put(j, seq))
                bch2_journal_reclaim_fast(j);
-       if (write)
-               closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+       bch2_journal_do_writes(j);
 }
 
 /*
@@ -380,11 +407,14 @@ static int journal_entry_open(struct journal *j)
        BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
 
        bkey_extent_init(&buf->key);
-       buf->noflush    = false;
-       buf->must_flush = false;
-       buf->separate_flush = false;
-       buf->flush_time = 0;
+       buf->noflush            = false;
+       buf->must_flush         = false;
+       buf->separate_flush     = false;
+       buf->flush_time         = 0;
        buf->need_flush_to_write_buffer = true;
+       buf->write_started      = false;
+       buf->write_allocated    = false;
+       buf->write_done         = false;
 
        memset(buf->data, 0, sizeof(*buf->data));
        buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
@@ -418,9 +448,10 @@ static int journal_entry_open(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       mod_delayed_work(c->io_complete_wq,
-                        &j->write_work,
-                        msecs_to_jiffies(c->opts.journal_flush_delay));
+       if (nr_unwritten_journal_entries(j) == 1)
+               mod_delayed_work(j->wq,
+                                &j->write_work,
+                                msecs_to_jiffies(c->opts.journal_flush_delay));
        journal_wake(j);
 
        if (j->early_journal_entries.nr)
@@ -445,20 +476,16 @@ static void journal_quiesce(struct journal *j)
 static void journal_write_work(struct work_struct *work)
 {
        struct journal *j = container_of(work, struct journal, write_work.work);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       long delta;
 
        spin_lock(&j->lock);
-       if (!__journal_entry_is_open(j->reservations))
-               goto unlock;
-
-       delta = journal_cur_buf(j)->expires - jiffies;
+       if (__journal_entry_is_open(j->reservations)) {
+               long delta = journal_cur_buf(j)->expires - jiffies;
 
-       if (delta > 0)
-               mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
-       else
-               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+               if (delta > 0)
+                       mod_delayed_work(j->wq, &j->write_work, delta);
+               else
+                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+       }
        spin_unlock(&j->lock);
 }
 
@@ -473,33 +500,32 @@ retry:
        if (journal_res_get_fast(j, res, flags))
                return 0;
 
-       if (bch2_journal_error(j))
-               return -BCH_ERR_erofs_journal_err;
+       if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+               ret = JOURNAL_ERR_journal_full;
+               can_discard = j->can_discard;
+               goto out;
+       }
 
-       spin_lock(&j->lock);
+       if (j->blocked)
+               return -BCH_ERR_journal_res_get_blocked;
 
-       /* check once more in case somebody else shut things down... */
-       if (bch2_journal_error(j)) {
-               spin_unlock(&j->lock);
+       if (bch2_journal_error(j))
                return -BCH_ERR_erofs_journal_err;
+
+       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+               ret = JOURNAL_ERR_max_in_flight;
+               goto out;
        }
 
+       spin_lock(&j->lock);
+
        /*
         * Recheck after taking the lock, so we don't race with another thread
         * that just did journal_entry_open() and call bch2_journal_entry_close()
         * unnecessarily
         */
        if (journal_res_get_fast(j, res, flags)) {
-               spin_unlock(&j->lock);
-               return 0;
-       }
-
-       if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
-               /*
-                * Don't want to close current journal entry, just need to
-                * invoke reclaim:
-                */
-               ret = JOURNAL_ERR_journal_full;
+               ret = 0;
                goto unlock;
        }
 
@@ -515,30 +541,30 @@ retry:
                j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
 
        __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
-       ret = journal_entry_open(j);
-
-       if (ret == JOURNAL_ERR_max_in_flight) {
-               track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
-                                  &j->max_in_flight_start, true);
-               if (trace_journal_entry_full_enabled()) {
-                       struct printbuf buf = PRINTBUF;
-                       buf.atomic++;
-
-                       bch2_journal_bufs_to_text(&buf, j);
-                       trace_journal_entry_full(c, buf.buf);
-                       printbuf_exit(&buf);
-               }
-               count_event(c, journal_entry_full);
-       }
+       ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
 unlock:
        can_discard = j->can_discard;
        spin_unlock(&j->lock);
-
-       if (!ret)
+out:
+       if (ret == JOURNAL_ERR_retry)
                goto retry;
+       if (!ret)
+               return 0;
+
        if (journal_error_check_stuck(j, ret, flags))
                ret = -BCH_ERR_journal_res_get_blocked;
 
+       if (ret == JOURNAL_ERR_max_in_flight &&
+           track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+               struct printbuf buf = PRINTBUF;
+               prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+               bch2_journal_bufs_to_text(&buf, j);
+               trace_journal_entry_full(c, buf.buf);
+               printbuf_exit(&buf);
+               count_event(c, journal_entry_full);
+       }
+
        /*
         * Journal is full - can't rely on reclaim from work item due to
         * freezing:
@@ -727,7 +753,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
        ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
        if (!ret)
-               bch2_time_stats_update(j->flush_seq_time, start_time);
+               time_stats_update(j->flush_seq_time, start_time);
 
        return ret ?: ret2 < 0 ? ret2 : 0;
 }
@@ -1157,7 +1183,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
        struct journal_replay *i, **_i;
        struct genradix_iter iter;
        bool had_entries = false;
-       unsigned ptr;
        u64 last_seq = cur_seq, nr, seq;
 
        genradix_for_each_reverse(&c->journal_entries, iter, _i) {
@@ -1211,8 +1236,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
                p = journal_seq_pin(j, seq);
 
                p->devs.nr = 0;
-               for (ptr = 0; ptr < i->nr_ptrs; ptr++)
-                       bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+               darray_for_each(i->ptrs, ptr)
+                       bch2_dev_list_add_dev(&p->devs, ptr->dev);
 
                had_entries = true;
        }
@@ -1240,13 +1265,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 
 void bch2_dev_journal_exit(struct bch_dev *ca)
 {
-       kfree(ca->journal.bio);
-       kfree(ca->journal.buckets);
-       kfree(ca->journal.bucket_seq);
+       struct journal_device *ja = &ca->journal;
+
+       for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+               kfree(ja->bio[i]);
+               ja->bio[i] = NULL;
+       }
 
-       ca->journal.bio         = NULL;
-       ca->journal.buckets     = NULL;
-       ca->journal.bucket_seq  = NULL;
+       kfree(ja->buckets);
+       kfree(ja->bucket_seq);
+       ja->buckets     = NULL;
+       ja->bucket_seq  = NULL;
 }
 
 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1285,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
                bch2_sb_field_get(sb, journal);
        struct bch_sb_field_journal_v2 *journal_buckets_v2 =
                bch2_sb_field_get(sb, journal_v2);
-       unsigned i, nr_bvecs;
 
        ja->nr = 0;
 
        if (journal_buckets_v2) {
                unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
 
-               for (i = 0; i < nr; i++)
+               for (unsigned i = 0; i < nr; i++)
                        ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
        } else if (journal_buckets) {
                ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1301,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
        if (!ja->bucket_seq)
                return -BCH_ERR_ENOMEM_dev_journal_init;
 
-       nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+       unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
-       ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
-       if (!ca->journal.bio)
-               return -BCH_ERR_ENOMEM_dev_journal_init;
+       for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+               ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+                                    nr_bvecs), GFP_KERNEL);
+               if (!ja->bio[i])
+                       return -BCH_ERR_ENOMEM_dev_journal_init;
 
-       bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+               ja->bio[i]->ca = ca;
+               ja->bio[i]->buf_idx = i;
+               bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+       }
 
        ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
        if (!ja->buckets)
@@ -1287,14 +1320,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
        if (journal_buckets_v2) {
                unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-               unsigned j, dst = 0;
+               unsigned dst = 0;
 
-               for (i = 0; i < nr; i++)
-                       for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+               for (unsigned i = 0; i < nr; i++)
+                       for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
                                ja->buckets[dst++] =
                                        le64_to_cpu(journal_buckets_v2->d[i].start) + j;
        } else if (journal_buckets) {
-               for (i = 0; i < ja->nr; i++)
+               for (unsigned i = 0; i < ja->nr; i++)
                        ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
        }
 
@@ -1303,19 +1336,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-       unsigned i;
+       if (j->wq)
+               destroy_workqueue(j->wq);
 
        darray_exit(&j->early_journal_entries);
 
-       for (i = 0; i < ARRAY_SIZE(j->buf); i++)
-               kvpfree(j->buf[i].data, j->buf[i].buf_size);
+       for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+               kvfree(j->buf[i].data);
        free_fifo(&j->pin);
 }
 
 int bch2_fs_journal_init(struct journal *j)
 {
        static struct lock_class_key res_key;
-       unsigned i;
 
        mutex_init(&j->buf_lock);
        spin_lock_init(&j->lock);
@@ -1336,14 +1369,20 @@ int bch2_fs_journal_init(struct journal *j)
        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
                return -BCH_ERR_ENOMEM_journal_pin_fifo;
 
-       for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+       for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
                j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
-               j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+               j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
                if (!j->buf[i].data)
                        return -BCH_ERR_ENOMEM_journal_buf;
+               j->buf[i].idx = i;
        }
 
        j->pin.front = j->pin.back = 1;
+
+       j->wq = alloc_workqueue("bcachefs_journal",
+                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+       if (!j->wq)
+               return -BCH_ERR_ENOMEM_fs_other_alloc;
        return 0;
 }
 
@@ -1455,7 +1494,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *pin;
-       unsigned i;
 
        spin_lock(&j->lock);
        *seq = max(*seq, j->pin.front);
@@ -1473,7 +1511,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
        prt_newline(out);
        printbuf_indent_add(out, 2);
 
-       for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+       for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
                list_for_each_entry(pin, &pin_list->list[i], list) {
                        prt_printf(out, "\t%px %ps", pin, pin->flush);
                        prt_newline(out);
index 4544ce24bb8a654e62be91c5d7e0242e51893c1c..7c7528f839c567f5d1398cbdf890a5433818253d 100644 (file)
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
 }
 
 bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
 
 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
 {
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
 
        s = journal_state_buf_put(j, idx);
        if (!journal_state_count(s, idx))
-               bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+               bch2_journal_buf_put_final(j, seq);
 }
 
 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
        s = journal_state_buf_put(j, idx);
        if (!journal_state_count(s, idx)) {
                spin_lock(&j->lock);
-               bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+               bch2_journal_buf_put_final(j, seq);
                spin_unlock(&j->lock);
        }
 }
index bfd6585e746da45880da9b5ad8fb502586cbf933..e31e215ff66d59c5ec7e46b5d312573df46f1fd4 100644 (file)
 #include "sb-clean.h"
 #include "trace.h"
 
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                              struct journal_replay *j)
+{
+       darray_for_each(j->ptrs, i) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+               u64 offset;
+
+               div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
+
+               if (i != j->ptrs.data)
+                       prt_printf(out, " ");
+               prt_printf(out, "%u:%u:%u (sector %llu)",
+                          i->dev, i->bucket, i->bucket_offset, i->sector);
+       }
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+                                       struct journal_replay *j)
+{
+       prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+       bch2_journal_ptrs_to_text(out, c, j);
+
+       struct jset_entry *entry;
+       for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+               struct jset_entry_datetime *datetime =
+                       container_of(entry, struct jset_entry_datetime, entry);
+               bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+               break;
+       }
+}
+
 static struct nonce journal_nonce(const struct jset *jset)
 {
        return (struct nonce) {{
@@ -52,8 +84,7 @@ static void __journal_replay_free(struct bch_fs *c,
 
        BUG_ON(*p != i);
        *p = NULL;
-       kvpfree(i, offsetof(struct journal_replay, j) +
-               vstruct_bytes(&i->j));
+       kvfree(i);
 }
 
 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -84,9 +115,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 {
        struct genradix_iter iter;
        struct journal_replay **_i, *i, *dup;
-       struct journal_ptr *ptr;
        size_t bytes = vstruct_bytes(j);
        u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+       struct printbuf buf = PRINTBUF;
        int ret = JOURNAL_ENTRY_ADD_OK;
 
        /* Is this entry older than the range we need? */
@@ -131,72 +162,61 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
         */
        dup = *_i;
        if (dup) {
-               if (bytes == vstruct_bytes(&dup->j) &&
-                   !memcmp(j, &dup->j, bytes)) {
-                       i = dup;
-                       goto found;
-               }
+               bool identical = bytes == vstruct_bytes(&dup->j) &&
+                       !memcmp(j, &dup->j, bytes);
+               bool not_identical = !identical &&
+                       entry_ptr.csum_good &&
+                       dup->csum_good;
+
+               bool same_device = false;
+               darray_for_each(dup->ptrs, ptr)
+                       if (ptr->dev == ca->dev_idx)
+                               same_device = true;
+
+               ret = darray_push(&dup->ptrs, entry_ptr);
+               if (ret)
+                       goto out;
 
-               if (!entry_ptr.csum_good) {
-                       i = dup;
-                       goto found;
-               }
+               bch2_journal_replay_to_text(&buf, c, dup);
+
+               fsck_err_on(same_device,
+                           c, journal_entry_dup_same_device,
+                           "duplicate journal entry on same device\n  %s",
+                           buf.buf);
 
-               if (!dup->csum_good)
+               fsck_err_on(not_identical,
+                           c, journal_entry_replicas_data_mismatch,
+                           "found duplicate but non identical journal entries\n  %s",
+                           buf.buf);
+
+               if (entry_ptr.csum_good && !identical)
                        goto replace;
 
-               fsck_err(c, journal_entry_replicas_data_mismatch,
-                        "found duplicate but non identical journal entries (seq %llu)",
-                        le64_to_cpu(j->seq));
-               i = dup;
-               goto found;
+               goto out;
        }
 replace:
-       i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+       i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i)
                return -BCH_ERR_ENOMEM_journal_entry_add;
 
-       i->nr_ptrs      = 0;
+       darray_init(&i->ptrs);
        i->csum_good    = entry_ptr.csum_good;
        i->ignore       = false;
        unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-       i->ptrs[i->nr_ptrs++] = entry_ptr;
 
        if (dup) {
-               if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
-                       bch_err(c, "found too many copies of journal entry %llu",
-                               le64_to_cpu(i->j.seq));
-                       dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
-               }
-
                /* The first ptr should represent the jset we kept: */
-               memcpy(i->ptrs + i->nr_ptrs,
-                      dup->ptrs,
-                      sizeof(dup->ptrs[0]) * dup->nr_ptrs);
-               i->nr_ptrs += dup->nr_ptrs;
+               darray_for_each(dup->ptrs, ptr)
+                       darray_push(&i->ptrs, *ptr);
                __journal_replay_free(c, dup);
+       } else {
+               darray_push(&i->ptrs, entry_ptr);
        }
 
        *_i = i;
-       return 0;
-found:
-       for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
-               if (ptr->dev == ca->dev_idx) {
-                       bch_err(c, "duplicate journal entry %llu on same device",
-                               le64_to_cpu(i->j.seq));
-                       goto out;
-               }
-       }
-
-       if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
-               bch_err(c, "found too many copies of journal entry %llu",
-                       le64_to_cpu(i->j.seq));
-               goto out;
-       }
-
-       i->ptrs[i->nr_ptrs++] = entry_ptr;
 out:
 fsck_err:
+       printbuf_exit(&buf);
        return ret;
 }
 
@@ -741,6 +761,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
        journal_entry_btree_keys_to_text(out, c, entry);
 }
 
+static int journal_entry_datetime_validate(struct bch_fs *c,
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
+{
+       unsigned bytes = vstruct_bytes(entry);
+       unsigned expected = 16;
+       int ret = 0;
+
+       if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+                                c, version, jset, entry,
+                                journal_entry_dev_usage_bad_size,
+                                "bad size (%u < %u)",
+                                bytes, expected)) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+fsck_err:
+       return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+                                           struct jset_entry *entry)
+{
+       struct jset_entry_datetime *datetime =
+               container_of(entry, struct jset_entry_datetime, entry);
+
+       bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, unsigned, int,
@@ -913,11 +964,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
                return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
 
        new_size = roundup_pow_of_two(new_size);
-       n = kvpmalloc(new_size, GFP_KERNEL);
+       n = kvmalloc(new_size, GFP_KERNEL);
        if (!n)
                return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
 
-       kvpfree(b->data, b->size);
+       kvfree(b->data);
        b->data = n;
        b->size = new_size;
        return 0;
@@ -1102,16 +1153,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
                if (!r)
                        continue;
 
-               for (i = 0; i < r->nr_ptrs; i++) {
-                       if (r->ptrs[i].dev == ca->dev_idx) {
-                               unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+               darray_for_each(r->ptrs, i)
+                       if (i->dev == ca->dev_idx) {
+                               unsigned wrote = bucket_remainder(ca, i->sector) +
                                        vstruct_sectors(&r->j, c->block_bits);
 
-                               ja->cur_idx = r->ptrs[i].bucket;
+                               ja->cur_idx = i->bucket;
                                ja->sectors_free = ca->mi.bucket_size - wrote;
                                goto found;
                        }
-               }
        }
 found:
        mutex_unlock(&jlist->lock);
@@ -1144,7 +1194,7 @@ found:
                ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
 out:
        bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
-       kvpfree(buf.data, buf.size);
+       kvfree(buf.data);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
        return;
@@ -1155,27 +1205,6 @@ err:
        goto out;
 }
 
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-                              struct journal_replay *j)
-{
-       unsigned i;
-
-       for (i = 0; i < j->nr_ptrs; i++) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
-               u64 offset;
-
-               div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
-               if (i)
-                       prt_printf(out, " ");
-               prt_printf(out, "%u:%u:%u (sector %llu)",
-                      j->ptrs[i].dev,
-                      j->ptrs[i].bucket,
-                      j->ptrs[i].bucket_offset,
-                      j->ptrs[i].sector);
-       }
-}
-
 int bch2_journal_read(struct bch_fs *c,
                      u64 *last_seq,
                      u64 *blacklist_seq,
@@ -1353,32 +1382,31 @@ int bch2_journal_read(struct bch_fs *c,
                        .e.data_type = BCH_DATA_journal,
                        .e.nr_required = 1,
                };
-               unsigned ptr;
 
                i = *_i;
                if (!i || i->ignore)
                        continue;
 
-               for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+               darray_for_each(i->ptrs, ptr) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
-                       if (!i->ptrs[ptr].csum_good)
-                               bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+                       if (!ptr->csum_good)
+                               bch_err_dev_offset(ca, ptr->sector,
                                                   "invalid journal checksum, seq %llu%s",
                                                   le64_to_cpu(i->j.seq),
                                                   i->csum_good ? " (had good copy on another device)" : "");
                }
 
                ret = jset_validate(c,
-                                   bch_dev_bkey_exists(c, i->ptrs[0].dev),
+                                   bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
                                    &i->j,
-                                   i->ptrs[0].sector,
+                                   i->ptrs.data[0].sector,
                                    READ);
                if (ret)
                        goto err;
 
-               for (ptr = 0; ptr < i->nr_ptrs; ptr++)
-                       replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+               darray_for_each(i->ptrs, ptr)
+                       replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
 
                bch2_replicas_entry_sort(&replicas.e);
 
@@ -1545,7 +1573,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
        if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
                return;
 
-       new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+       new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
        if (!new_buf)
                return;
 
@@ -1556,7 +1584,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
        swap(buf->buf_size,     new_size);
        spin_unlock(&j->lock);
 
-       kvpfree(new_buf, new_size);
+       kvfree(new_buf);
 }
 
 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1566,17 +1594,17 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
 
 static CLOSURE_CALLBACK(journal_write_done)
 {
-       closure_type(j, struct journal, io);
+       closure_type(w, struct journal_buf, io);
+       struct journal *j = container_of(w, struct journal, buf[w->idx]);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_replicas_padded replicas;
        union journal_res_state old, new;
-       u64 v, seq;
+       u64 v, seq = le64_to_cpu(w->data->seq);
        int err = 0;
 
-       bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
-                              ? j->flush_write_time
-                              : j->noflush_write_time, j->write_start_time);
+       time_stats_update(!JSET_NO_FLUSH(w->data)
+                         ? j->flush_write_time
+                         : j->noflush_write_time, j->write_start_time);
 
        if (!w->devs_written.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
@@ -1591,63 +1619,68 @@ static CLOSURE_CALLBACK(journal_write_done)
        if (err)
                bch2_fatal_error(c);
 
-       spin_lock(&j->lock);
-       seq = le64_to_cpu(w->data->seq);
+       closure_debug_destroy(cl);
 
+       spin_lock(&j->lock);
        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = w->devs_written;
+       if (err && (!j->err_seq || seq < j->err_seq))
+               j->err_seq      = seq;
+       w->write_done = true;
+
+       bool completed = false;
 
-       if (!err) {
-               if (!JSET_NO_FLUSH(w->data)) {
+       for (seq = journal_last_unwritten_seq(j);
+            seq <= journal_cur_seq(j);
+            seq++) {
+               w = j->buf + (seq & JOURNAL_BUF_MASK);
+               if (!w->write_done)
+                       break;
+
+               if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
                        j->flushed_seq_ondisk = seq;
                        j->last_seq_ondisk = w->last_seq;
 
                        bch2_do_discards(c);
                        closure_wake_up(&c->freelist_wait);
-
                        bch2_reset_alloc_cursors(c);
                }
-       } else if (!j->err_seq || seq < j->err_seq)
-               j->err_seq      = seq;
 
-       j->seq_ondisk           = seq;
+               j->seq_ondisk = seq;
 
-       /*
-        * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
-        * more buckets:
-        *
-        * Must come before signaling write completion, for
-        * bch2_fs_journal_stop():
-        */
-       if (j->watermark != BCH_WATERMARK_stripe)
-               journal_reclaim_kick(&c->journal);
+               /*
+                * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+                * more buckets:
+                *
+                * Must come before signaling write completion, for
+                * bch2_fs_journal_stop():
+                */
+               if (j->watermark != BCH_WATERMARK_stripe)
+                       journal_reclaim_kick(&c->journal);
 
-       /* also must come before signalling write completion: */
-       closure_debug_destroy(cl);
+               v = atomic64_read(&j->reservations.counter);
+               do {
+                       old.v = new.v = v;
+                       BUG_ON(journal_state_count(new, new.unwritten_idx));
+                       BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
 
-       v = atomic64_read(&j->reservations.counter);
-       do {
-               old.v = new.v = v;
-               BUG_ON(journal_state_count(new, new.unwritten_idx));
+                       new.unwritten_idx++;
+               } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
 
-               new.unwritten_idx++;
-       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
-                                      old.v, new.v)) != old.v);
+               completed = true;
+       }
 
-       bch2_journal_reclaim_fast(j);
-       bch2_journal_space_available(j);
+       if (completed) {
+               bch2_journal_reclaim_fast(j);
+               bch2_journal_space_available(j);
 
-       track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
-                          &j->max_in_flight_start, false);
+               track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
 
-       closure_wake_up(&w->wait);
-       journal_wake(j);
+               closure_wake_up(&w->wait);
+               journal_wake(j);
+       }
 
-       if (!journal_state_count(new, new.unwritten_idx) &&
-           journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
-               spin_unlock(&j->lock);
-               closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
-       } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+       if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
                   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
                struct journal_buf *buf = journal_cur_buf(j);
                long delta = buf->expires - jiffies;
@@ -1657,46 +1690,46 @@ static CLOSURE_CALLBACK(journal_write_done)
                 * previous entries still in flight - the current journal entry
                 * might want to be written now:
                 */
-
-               spin_unlock(&j->lock);
-               mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
-       } else {
-               spin_unlock(&j->lock);
+               mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
        }
+
+       spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
 {
-       struct bch_dev *ca = bio->bi_private;
+       struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+       struct bch_dev *ca = jbio->ca;
        struct journal *j = &ca->fs->journal;
-       struct journal_buf *w = journal_last_unwritten_buf(j);
-       unsigned long flags;
+       struct journal_buf *w = j->buf + jbio->buf_idx;
 
        if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
                               "error writing journal entry %llu: %s",
                               le64_to_cpu(w->data->seq),
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("journal")) {
+               unsigned long flags;
+
                spin_lock_irqsave(&j->err_lock, flags);
                bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
                spin_unlock_irqrestore(&j->err_lock, flags);
        }
 
-       closure_put(&j->io);
+       closure_put(&w->io);
        percpu_ref_put(&ca->io_ref);
 }
 
 static CLOSURE_CALLBACK(do_journal_write)
 {
-       closure_type(j, struct journal, io);
+       closure_type(w, struct journal_buf, io);
+       struct journal *j = container_of(w, struct journal, buf[w->idx]);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_dev *ca;
-       struct journal_buf *w = journal_last_unwritten_buf(j);
-       struct bio *bio;
        unsigned sectors = vstruct_sectors(w->data, c->block_bits);
 
        extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+               struct journal_device *ja = &ca->journal;
+
                if (!percpu_ref_tryget(&ca->io_ref)) {
                        /* XXX: fix this */
                        bch_err(c, "missing device for journal write\n");
@@ -1706,7 +1739,7 @@ static CLOSURE_CALLBACK(do_journal_write)
                this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
                             sectors);
 
-               bio = ca->journal.bio;
+               struct bio *bio = &ja->bio[w->idx]->bio;
                bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
@@ -1725,11 +1758,10 @@ static CLOSURE_CALLBACK(do_journal_write)
                trace_and_count(c, journal_write, bio);
                closure_bio_submit(bio, cl);
 
-               ca->journal.bucket_seq[ca->journal.cur_idx] =
-                       le64_to_cpu(w->data->seq);
+               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
        }
 
-       continue_at(cl, journal_write_done, c->io_complete_wq);
+       continue_at(cl, journal_write_done, j->wq);
 }
 
 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1802,6 +1834,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 
        end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
 
+       struct jset_entry_datetime *d =
+               container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+       d->entry.type   = BCH_JSET_ENTRY_datetime;
+       d->seconds      = cpu_to_le64(ktime_get_real_seconds());
+
        bch2_journal_super_entries_add_common(c, &end, seq);
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1901,16 +1938,16 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 
 CLOSURE_CALLBACK(bch2_journal_write)
 {
-       closure_type(j, struct journal, io);
+       closure_type(w, struct journal_buf, io);
+       struct journal *j = container_of(w, struct journal, buf[w->idx]);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_replicas_padded replicas;
-       struct bio *bio;
        struct printbuf journal_debug_buf = PRINTBUF;
        unsigned nr_rw_members = 0;
        int ret;
 
        BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+       BUG_ON(w->write_allocated);
 
        j->write_start_time = local_clock();
 
@@ -1954,12 +1991,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
         * bch2_journal_space_available():
         */
        w->sectors = 0;
+       w->write_allocated = true;
 
        /*
         * journal entry has been compacted and allocated, recalculate space
         * available:
         */
        bch2_journal_space_available(j);
+       bch2_journal_do_writes(j);
        spin_unlock(&j->lock);
 
        w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1983,25 +2022,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
        if (ret)
                goto err;
 
+       if (!JSET_NO_FLUSH(w->data))
+               closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
+
        if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
                for_each_rw_member(c, ca) {
                        percpu_ref_get(&ca->io_ref);
 
-                       bio = ca->journal.bio;
+                       struct journal_device *ja = &ca->journal;
+                       struct bio *bio = &ja->bio[w->idx]->bio;
                        bio_reset(bio, ca->disk_sb.bdev,
-                                 REQ_OP_WRITE|REQ_PREFLUSH);
+                                 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
                        bio->bi_end_io          = journal_write_endio;
                        bio->bi_private         = ca;
                        closure_bio_submit(bio, cl);
                }
        }
 
-       continue_at(cl, do_journal_write, c->io_complete_wq);
+       continue_at(cl, do_journal_write, j->wq);
        return;
 no_io:
-       continue_at(cl, journal_write_done, c->io_complete_wq);
+       continue_at(cl, journal_write_done, j->wq);
        return;
 err:
        bch2_fatal_error(c);
-       continue_at(cl, journal_write_done, c->io_complete_wq);
+       continue_at(cl, journal_write_done, j->wq);
 }
index c035e7c108e19012e6e4e1f708136dec27b5387c..f18b90000cc5db24f11ec8f3595557c22cce53ab 100644 (file)
@@ -2,19 +2,22 @@
 #ifndef _BCACHEFS_JOURNAL_IO_H
 #define _BCACHEFS_JOURNAL_IO_H
 
+#include <linux/darray_types.h>
+
+struct journal_ptr {
+       bool            csum_good;
+       u8              dev;
+       u32             bucket;
+       u32             bucket_offset;
+       u64             sector;
+};
+
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
  * during cache_registration
  */
 struct journal_replay {
-       struct journal_ptr {
-               bool            csum_good;
-               u8              dev;
-               u32             bucket;
-               u32             bucket_offset;
-               u64             sector;
-       }                       ptrs[BCH_REPLICAS_MAX];
-       unsigned                nr_ptrs;
+       DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
 
        bool                    csum_good;
        bool                    ignore;
@@ -62,4 +65,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
 CLOSURE_CALLBACK(bch2_journal_write);
 
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+       memset(entry, 0, u64s * sizeof(u64));
+       /*
+        * The u64s field counts from the start of data, ignoring the shared
+        * fields.
+        */
+       entry->u64s = cpu_to_le16(u64s - 1);
+
+       *end = vstruct_next(*end);
+       return entry;
+}
+
 #endif /* _BCACHEFS_JOURNAL_IO_H */
index 820d25e19e5fe3ee6a45e70f23eb74fc1d558e88..f29fd39794ac30b8bc33ea1c121d9278e99bb740 100644 (file)
@@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
                ? BCH_WATERMARK_reclaim
                : BCH_WATERMARK_stripe;
 
-       if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
-                              &j->low_on_space_start, low_on_space) ||
-           track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
-                              &j->low_on_pin_start, low_on_pin) ||
-           track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
-                              &j->write_buffer_full_start, low_on_wb))
+       if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+           track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+           track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
                trace_and_count(c, journal_full, c);
 
        swap(watermark, j->watermark);
@@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j,
                           struct journal_entry_pin *src,
                           journal_pin_flush_fn flush_fn)
 {
-       bool reclaim;
-
        spin_lock(&j->lock);
 
        u64 seq = READ_ONCE(src->seq);
@@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j,
                return;
        }
 
-       reclaim = __journal_pin_drop(j, dst);
+       bool reclaim = __journal_pin_drop(j, dst);
 
        bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
 
        if (reclaim)
                bch2_journal_reclaim_fast(j);
-       spin_unlock(&j->lock);
 
        /*
         * If the journal is currently full,  we might want to call flush_fn
         * immediately:
         */
-       journal_wake(j);
+       if (seq == journal_last_seq(j))
+               journal_wake(j);
+       spin_unlock(&j->lock);
 }
 
 void bch2_journal_pin_set(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
                          journal_pin_flush_fn flush_fn)
 {
-       bool reclaim;
-
        spin_lock(&j->lock);
 
        BUG_ON(seq < journal_last_seq(j));
 
-       reclaim = __journal_pin_drop(j, pin);
+       bool reclaim = __journal_pin_drop(j, pin);
 
        bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
 
        if (reclaim)
                bch2_journal_reclaim_fast(j);
-       spin_unlock(&j->lock);
-
        /*
         * If the journal is currently full,  we might want to call flush_fn
         * immediately:
         */
-       journal_wake(j);
+       if (seq == journal_last_seq(j))
+               journal_wake(j);
+
+       spin_unlock(&j->lock);
 }
 
 /**
index ae4fb8c3a2bc26fe937c5bc88f8b5b78143e91b0..156691c203bef6ceeeb9b57bd3a66544db1f4491 100644 (file)
@@ -2,8 +2,8 @@
 
 #include "bcachefs.h"
 #include "journal_sb.h"
-#include "darray.h"
 
+#include <linux/darray.h>
 #include <linux/sort.h>
 
 /* BCH_SB_FIELD_journal: */
index 0200e299cfbb9c210d144bb056f1e85a910fe70f..024c9b1b323f842879d923bc827ce1d7764d6e0a 100644 (file)
@@ -2,10 +2,11 @@
 
 #include "bcachefs.h"
 #include "btree_iter.h"
-#include "eytzinger.h"
 #include "journal_seq_blacklist.h"
 #include "super-io.h"
 
+#include <linux/eytzinger.h>
+
 /*
  * journal_seq_blacklist machinery:
  *
@@ -119,8 +120,7 @@ out:
        return ret ?: bch2_blacklist_table_initialize(c);
 }
 
-static int journal_seq_blacklist_table_cmp(const void *_l,
-                                          const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
 {
        const struct journal_seq_blacklist_table_entry *l = _l;
        const struct journal_seq_blacklist_table_entry *r = _r;
index 38817c7a0851592c67c591f9a7a425d58152a004..011f7a0d4ebd8cd1b88c2a9c483d33138aaff592 100644 (file)
@@ -18,6 +18,7 @@
  * the journal that are being staged or in flight.
  */
 struct journal_buf {
+       struct closure          io;
        struct jset             *data;
 
        __BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +34,14 @@ struct journal_buf {
        unsigned                disk_sectors;   /* maximum size entry could have been, if
                                                   buf_size was bigger */
        unsigned                u64s_reserved;
-       bool                    noflush;        /* write has already been kicked off, and was noflush */
-       bool                    must_flush;     /* something wants a flush */
-       bool                    separate_flush;
-       bool                    need_flush_to_write_buffer;
+       bool                    noflush:1;      /* write has already been kicked off, and was noflush */
+       bool                    must_flush:1;   /* something wants a flush */
+       bool                    separate_flush:1;
+       bool                    need_flush_to_write_buffer:1;
+       bool                    write_started:1;
+       bool                    write_allocated:1;
+       bool                    write_done:1;
+       u8                      idx;
 };
 
 /*
@@ -134,6 +139,7 @@ enum journal_flags {
 /* Reasons we may fail to get a journal reservation: */
 #define JOURNAL_ERRORS()               \
        x(ok)                           \
+       x(retry)                        \
        x(blocked)                      \
        x(max_in_flight)                \
        x(journal_full)                 \
@@ -149,6 +155,13 @@ enum journal_errors {
 
 typedef DARRAY(u64)            darray_u64;
 
+struct journal_bio {
+       struct bch_dev          *ca;
+       unsigned                buf_idx;
+
+       struct bio              bio;
+};
+
 /* Embedded in struct bch_fs */
 struct journal {
        /* Fastpath stuff up front: */
@@ -203,8 +216,8 @@ struct journal {
        wait_queue_head_t       wait;
        struct closure_waitlist async_wait;
 
-       struct closure          io;
        struct delayed_work     write_work;
+       struct workqueue_struct *wq;
 
        /* Sequence number of most recent journal entry (last entry in @pin) */
        atomic64_t              seq;
@@ -274,14 +287,9 @@ struct journal {
        u64                     nr_noflush_writes;
        u64                     entry_bytes_written;
 
-       u64                     low_on_space_start;
-       u64                     low_on_pin_start;
-       u64                     max_in_flight_start;
-       u64                     write_buffer_full_start;
-
-       struct bch2_time_stats  *flush_write_time;
-       struct bch2_time_stats  *noflush_write_time;
-       struct bch2_time_stats  *flush_seq_time;
+       struct time_stats       *flush_write_time;
+       struct time_stats       *noflush_write_time;
+       struct time_stats       *flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map      res_map;
@@ -313,7 +321,7 @@ struct journal_device {
        u64                     *buckets;
 
        /* Bio for journal reads/writes to this device */
-       struct bio              *bio;
+       struct journal_bio      *bio[JOURNAL_BUF_NR];
 
        /* for bch_journal_read_device */
        struct closure          read;
index 5623cee3ef8693413ee51d7dd521c496e90f206c..69098eeb5d48e3a06236bb7ad8aead21fcc19679 100644 (file)
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
        nr_good = bch2_bkey_durability(c, k.s_c);
        if ((!nr_good && !(flags & lost)) ||
            (nr_good < replicas && !(flags & degraded)))
-               return -EINVAL;
+               return -BCH_ERR_remove_would_lose_data;
 
        return 0;
 }
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 
        /* don't handle this yet: */
        if (flags & BCH_FORCE_IF_METADATA_LOST)
-               return -EINVAL;
+               return -BCH_ERR_remove_with_metadata_missing_unimplemented;
 
        trans = bch2_trans_get(c);
        bch2_bkey_buf_init(&k);
@@ -132,10 +132,8 @@ retry:
 
                        ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
                                            dev_idx, flags, true);
-                       if (ret) {
-                               bch_err(c, "Cannot drop device without losing data");
+                       if (ret)
                                break;
-                       }
 
                        ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
index 3c21981a4a1c09f9c70596876ef71fdef72cddcb..181efa4a83fa12be1f5ae8e657c522a6a77aee44 100644 (file)
@@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
                u64 start_time = local_clock();
 
                __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
-               bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+               time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
        }
 }
 
index cc2672c120312c39f82e9a1a9afe0ed959b15dba..75fdce373f764039df417cb4bcd7f760b5f49f61 100644 (file)
@@ -6,12 +6,15 @@
 #include "replicas.h"
 #include "super-io.h"
 
+#include <linux/sort.h>
+
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);
 
 /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r,  const void *priv)
 {
+       size_t size = (size_t) priv;
        return memcmp(l, r, size);
 }
 
@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
 
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
-       eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+       eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+                         bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
 }
 
 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
 {
        unsigned i;
 
-       sort_cmp_size(cpu_r->entries,
-                     cpu_r->nr,
-                     cpu_r->entry_size,
-                     bch2_memcmp, NULL);
+       sort_r(cpu_r->entries,
+              cpu_r->nr,
+              cpu_r->entry_size,
+              bch2_memcmp, NULL,
+              (void *)(size_t)cpu_r->entry_size);
 
        for (i = 0; i < cpu_r->nr; i++) {
                struct bch_replicas_entry_v1 *e =
index 654a4b26d3a3c96e3ac0cecb9586de15828665f1..983cce782ac2a2519620c07096ba39233812b968 100644 (file)
@@ -3,9 +3,10 @@
 #define _BCACHEFS_REPLICAS_H
 
 #include "bkey.h"
-#include "eytzinger.h"
 #include "replicas_types.h"
 
+#include <linux/eytzinger.h>
+
 void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
 void bch2_replicas_entry_to_text(struct printbuf *,
                                 struct bch_replicas_entry_v1 *);
index b6bf0ebe7e84046a5d08ade7d34bae9ae0bff3a5..5980ba2563fe9fa159ba9d87fe08ab2dc53a78fb 100644 (file)
@@ -171,22 +171,6 @@ fsck_err:
        return ERR_PTR(ret);
 }
 
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
-       struct jset_entry *entry = *end;
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
-       memset(entry, 0, u64s * sizeof(u64));
-       /*
-        * The u64s field counts from the start of data, ignoring the shared
-        * fields.
-        */
-       entry->u64s = cpu_to_le16(u64s - 1);
-
-       *end = vstruct_next(*end);
-       return entry;
-}
-
 void bch2_journal_super_entries_add_common(struct bch_fs *c,
                                           struct jset_entry **end,
                                           u64 journal_seq)
index 441dcb1bf160e917d531d1a5ea955cf0238f0844..626eaaea5b01d7923a8fc79d5bda3c48876b3c92 100644 (file)
@@ -6,12 +6,13 @@
  */
 
 #include "bcachefs.h"
-#include "darray.h"
 #include "recovery.h"
 #include "sb-downgrade.h"
 #include "sb-errors.h"
 #include "super-io.h"
 
+#include <linux/darray.h>
+
 #define RECOVERY_PASS_ALL_FSCK         BIT_ULL(63)
 
 /*
index c08aacdfd073c203e44a072363c94e89dd93eec8..63f18c7f30885e4a6f95b992e5a448b81f7fbdc4 100644 (file)
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SB_ERRORS_TYPES_H
 #define _BCACHEFS_SB_ERRORS_TYPES_H
 
-#include "darray.h"
+#include <linux/darray_types.h>
 
 #define BCH_SB_ERRS()                                                  \
        x(clean_but_journal_not_empty,                          0)      \
        x(hash_table_key_duplicate,                             242)    \
        x(hash_table_key_wrong_offset,                          243)    \
        x(unlinked_inode_not_on_deleted_list,                   244)    \
-       x(reflink_p_front_pad_bad,                              245)
+       x(reflink_p_front_pad_bad,                              245)    \
+       x(journal_entry_dup_same_device,                        246)    \
+       x(inode_bi_subvol_missing,                              247)    \
+       x(inode_bi_subvol_wrong,                                248)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,
index be0a941832715a32634b8c3dea60bbf1685a672f..e4d4d842229a6b6cff63bbaab167d4dc7512a7f9 100644 (file)
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SB_MEMBERS_H
 #define _BCACHEFS_SB_MEMBERS_H
 
-#include "darray.h"
+#include <linux/darray.h>
 
 extern char * const bch2_member_error_strs[];
 
index 89fdb7c21134ebbb6c145a88ed5b1943ab54588a..3976f80721bf1b40736a3d882c9f841fa7ab961b 100644 (file)
@@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
 }
 
 static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
                 struct btree_iter *iter,
                 const struct bch_hash_desc desc,
                 const struct bch_hash_info *info,
                 subvol_inum inum, const void *key,
-                unsigned flags)
+                unsigned flags, u32 snapshot)
 {
        struct bkey_s_c k;
-       u32 snapshot;
        int ret;
 
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               return ret;
-
        for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
                           SPOS(inum.inum, desc.hash_key(info, key), snapshot),
                           POS(inum.inum, U64_MAX),
@@ -194,6 +189,19 @@ bch2_hash_lookup(struct btree_trans *trans,
        return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
 }
 
+static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+                struct btree_iter *iter,
+                const struct bch_hash_desc desc,
+                const struct bch_hash_info *info,
+                subvol_inum inum, const void *key,
+                unsigned flags)
+{
+       u32 snapshot;
+       return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+               bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
 static __always_inline int
 bch2_hash_hole(struct btree_trans *trans,
               struct btree_iter *iter,
@@ -251,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
 }
 
 static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
                           const struct bch_hash_desc desc,
                           const struct bch_hash_info *info,
                           subvol_inum inum, u32 snapshot,
@@ -320,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
                  struct bkey_i *insert,
                  bch_str_hash_flags_t str_hash_flags)
 {
-       u32 snapshot;
-       int ret;
-
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               return ret;
-
        insert->k.p.inode = inum.inum;
 
-       return bch2_hash_set_snapshot(trans, desc, info, inum,
-                                     snapshot, insert, str_hash_flags, 0);
+       u32 snapshot;
+       return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+               bch2_hash_set_in_snapshot(trans, desc, info, inum,
+                                         snapshot, insert, str_hash_flags, 0);
 }
 
 static __always_inline
index 7c67c28d3ef88ff32d1805257faf37ebc79f0d2d..e7ee52c39990cc8dff2dafbd928a0da51f0f6d50 100644 (file)
@@ -42,6 +42,36 @@ static int check_subvol(struct btree_trans *trans,
                return ret ?: -BCH_ERR_transaction_restart_nested;
        }
 
+       struct bch_inode_unpacked inode;
+       struct btree_iter inode_iter = {};
+       ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+                                   (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+                                   0);
+       bch2_trans_iter_exit(trans, &inode_iter);
+
+       if (ret && !bch2_err_matches(ret, ENOENT))
+               return ret;
+
+       if (fsck_err_on(ret, c, subvol_to_missing_root,
+                       "subvolume %llu points to missing subvolume root %llu:%u",
+                       k.k->p.offset, le64_to_cpu(subvol.v->inode),
+                       le32_to_cpu(subvol.v->snapshot))) {
+               ret = bch2_subvolume_delete(trans, iter->pos.offset);
+               bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+               return ret ?: -BCH_ERR_transaction_restart_nested;
+       }
+
+       if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+                       c, subvol_root_wrong_bi_subvol,
+                       "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+                       inode.bi_inum, inode_iter.k.p.snapshot,
+                       inode.bi_subvol, subvol.k->p.offset)) {
+               inode.bi_subvol = subvol.k->p.offset;
+               ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+               if (ret)
+                       goto err;
+       }
+
        if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
                u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
                u32 snapshot_tree;
@@ -73,6 +103,7 @@ static int check_subvol(struct btree_trans *trans,
                }
        }
 
+err:
 fsck_err:
        return ret;
 }
index a6f56f66e27cb7699402f089ef89a2f1355077c4..3ca1d183369c5f439f42ef222cacd28be34b6c98 100644 (file)
@@ -2,7 +2,6 @@
 #ifndef _BCACHEFS_SUBVOLUME_H
 #define _BCACHEFS_SUBVOLUME_H
 
-#include "darray.h"
 #include "subvolume_types.h"
 
 enum bkey_invalid_flags;
index ae644adfc391680d85b6fe53c25f08ae9337e037..40f16e3a6dd04f1e14ae493665c07eb86e9b2737 100644 (file)
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SUBVOLUME_TYPES_H
 #define _BCACHEFS_SUBVOLUME_TYPES_H
 
-#include "darray.h"
+#include <linux/darray_types.h>
 
 typedef DARRAY(u32) snapshot_id_list;
 
index 95e80e06316bf49873d64d4dc79cc766df0023a0..f37620919e11a57f4b97c08789d593a3faae5011 100644 (file)
@@ -3,12 +3,12 @@
 #define _BCACHEFS_SUPER_IO_H
 
 #include "extents.h"
-#include "eytzinger.h"
 #include "super_types.h"
 #include "super.h"
 #include "sb-members.h"
 
 #include <asm/byteorder.h>
+#include <linux/eytzinger.h>
 
 static inline bool bch2_version_compatible(u16 version)
 {
index da8697c79a97e7d2c2f35056b707103d65618c5b..68704a86f649993fa39de399a65c60d0154678a4 100644 (file)
@@ -67,6 +67,7 @@
 #include <linux/percpu.h>
 #include <linux/random.h>
 #include <linux/sysfs.h>
+#include <linux/thread_with_file.h>
 #include <crypto/hash.h>
 
 MODULE_LICENSE("GPL");
@@ -95,16 +96,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)
        if (likely(!stdio)) {
                vprintk(fmt, args);
        } else {
-               unsigned long flags;
-
                if (fmt[0] == KERN_SOH[0])
                        fmt += 2;
 
-               spin_lock_irqsave(&stdio->output_lock, flags);
-               prt_vprintf(&stdio->output_buf, fmt, args);
-               spin_unlock_irqrestore(&stdio->output_lock, flags);
-
-               wake_up(&stdio->output_wait);
+               stdio_redirect_vprintf(stdio, true, fmt, args);
        }
        va_end(args);
 }
@@ -520,7 +515,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        unsigned i;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
-               bch2_time_stats_exit(&c->times[i]);
+               time_stats_exit(&c->times[i]);
 
        bch2_free_pending_node_rewrites(c);
        bch2_fs_sb_errors_exit(c);
@@ -576,7 +571,7 @@ static void __bch2_fs_free(struct bch_fs *c)
                destroy_workqueue(c->btree_update_wq);
 
        bch2_free_super(&c->disk_sb);
-       kvpfree(c, sizeof(*c));
+       kvfree(c);
        module_put(THIS_MODULE);
 }
 
@@ -715,7 +710,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        unsigned i, iter_size;
        int ret = 0;
 
-       c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+       c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
        if (!c) {
                c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
                goto out;
@@ -753,7 +748,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->journal_keys.initial_ref_held = true;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
-               bch2_time_stats_init(&c->times[i]);
+               time_stats_init(&c->times[i]);
 
        bch2_fs_copygc_init(c);
        bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
@@ -882,8 +877,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
-           mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
-                                       c->opts.btree_node_size) ||
+           mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+                                      c->opts.btree_node_size) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
            !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
                                              sizeof(u64), GFP_KERNEL))) {
@@ -1124,7 +1119,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
                prt_newline(&buf);
 
                prt_bdevname(&buf, fs->bdev);
-               prt_str(&buf, "believes seq of ");
+               prt_str(&buf, " believes seq of ");
                prt_bdevname(&buf, sb->bdev);
                prt_printf(&buf, " to be %llu, but ", seq_from_fs);
                prt_bdevname(&buf, sb->bdev);
@@ -1168,8 +1163,8 @@ static void bch2_dev_free(struct bch_dev *ca)
        bch2_dev_buckets_free(ca);
        free_page((unsigned long) ca->sb_read_scratch);
 
-       bch2_time_stats_exit(&ca->io_latency[WRITE]);
-       bch2_time_stats_exit(&ca->io_latency[READ]);
+       time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+       time_stats_quantiles_exit(&ca->io_latency[READ]);
 
        percpu_ref_exit(&ca->io_ref);
        percpu_ref_exit(&ca->ref);
@@ -1260,8 +1255,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 
        INIT_WORK(&ca->io_error_work, bch2_io_error_work);
 
-       bch2_time_stats_init(&ca->io_latency[READ]);
-       bch2_time_stats_init(&ca->io_latency[WRITE]);
+       time_stats_quantiles_init(&ca->io_latency[READ]);
+       time_stats_quantiles_init(&ca->io_latency[WRITE]);
 
        ca->mi = bch2_mi_to_cpu(member);
 
index cee80c47feea2b27fa7d18fc55a39228db7f0b96..c86a93a8d8fc81bbe373efcbec74f3e2563e6da5 100644 (file)
@@ -930,10 +930,10 @@ SHOW(bch2_dev)
        sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
 
        if (attr == &sysfs_io_latency_stats_read)
-               bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+               bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
 
        if (attr == &sysfs_io_latency_stats_write)
-               bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+               bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
 
        sysfs_printf(congested,                 "%u%%",
                     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c
deleted file mode 100644 (file)
index b1c867a..0000000
+++ /dev/null
@@ -1,299 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "printbuf.h"
-#include "thread_with_file.h"
-
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/poll.h>
-
-void bch2_thread_with_file_exit(struct thread_with_file *thr)
-{
-       if (thr->task) {
-               kthread_stop(thr->task);
-               put_task_struct(thr->task);
-       }
-}
-
-int bch2_run_thread_with_file(struct thread_with_file *thr,
-                             const struct file_operations *fops,
-                             int (*fn)(void *))
-{
-       struct file *file = NULL;
-       int ret, fd = -1;
-       unsigned fd_flags = O_CLOEXEC;
-
-       if (fops->read && fops->write)
-               fd_flags |= O_RDWR;
-       else if (fops->read)
-               fd_flags |= O_RDONLY;
-       else if (fops->write)
-               fd_flags |= O_WRONLY;
-
-       char name[TASK_COMM_LEN];
-       get_task_comm(name, current);
-
-       thr->ret = 0;
-       thr->task = kthread_create(fn, thr, "%s", name);
-       ret = PTR_ERR_OR_ZERO(thr->task);
-       if (ret)
-               return ret;
-
-       ret = get_unused_fd_flags(fd_flags);
-       if (ret < 0)
-               goto err;
-       fd = ret;
-
-       file = anon_inode_getfile(name, fops, thr, fd_flags);
-       ret = PTR_ERR_OR_ZERO(file);
-       if (ret)
-               goto err;
-
-       fd_install(fd, file);
-       get_task_struct(thr->task);
-       wake_up_process(thr->task);
-       return fd;
-err:
-       if (fd >= 0)
-               put_unused_fd(fd);
-       if (thr->task)
-               kthread_stop(thr->task);
-       return ret;
-}
-
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
-{
-       return thr->stdio.output_buf.pos ||
-               thr->output2.nr ||
-               thr->thr.done;
-}
-
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
-                                     size_t len, loff_t *ppos)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-       size_t copied = 0, b;
-       int ret = 0;
-
-       if ((file->f_flags & O_NONBLOCK) &&
-           !thread_with_stdio_has_output(thr))
-               return -EAGAIN;
-
-       ret = wait_event_interruptible(thr->stdio.output_wait,
-               thread_with_stdio_has_output(thr));
-       if (ret)
-               return ret;
-
-       if (thr->thr.done)
-               return 0;
-
-       while (len) {
-               ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
-               if (ret)
-                       break;
-
-               spin_lock_irq(&thr->stdio.output_lock);
-               b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
-
-               memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
-               memmove(thr->stdio.output_buf.buf,
-                       thr->stdio.output_buf.buf + b,
-                       thr->stdio.output_buf.pos - b);
-
-               thr->output2.nr += b;
-               thr->stdio.output_buf.pos -= b;
-               spin_unlock_irq(&thr->stdio.output_lock);
-
-               b = min(len, thr->output2.nr);
-               if (!b)
-                       break;
-
-               b -= copy_to_user(buf, thr->output2.data, b);
-               if (!b) {
-                       ret = -EFAULT;
-                       break;
-               }
-
-               copied  += b;
-               buf     += b;
-               len     -= b;
-
-               memmove(thr->output2.data,
-                       thr->output2.data + b,
-                       thr->output2.nr - b);
-               thr->output2.nr -= b;
-       }
-
-       return copied ?: ret;
-}
-
-static int thread_with_stdio_release(struct inode *inode, struct file *file)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       bch2_thread_with_file_exit(&thr->thr);
-       printbuf_exit(&thr->stdio.input_buf);
-       printbuf_exit(&thr->stdio.output_buf);
-       darray_exit(&thr->output2);
-       thr->exit(thr);
-       return 0;
-}
-
-#define WRITE_BUFFER           4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
-       return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
-static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
-                                      size_t len, loff_t *ppos)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-       struct printbuf *buf = &thr->stdio.input_buf;
-       size_t copied = 0;
-       ssize_t ret = 0;
-
-       while (len) {
-               if (thr->thr.done) {
-                       ret = -EPIPE;
-                       break;
-               }
-
-               size_t b = len - fault_in_readable(ubuf, len);
-               if (!b) {
-                       ret = -EFAULT;
-                       break;
-               }
-
-               spin_lock(&thr->stdio.input_lock);
-               if (buf->pos < WRITE_BUFFER)
-                       bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
-               b = min(len, printbuf_remaining_size(buf));
-
-               if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
-                       ubuf += b;
-                       len -= b;
-                       copied += b;
-                       buf->pos += b;
-               }
-               spin_unlock(&thr->stdio.input_lock);
-
-               if (b) {
-                       wake_up(&thr->stdio.input_wait);
-               } else {
-                       if ((file->f_flags & O_NONBLOCK)) {
-                               ret = -EAGAIN;
-                               break;
-                       }
-
-                       ret = wait_event_interruptible(thr->stdio.input_wait,
-                                       thread_with_stdio_has_input_space(thr));
-                       if (ret)
-                               break;
-               }
-       }
-
-       return copied ?: ret;
-}
-
-static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       poll_wait(file, &thr->stdio.output_wait, wait);
-       poll_wait(file, &thr->stdio.input_wait, wait);
-
-       __poll_t mask = 0;
-
-       if (thread_with_stdio_has_output(thr))
-               mask |= EPOLLIN;
-       if (thread_with_stdio_has_input_space(thr))
-               mask |= EPOLLOUT;
-       if (thr->thr.done)
-               mask |= EPOLLHUP|EPOLLERR;
-       return mask;
-}
-
-static const struct file_operations thread_with_stdio_fops = {
-       .release        = thread_with_stdio_release,
-       .read           = thread_with_stdio_read,
-       .write          = thread_with_stdio_write,
-       .poll           = thread_with_stdio_poll,
-       .llseek         = no_llseek,
-};
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
-                              void (*exit)(struct thread_with_stdio *),
-                              int (*fn)(void *))
-{
-       thr->stdio.input_buf = PRINTBUF;
-       thr->stdio.input_buf.atomic++;
-       spin_lock_init(&thr->stdio.input_lock);
-       init_waitqueue_head(&thr->stdio.input_wait);
-
-       thr->stdio.output_buf = PRINTBUF;
-       thr->stdio.output_buf.atomic++;
-       spin_lock_init(&thr->stdio.output_lock);
-       init_waitqueue_head(&thr->stdio.output_wait);
-
-       darray_init(&thr->output2);
-       thr->exit = exit;
-
-       return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
-}
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
-{
-       wait_event(stdio->input_wait,
-                  stdio->input_buf.pos || stdio->done);
-
-       if (stdio->done)
-               return -1;
-
-       spin_lock(&stdio->input_lock);
-       int ret = min(len, stdio->input_buf.pos);
-       stdio->input_buf.pos -= ret;
-       memcpy(buf, stdio->input_buf.buf, ret);
-       memmove(stdio->input_buf.buf,
-               stdio->input_buf.buf + ret,
-               stdio->input_buf.pos);
-       spin_unlock(&stdio->input_lock);
-
-       wake_up(&stdio->input_wait);
-       return ret;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
-{
-       wait_event(stdio->input_wait,
-                  stdio->input_buf.pos || stdio->done);
-
-       if (stdio->done)
-               return -1;
-
-       spin_lock(&stdio->input_lock);
-       int ret = min(len, stdio->input_buf.pos);
-       char *n = memchr(stdio->input_buf.buf, '\n', ret);
-       if (n)
-               ret = min(ret, n + 1 - stdio->input_buf.buf);
-       stdio->input_buf.pos -= ret;
-       memcpy(buf, stdio->input_buf.buf, ret);
-       memmove(stdio->input_buf.buf,
-               stdio->input_buf.buf + ret,
-               stdio->input_buf.pos);
-       spin_unlock(&stdio->input_lock);
-
-       wake_up(&stdio->input_wait);
-       return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/thread_with_file.h b/libbcachefs/thread_with_file.h
deleted file mode 100644 (file)
index 05879c5..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-struct task_struct;
-
-struct thread_with_file {
-       struct task_struct      *task;
-       int                     ret;
-       bool                    done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
-                             const struct file_operations *,
-                             int (*fn)(void *));
-
-struct thread_with_stdio {
-       struct thread_with_file thr;
-       struct stdio_redirect   stdio;
-       DARRAY(char)            output2;
-       void                    (*exit)(struct thread_with_stdio *);
-};
-
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
-       thr->thr.done = true;
-       thr->stdio.done = true;
-       wake_up(&thr->stdio.input_wait);
-       wake_up(&thr->stdio.output_wait);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
-                              void (*exit)(struct thread_with_stdio *),
-                              int (*fn)(void *));
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/libbcachefs/thread_with_file_types.h b/libbcachefs/thread_with_file_types.h
deleted file mode 100644 (file)
index 90b5e64..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-struct stdio_redirect {
-       spinlock_t              output_lock;
-       wait_queue_head_t       output_wait;
-       struct printbuf         output_buf;
-
-       spinlock_t              input_lock;
-       wait_queue_head_t       input_wait;
-       struct printbuf         input_buf;
-       bool                    done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
index 56b815fd9fc6ee5a541aa8e7007f3c00025c493d..539735033947b91f4bd26768d51ca3049512dd45 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/console.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/eytzinger.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/log2.h>
@@ -22,9 +23,8 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>
 
-#include "eytzinger.h"
-#include "mean_and_variance.h"
 #include "util.h"
 
 static const char si_units[] = "?kMGTPEZY";
@@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
 }
 #endif
 
-static const struct time_unit {
-       const char      *name;
-       u64             nsecs;
-} time_units[] = {
-       { "ns",         1                },
-       { "us",         NSEC_PER_USEC    },
-       { "ms",         NSEC_PER_MSEC    },
-       { "s",          NSEC_PER_SEC     },
-       { "m",          (u64) NSEC_PER_SEC * 60},
-       { "h",          (u64) NSEC_PER_SEC * 3600},
-       { "eon",        U64_MAX          },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
-       const struct time_unit *u;
-
-       for (u = time_units;
-            u + 1 < time_units + ARRAY_SIZE(time_units) &&
-            ns >= u[1].nsecs << 1;
-            u++)
-               ;
-
-       return u;
-}
-
 void bch2_pr_time_units(struct printbuf *out, u64 ns)
 {
        const struct time_unit *u = pick_time_units(ns);
@@ -370,120 +344,6 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
        prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
 }
 
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
-       unsigned i = 0;
-
-       while (i < ARRAY_SIZE(q->entries)) {
-               struct bch2_quantile_entry *e = q->entries + i;
-
-               if (unlikely(!e->step)) {
-                       e->m = v;
-                       e->step = max_t(unsigned, v / 2, 1024);
-               } else if (e->m > v) {
-                       e->m = e->m >= e->step
-                               ? e->m - e->step
-                               : 0;
-               } else if (e->m < v) {
-                       e->m = e->m + e->step > e->m
-                               ? e->m + e->step
-                               : U32_MAX;
-               }
-
-               if ((e->m > v ? e->m - v : v - e->m) < e->step)
-                       e->step = max_t(unsigned, e->step / 2, 1);
-
-               if (v >= e->m)
-                       break;
-
-               i = eytzinger0_child(i, v > e->m);
-       }
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
-                                             u64 start, u64 end)
-{
-       u64 duration, freq;
-
-       if (time_after64(end, start)) {
-               duration = end - start;
-               mean_and_variance_update(&stats->duration_stats, duration);
-               mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
-               stats->max_duration = max(stats->max_duration, duration);
-               stats->min_duration = min(stats->min_duration, duration);
-               stats->total_duration += duration;
-               bch2_quantiles_update(&stats->quantiles, duration);
-       }
-
-       if (time_after64(end, stats->last_event)) {
-               freq = end - stats->last_event;
-               mean_and_variance_update(&stats->freq_stats, freq);
-               mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
-               stats->max_freq = max(stats->max_freq, freq);
-               stats->min_freq = min(stats->min_freq, freq);
-               stats->last_event = end;
-       }
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-                                          struct bch2_time_stat_buffer *b)
-{
-       for (struct bch2_time_stat_buffer_entry *i = b->entries;
-            i < b->entries + ARRAY_SIZE(b->entries);
-            i++)
-               bch2_time_stats_update_one(stats, i->start, i->end);
-       b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-                                                 struct bch2_time_stat_buffer *b)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&stats->lock, flags);
-       __bch2_time_stats_clear_buffer(stats, b);
-       spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
-       unsigned long flags;
-
-       WARN_ONCE(!stats->duration_stats_weighted.weight ||
-                 !stats->freq_stats_weighted.weight,
-                 "uninitialized time_stats");
-
-       if (!stats->buffer) {
-               spin_lock_irqsave(&stats->lock, flags);
-               bch2_time_stats_update_one(stats, start, end);
-
-               if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
-                   stats->duration_stats.n > 1024)
-                       stats->buffer =
-                               alloc_percpu_gfp(struct bch2_time_stat_buffer,
-                                                GFP_ATOMIC);
-               spin_unlock_irqrestore(&stats->lock, flags);
-       } else {
-               struct bch2_time_stat_buffer *b;
-
-               preempt_disable();
-               b = this_cpu_ptr(stats->buffer);
-
-               BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-               b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
-                       .start = start,
-                       .end = end
-               };
-
-               if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
-                       bch2_time_stats_clear_buffer(stats, b);
-               preempt_enable();
-       }
-}
-
 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
        const struct time_unit *u = pick_time_units(ns);
@@ -503,19 +363,18 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
 
 #define TABSTOP_SIZE 12
 
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 {
-       const struct time_unit *u;
+       struct quantiles *quantiles = time_stats_to_quantiles(stats);
        s64 f_mean = 0, d_mean = 0;
-       u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
-       int i;
+       u64 f_stddev = 0, d_stddev = 0;
 
        if (stats->buffer) {
                int cpu;
 
                spin_lock_irq(&stats->lock);
                for_each_possible_cpu(cpu)
-                       __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+                       __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
                spin_unlock_irq(&stats->lock);
        }
 
@@ -570,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
        prt_tab(out);
        bch2_pr_time_units_aligned(out, d_mean);
        prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
        prt_newline(out);
 
        prt_printf(out, "stddev:");
        prt_tab(out);
        bch2_pr_time_units_aligned(out, d_stddev);
        prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
 
        printbuf_indent_sub(out, 2);
        prt_newline(out);
@@ -593,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
        prt_tab(out);
        bch2_pr_time_units_aligned(out, f_mean);
        prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
        prt_newline(out);
 
        prt_printf(out, "stddev:");
        prt_tab(out);
        bch2_pr_time_units_aligned(out, f_stddev);
        prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
 
        printbuf_indent_sub(out, 2);
        prt_newline(out);
 
        printbuf_tabstops_reset(out);
 
-       i = eytzinger0_first(NR_QUANTILES);
-       u = pick_time_units(stats->quantiles.entries[i].m);
-
-       prt_printf(out, "quantiles (%s):\t", u->name);
-       eytzinger0_for_each(i, NR_QUANTILES) {
-               bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
-               q = max(stats->quantiles.entries[i].m, last_q);
-               prt_printf(out, "%llu ",
-                      div_u64(q, u->nsecs));
-               if (is_last)
-                       prt_newline(out);
-               last_q = q;
+       if (quantiles) {
+               int i = eytzinger0_first(NR_QUANTILES);
+               const struct time_unit *u =
+                       pick_time_units(quantiles->entries[i].m);
+               u64 last_q = 0;
+
+               prt_printf(out, "quantiles (%s):\t", u->name);
+               eytzinger0_for_each(i, NR_QUANTILES) {
+                       bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+                       u64 q = max(quantiles->entries[i].m, last_q);
+                       prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+                       if (is_last)
+                               prt_newline(out);
+                       last_q = q;
+               }
        }
 }
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
-       free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
-       memset(stats, 0, sizeof(*stats));
-       stats->duration_stats_weighted.weight = 8;
-       stats->freq_stats_weighted.weight = 8;
-       stats->min_duration = U64_MAX;
-       stats->min_freq = U64_MAX;
-       spin_lock_init(&stats->lock);
-}
 
 /* ratelimit: */
 
@@ -863,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
        }
 }
 
-static int alignment_ok(const void *base, size_t align)
-{
-       return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-               ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
-       u32 t = *(u32 *)a;
-       *(u32 *)a = *(u32 *)b;
-       *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
-       u64 t = *(u64 *)a;
-       *(u64 *)a = *(u64 *)b;
-       *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
-       char t;
-
-       do {
-               t = *(char *)a;
-               *(char *)a++ = *(char *)b;
-               *(char *)b++ = t;
-       } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
-                        int (*cmp_func)(const void *, const void *, size_t),
-                        size_t l, size_t r)
-{
-       return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
-                       base + inorder_to_eytzinger0(r, n) * size,
-                       size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
-                          void (*swap_func)(void *, void *, size_t),
-                          size_t l, size_t r)
-{
-       swap_func(base + inorder_to_eytzinger0(l, n) * size,
-                 base + inorder_to_eytzinger0(r, n) * size,
-                 size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
-                    int (*cmp_func)(const void *, const void *, size_t),
-                    void (*swap_func)(void *, void *, size_t))
-{
-       int i, c, r;
-
-       if (!swap_func) {
-               if (size == 4 && alignment_ok(base, 4))
-                       swap_func = u32_swap;
-               else if (size == 8 && alignment_ok(base, 8))
-                       swap_func = u64_swap;
-               else
-                       swap_func = generic_swap;
-       }
-
-       /* heapify */
-       for (i = n / 2 - 1; i >= 0; --i) {
-               for (r = i; r * 2 + 1 < n; r = c) {
-                       c = r * 2 + 1;
-
-                       if (c + 1 < n &&
-                           do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
-                               c++;
-
-                       if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
-                               break;
-
-                       do_swap(base, n, size, swap_func, r, c);
-               }
-       }
-
-       /* sort */
-       for (i = n - 1; i > 0; --i) {
-               do_swap(base, n, size, swap_func, 0, i);
-
-               for (r = 0; r * 2 + 1 < i; r = c) {
-                       c = r * 2 + 1;
-
-                       if (c + 1 < i &&
-                           do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
-                               c++;
-
-                       if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
-                               break;
-
-                       do_swap(base, n, size, swap_func, r, c);
-               }
-       }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
-         int (*cmp_func)(const void *, const void *, size_t),
-         void (*swap_func)(void *, void *, size_t size))
-{
-       /* pre-scale counters for performance */
-       int i = (num/2 - 1) * size, n = num * size, c, r;
-
-       if (!swap_func) {
-               if (size == 4 && alignment_ok(base, 4))
-                       swap_func = u32_swap;
-               else if (size == 8 && alignment_ok(base, 8))
-                       swap_func = u64_swap;
-               else
-                       swap_func = generic_swap;
-       }
-
-       /* heapify */
-       for ( ; i >= 0; i -= size) {
-               for (r = i; r * 2 + size < n; r  = c) {
-                       c = r * 2 + size;
-                       if (c < n - size &&
-                           cmp_func(base + c, base + c + size, size) < 0)
-                               c += size;
-                       if (cmp_func(base + r, base + c, size) >= 0)
-                               break;
-                       swap_func(base + r, base + c, size);
-               }
-       }
-
-       /* sort */
-       for (i = n - size; i > 0; i -= size) {
-               swap_func(base, base + i, size);
-               for (r = 0; r * 2 + size < i; r = c) {
-                       c = r * 2 + size;
-                       if (c < i - size &&
-                           cmp_func(base + c, base + c + size, size) < 0)
-                               c += size;
-                       if (cmp_func(base + r, base + c, size) >= 0)
-                               break;
-                       swap_func(base + r, base + c, size);
-               }
-       }
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
-       size_t size = (size_t) pool_data;
-
-       vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
-       size_t size = (size_t) pool_data;
-
-       return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
-       return size < PAGE_SIZE
-               ? mempool_init_kmalloc_pool(pool, min_nr, size)
-               : mempool_init(pool, min_nr, mempool_alloc_vp,
-                              mempool_free_vp, (void *) size);
-}
-
 #if 0
 void eytzinger1_test(void)
 {
index b414736d59a5b36d1344657eaeb6de6113ec5a09..1b3aced8d83caf63f867553d87ab091fab807a54 100644 (file)
@@ -5,22 +5,21 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/closure.h>
+#include <linux/darray.h>
 #include <linux/errno.h>
 #include <linux/freezer.h>
 #include <linux/kernel.h>
-#include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/ratelimit.h>
+#include <linux/sched/clock.h>
 #include <linux/slab.h>
+#include <linux/time_stats.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
-
-#include "darray.h"
+#include <linux/mean_and_variance.h>
 
 struct closure;
 
@@ -53,38 +52,6 @@ static inline size_t buf_pages(void *p, size_t len)
                            PAGE_SIZE);
 }
 
-static inline void vpfree(void *p, size_t size)
-{
-       if (is_vmalloc_addr(p))
-               vfree(p);
-       else
-               free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
-       return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
-                                        get_order(size)) ?:
-               __vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
-       if (size < PAGE_SIZE)
-               kfree(p);
-       else
-               vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
-       return size < PAGE_SIZE
-               ? kmalloc(size, gfp_mask)
-               : vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
 #define HEAP(type)                                                     \
 struct {                                                               \
        size_t size, used;                                              \
@@ -97,13 +64,13 @@ struct {                                                            \
 ({                                                                     \
        (heap)->used = 0;                                               \
        (heap)->size = (_size);                                         \
-       (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+       (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
                                 (gfp));                                \
 })
 
 #define free_heap(heap)                                                        \
 do {                                                                   \
-       kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));  \
+       kvfree((heap)->data);                                           \
        (heap)->data = NULL;                                            \
 } while (0)
 
@@ -361,83 +328,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 #endif
 }
 
-#define NR_QUANTILES   15
-#define QUANTILE_IDX(i)        inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST  eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
-       struct bch2_quantile_entry {
-               u64     m;
-               u64     step;
-       }               entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
-       unsigned        nr;
-       struct bch2_time_stat_buffer_entry {
-               u64     start;
-               u64     end;
-       }               entries[32];
-};
-
-struct bch2_time_stats {
-       spinlock_t      lock;
-       /* all fields are in nanoseconds */
-       u64             min_duration;
-       u64             max_duration;
-       u64             total_duration;
-       u64             max_freq;
-       u64             min_freq;
-       u64             last_event;
-       struct bch2_quantiles quantiles;
-
-       struct mean_and_variance          duration_stats;
-       struct mean_and_variance_weighted duration_stats_weighted;
-       struct mean_and_variance          freq_stats;
-       struct mean_and_variance_weighted freq_stats_weighted;
-       struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
-       __bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
-                                     u64 *start, bool v)
-{
-       if (v != !!*start) {
-               if (!v) {
-                       bch2_time_stats_update(stats, *start);
-                       *start = 0;
-               } else {
-                       *start = local_clock() ?: 1;
-                       return true;
-               }
-       }
-
-       return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
-                                     u64 *start, bool v)
-{
-       bool ret = v && !*start;
-       *start = v;
-       return ret;
-}
-#endif
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
 
 #define ewma_add(ewma, val, weight)                                    \
 ({                                                                     \
@@ -738,34 +629,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
        memset(s + bytes, c, rem);
 }
 
-void sort_cmp_size(void *base, size_t num, size_t size,
-         int (*cmp_func)(const void *, const void *, size_t),
-         void (*swap_func)(void *, void *, size_t));
-
-/* just the memmove, doesn't update @_nr */
-#define __array_insert_item(_array, _nr, _pos)                         \
-       memmove(&(_array)[(_pos) + 1],                                  \
-               &(_array)[(_pos)],                                      \
-               sizeof((_array)[0]) * ((_nr) - (_pos)))
-
-#define array_insert_item(_array, _nr, _pos, _new_item)                        \
-do {                                                                   \
-       __array_insert_item(_array, _nr, _pos);                         \
-       (_nr)++;                                                        \
-       (_array)[(_pos)] = (_new_item);                                 \
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove)           \
-do {                                                                   \
-       (_nr) -= (_nr_to_remove);                                       \
-       memmove(&(_array)[(_pos)],                                      \
-               &(_array)[(_pos) + (_nr_to_remove)],                    \
-               sizeof((_array)[0]) * ((_nr) - (_pos)));                \
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos)                           \
-       array_remove_items(_array, _nr, _pos, 1)
-
 static inline void __move_gap(void *array, size_t element_size,
                              size_t nr, size_t size,
                              size_t old_gap, size_t new_gap)
similarity index 68%
rename from libbcachefs/darray.c
rename to linux/darray.c
index ac35b8b705ae1c076e780af570bd824d87c28ab2..80e77959a886b7ebcbf35341705ae76b08fd35e2 100644 (file)
@@ -1,10 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
 
+#include <linux/darray.h>
 #include <linux/log2.h>
 #include <linux/slab.h>
-#include "darray.h"
 
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
 {
        if (new_size > d->size) {
                new_size = roundup_pow_of_two(new_size);
similarity index 93%
rename from libbcachefs/mean_and_variance.c
rename to linux/mean_and_variance.c
index bf0ef668fd38324132b737e648e3ffcb143bbe92..b93d150ddf801628f4d08d0b6b6ef3892ed74c24 100644 (file)
 #include <linux/limits.h>
 #include <linux/math.h>
 #include <linux/math64.h>
+#include <linux/mean_and_variance.h>
 #include <linux/module.h>
 
-#include "mean_and_variance.h"
-
 u128_u u128_div(u128_u n, u64 d)
 {
        u128_u r;
@@ -107,10 +106,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
  * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
  * values are stored bitshifted for performance and added precision.
  */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+               s64 x, bool initted, u8 weight)
 {
        // previous weighted variance.
-       u8 w            = s->weight;
+       u8 w            = weight;
        u64 var_w0      = s->variance;
        // new value weighted.
        s64 x_w         = x << w;
@@ -119,14 +119,13 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
        // new mean weighted.
        s64 u_w1        = s->mean + diff;
 
-       if (!s->init) {
+       if (!initted) {
                s->mean = x_w;
                s->variance = 0;
        } else {
                s->mean = u_w1;
                s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
        }
-       s->init = true;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
 
@@ -134,9 +133,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
  * mean_and_variance_weighted_get_mean() - get mean from @s
  * @s: mean and variance number of samples and their sums
  */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+               u8 weight)
 {
-       return fast_divpow2(s.mean, s.weight);
+       return fast_divpow2(s.mean, weight);
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
 
@@ -144,10 +144,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
  * mean_and_variance_weighted_get_variance() -- get variance from @s
  * @s: mean and variance number of samples and their sums
  */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+               u8 weight)
 {
        // always positive don't need fast divpow2
-       return s.variance >> s.weight;
+       return s.variance >> weight;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
 
@@ -155,9 +156,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
  * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
  * @s: mean and variance number of samples and their sums
  */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+               u8 weight)
 {
-       return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+       return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
 
index 74d4fbb34af77143c4e57de9be0d9db6ff5617fb..74ed17bf017c9624381501eb729fb1990cf61e92 100644 (file)
@@ -522,6 +522,19 @@ void mempool_kfree(void *element, void *pool_data)
 }
 EXPORT_SYMBOL(mempool_kfree);
 
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
+{
+       size_t size = (size_t)pool_data;
+       return kvmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kvmalloc);
+
+void mempool_kvfree(void *element, void *pool_data)
+{
+       kvfree(element);
+}
+EXPORT_SYMBOL(mempool_kvfree);
+
 /*
  * A simple mempool-backed page allocator that allocates pages
  * of the order specified by pool_data.
diff --git a/linux/sort.c b/linux/sort.c
new file mode 100644 (file)
index 0000000..ffa4817
--- /dev/null
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
+ *
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/sort.h>
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+       unsigned char lsbits = (unsigned char)size;
+
+       (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+       return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+       do {
+               u32 t = *(u32 *)(a + (n -= 4));
+               *(u32 *)(a + n) = *(u32 *)(b + n);
+               *(u32 *)(b + n) = t;
+       } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+       do {
+#ifdef CONFIG_64BIT
+               u64 t = *(u64 *)(a + (n -= 8));
+               *(u64 *)(a + n) = *(u64 *)(b + n);
+               *(u64 *)(b + n) = t;
+#else
+               /* Use two 32-bit transfers to avoid base+index+4 addressing */
+               u32 t = *(u32 *)(a + (n -= 4));
+               *(u32 *)(a + n) = *(u32 *)(b + n);
+               *(u32 *)(b + n) = t;
+
+               t = *(u32 *)(a + (n -= 4));
+               *(u32 *)(a + n) = *(u32 *)(b + n);
+               *(u32 *)(b + n) = t;
+#endif
+       } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+       do {
+               char t = ((char *)a)[--n];
+               ((char *)a)[n] = ((char *)b)[n];
+               ((char *)b)[n] = t;
+       } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES    (swap_r_func_t)2
+#define SWAP_WRAPPER  (swap_r_func_t)3
+
+struct wrapper {
+       cmp_func_t cmp;
+       swap_func_t swap;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+       if (swap_func == SWAP_WRAPPER) {
+               ((const struct wrapper *)priv)->swap(a, b, (int)size);
+               return;
+       }
+
+       if (swap_func == SWAP_WORDS_64)
+               swap_words_64(a, b, size);
+       else if (swap_func == SWAP_WORDS_32)
+               swap_words_32(a, b, size);
+       else if (swap_func == SWAP_BYTES)
+               swap_bytes(a, b, size);
+       else
+               swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+       if (cmp == _CMP_WRAPPER)
+               return ((const struct wrapper *)priv)->cmp(a, b);
+       return cmp(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+       i -= size;
+       i -= size & -(i & lsbit);
+       return i / 2;
+}
+
+/**
+ * sort_r - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
+ *
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * quicksort is slightly faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+void sort_r(void *base, size_t num, size_t size,
+           cmp_r_func_t cmp_func,
+           swap_r_func_t swap_func,
+           const void *priv)
+{
+       /* pre-scale counters for performance */
+       size_t n = num * size, a = (num/2) * size;
+       const unsigned int lsbit = size & -size;  /* Used to find parent */
+
+       if (!a)         /* num < 2 || size == 0 */
+               return;
+
+       /* called from 'sort' without swap function, let's pick the default */
+       if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+               swap_func = NULL;
+
+       if (!swap_func) {
+               if (is_aligned(base, size, 8))
+                       swap_func = SWAP_WORDS_64;
+               else if (is_aligned(base, size, 4))
+                       swap_func = SWAP_WORDS_32;
+               else
+                       swap_func = SWAP_BYTES;
+       }
+
+       /*
+        * Loop invariants:
+        * 1. elements [a,n) satisfy the heap property (compare greater than
+        *    all of their children),
+        * 2. elements [n,num*size) are sorted, and
+        * 3. a <= b <= c <= d <= n (whenever they are valid).
+        */
+       for (;;) {
+               size_t b, c, d;
+
+               if (a)                  /* Building heap: sift down --a */
+                       a -= size;
+               else if (n -= size)     /* Sorting: Extract root to --n */
+                       do_swap(base, base + n, size, swap_func, priv);
+               else                    /* Sort complete */
+                       break;
+
+               /*
+                * Sift element at "a" down into heap.  This is the
+                * "bottom-up" variant, which significantly reduces
+                * calls to cmp_func(): we find the sift-down path all
+                * the way to the leaves (one compare per level), then
+                * backtrack to find where to insert the target element.
+                *
+                * Because elements tend to sift down close to the leaves,
+                * this uses fewer compares than doing two per level
+                * on the way down.  (A bit more than half as many on
+                * average, 3/4 worst-case.)
+                */
+               for (b = a; c = 2*b + size, (d = c + size) < n;)
+                       b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
+               if (d == n)     /* Special case last leaf with no sibling */
+                       b = c;
+
+               /* Now backtrack from "b" to the correct location for "a" */
+               while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
+                       b = parent(b, lsbit, size);
+               c = b;                  /* Where "a" belongs */
+               while (b != a) {        /* Shift it into place */
+                       b = parent(b, lsbit, size);
+                       do_swap(base + b, base + c, size, swap_func, priv);
+               }
+       }
+}
+EXPORT_SYMBOL(sort_r);
+
+#include <linux/eytzinger.h>
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+                        cmp_r_func_t cmp_func, const void *priv,
+                        size_t l, size_t r)
+{
+       return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+                     base + inorder_to_eytzinger0(r, n) * size,
+                     cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+                          swap_r_func_t swap_func, const void *priv,
+                          size_t l, size_t r)
+{
+       do_swap(base + inorder_to_eytzinger0(l, n) * size,
+               base + inorder_to_eytzinger0(r, n) * size,
+               size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+                      cmp_r_func_t cmp_func,
+                      swap_r_func_t swap_func,
+                      const void *priv)
+{
+       int i, c, r;
+
+       /* called from 'sort' without swap function, let's pick the default */
+       if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+               swap_func = NULL;
+
+       if (!swap_func) {
+               if (is_aligned(base, size, 8))
+                       swap_func = SWAP_WORDS_64;
+               else if (is_aligned(base, size, 4))
+                       swap_func = SWAP_WORDS_32;
+               else
+                       swap_func = SWAP_BYTES;
+       }
+
+       /* heapify */
+       for (i = n / 2 - 1; i >= 0; --i) {
+               for (r = i; r * 2 + 1 < n; r = c) {
+                       c = r * 2 + 1;
+
+                       if (c + 1 < n &&
+                           eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+                               c++;
+
+                       if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+                               break;
+
+                       eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+               }
+       }
+
+       /* sort */
+       for (i = n - 1; i > 0; --i) {
+               eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+               for (r = 0; r * 2 + 1 < i; r = c) {
+                       c = r * 2 + 1;
+
+                       if (c + 1 < i &&
+                           eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+                               c++;
+
+                       if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+                               break;
+
+                       eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+                    cmp_func_t cmp_func,
+                    swap_func_t swap_func)
+{
+       struct wrapper w = {
+               .cmp  = cmp_func,
+               .swap = swap_func,
+       };
+
+       return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
+EXPORT_SYMBOL_GPL(eytzinger0_sort);
diff --git a/linux/time_stats.c b/linux/time_stats.c
new file mode 100644 (file)
index 0000000..0b90c80
--- /dev/null
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/eytzinger.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/time_stats.h>
+#include <linux/spinlock.h>
+
+static const struct time_unit time_units[] = {
+       { "ns",         1                },
+       { "us",         NSEC_PER_USEC    },
+       { "ms",         NSEC_PER_MSEC    },
+       { "s",          NSEC_PER_SEC     },
+       { "m",          (u64) NSEC_PER_SEC * 60},
+       { "h",          (u64) NSEC_PER_SEC * 3600},
+       { "d",          (u64) NSEC_PER_SEC * 3600 * 24},
+       { "w",          (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+       { "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+       { "eon",        U64_MAX          },
+};
+
+const struct time_unit *pick_time_units(u64 ns)
+{
+       const struct time_unit *u;
+
+       for (u = time_units;
+            u + 1 < time_units + ARRAY_SIZE(time_units) &&
+            ns >= u[1].nsecs << 1;
+            u++)
+               ;
+
+       return u;
+}
+EXPORT_SYMBOL_GPL(pick_time_units);
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+       unsigned i = 0;
+
+       while (i < ARRAY_SIZE(q->entries)) {
+               struct quantile_entry *e = q->entries + i;
+
+               if (unlikely(!e->step)) {
+                       e->m = v;
+                       e->step = max_t(unsigned, v / 2, 1024);
+               } else if (e->m > v) {
+                       e->m = e->m >= e->step
+                               ? e->m - e->step
+                               : 0;
+               } else if (e->m < v) {
+                       e->m = e->m + e->step > e->m
+                               ? e->m + e->step
+                               : U32_MAX;
+               }
+
+               if ((e->m > v ? e->m - v : v - e->m) < e->step)
+                       e->step = max_t(unsigned, e->step / 2, 1);
+
+               if (v >= e->m)
+                       break;
+
+               i = eytzinger0_child(i, v > e->m);
+       }
+}
+
+static inline void time_stats_update_one(struct time_stats *stats,
+                                             u64 start, u64 end)
+{
+       u64 duration, freq;
+       bool initted = stats->last_event != 0;
+
+       if (time_after64(end, start)) {
+               struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+               duration = end - start;
+               mean_and_variance_update(&stats->duration_stats, duration);
+               mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+                               duration, initted, TIME_STATS_MV_WEIGHT);
+               stats->max_duration = max(stats->max_duration, duration);
+               stats->min_duration = min(stats->min_duration, duration);
+               stats->total_duration += duration;
+
+               if (quantiles)
+                       quantiles_update(quantiles, duration);
+       }
+
+       if (stats->last_event && time_after64(end, stats->last_event)) {
+               freq = end - stats->last_event;
+               mean_and_variance_update(&stats->freq_stats, freq);
+               mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+                               freq, initted, TIME_STATS_MV_WEIGHT);
+               stats->max_freq = max(stats->max_freq, freq);
+               stats->min_freq = min(stats->min_freq, freq);
+       }
+
+       stats->last_event = end;
+}
+
+void __time_stats_clear_buffer(struct time_stats *stats,
+                              struct time_stat_buffer *b)
+{
+       for (struct time_stat_buffer_entry *i = b->entries;
+            i < b->entries + ARRAY_SIZE(b->entries);
+            i++)
+               time_stats_update_one(stats, i->start, i->end);
+       b->nr = 0;
+}
+EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
+
+static noinline void time_stats_clear_buffer(struct time_stats *stats,
+                                            struct time_stat_buffer *b)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&stats->lock, flags);
+       __time_stats_clear_buffer(stats, b);
+       spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
+{
+       unsigned long flags;
+
+       if (!stats->buffer) {
+               spin_lock_irqsave(&stats->lock, flags);
+               time_stats_update_one(stats, start, end);
+
+               if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+                   stats->duration_stats.n > 1024)
+                       stats->buffer =
+                               alloc_percpu_gfp(struct time_stat_buffer,
+                                                GFP_ATOMIC);
+               spin_unlock_irqrestore(&stats->lock, flags);
+       } else {
+               struct time_stat_buffer *b;
+
+               preempt_disable();
+               b = this_cpu_ptr(stats->buffer);
+
+               BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+               b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+                       .start = start,
+                       .end = end
+               };
+
+               if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+                       time_stats_clear_buffer(stats, b);
+               preempt_enable();
+       }
+}
+EXPORT_SYMBOL_GPL(__time_stats_update);
+
+#include <linux/seq_buf.h>
+
+static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
+{
+       const struct time_unit *u = pick_time_units(ns);
+
+       seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
+}
+
+static inline u64 time_stats_lifetime(const struct time_stats *stats)
+{
+       return local_clock() - stats->start_time;
+}
+
+void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats,
+               const char *epoch_name, unsigned int flags)
+{
+       struct quantiles *quantiles = time_stats_to_quantiles(stats);
+       s64 f_mean = 0, d_mean = 0;
+       u64 f_stddev = 0, d_stddev = 0;
+       u64 lifetime = time_stats_lifetime(stats);
+
+       if (stats->buffer) {
+               int cpu;
+
+               spin_lock_irq(&stats->lock);
+               for_each_possible_cpu(cpu)
+                       __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+               spin_unlock_irq(&stats->lock);
+       }
+
+       if (stats->freq_stats.n) {
+               /* avoid divide by zero */
+               f_mean = mean_and_variance_get_mean(stats->freq_stats);
+               f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+               d_mean = mean_and_variance_get_mean(stats->duration_stats);
+               d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+       } else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
+               /* unless we didn't want zeroes anyway */
+               return;
+       }
+
+       seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
+       seq_buf_printf(out, "lifetime: ");
+       seq_buf_time_units_aligned(out, lifetime);
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "                       since %-12s recent\n", epoch_name);
+
+       seq_buf_printf(out, "duration of events\n");
+
+       seq_buf_printf(out, "  min:                     ");
+       seq_buf_time_units_aligned(out, stats->min_duration);
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  max:                     ");
+       seq_buf_time_units_aligned(out, stats->max_duration);
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  total:                   ");
+       seq_buf_time_units_aligned(out, stats->total_duration);
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  mean:                    ");
+       seq_buf_time_units_aligned(out, d_mean);
+       seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  stddev:                  ");
+       seq_buf_time_units_aligned(out, d_stddev);
+       seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "time between events\n");
+
+       seq_buf_printf(out, "  min:                     ");
+       seq_buf_time_units_aligned(out, stats->min_freq);
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  max:                     ");
+       seq_buf_time_units_aligned(out, stats->max_freq);
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  mean:                    ");
+       seq_buf_time_units_aligned(out, f_mean);
+       seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
+       seq_buf_printf(out, "\n");
+
+       seq_buf_printf(out, "  stddev:                  ");
+       seq_buf_time_units_aligned(out, f_stddev);
+       seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
+       seq_buf_printf(out, "\n");
+
+       if (quantiles) {
+               int i = eytzinger0_first(NR_QUANTILES);
+               const struct time_unit *u =
+                       pick_time_units(quantiles->entries[i].m);
+               u64 last_q = 0;
+
+               seq_buf_printf(out, "quantiles (%s):\t", u->name);
+               eytzinger0_for_each(i, NR_QUANTILES) {
+                       bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+                       u64 q = max(quantiles->entries[i].m, last_q);
+                       seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
+                       if (is_last)
+                               seq_buf_printf(out, "\n");
+                       last_q = q;
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
+
+void time_stats_to_json(struct seq_buf *out, struct time_stats *stats,
+               const char *epoch_name, unsigned int flags)
+{
+       struct quantiles *quantiles = time_stats_to_quantiles(stats);
+       s64 f_mean = 0, d_mean = 0;
+       u64 f_stddev = 0, d_stddev = 0;
+
+       if (stats->buffer) {
+               int cpu;
+
+               spin_lock_irq(&stats->lock);
+               for_each_possible_cpu(cpu)
+                       __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+               spin_unlock_irq(&stats->lock);
+       }
+
+       if (stats->freq_stats.n) {
+               /* avoid divide by zero */
+               f_mean = mean_and_variance_get_mean(stats->freq_stats);
+               f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+               d_mean = mean_and_variance_get_mean(stats->duration_stats);
+               d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+       } else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
+               /* unless we didn't want zeroes anyway */
+               return;
+       }
+
+       seq_buf_printf(out, "{\n");
+       seq_buf_printf(out, "  \"epoch\":       \"%s\",\n", epoch_name);
+       seq_buf_printf(out, "  \"count\":       %llu,\n", stats->duration_stats.n);
+
+       seq_buf_printf(out, "  \"duration_ns\": {\n");
+       seq_buf_printf(out, "    \"min\":       %llu,\n", stats->min_duration);
+       seq_buf_printf(out, "    \"max\":       %llu,\n", stats->max_duration);
+       seq_buf_printf(out, "    \"total\":     %llu,\n", stats->total_duration);
+       seq_buf_printf(out, "    \"mean\":      %llu,\n", d_mean);
+       seq_buf_printf(out, "    \"stddev\":    %llu\n", d_stddev);
+       seq_buf_printf(out, "  },\n");
+
+       d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
+       d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
+
+       seq_buf_printf(out, "  \"duration_ewma_ns\": {\n");
+       seq_buf_printf(out, "    \"mean\":      %llu,\n", d_mean);
+       seq_buf_printf(out, "    \"stddev\":    %llu\n", d_stddev);
+       seq_buf_printf(out, "  },\n");
+
+       seq_buf_printf(out, "  \"frequency_ns\": {\n");
+       seq_buf_printf(out, "    \"min\":       %llu,\n", stats->min_freq);
+       seq_buf_printf(out, "    \"max\":       %llu,\n", stats->max_freq);
+       seq_buf_printf(out, "    \"mean\":      %llu,\n", f_mean);
+       seq_buf_printf(out, "    \"stddev\":    %llu\n", f_stddev);
+       seq_buf_printf(out, "  },\n");
+
+       f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
+       f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
+
+       seq_buf_printf(out, "  \"frequency_ewma_ns\": {\n");
+       seq_buf_printf(out, "    \"mean\":      %llu,\n", f_mean);
+       seq_buf_printf(out, "    \"stddev\":    %llu\n", f_stddev);
+
+       if (quantiles) {
+               u64 last_q = 0;
+
+               /* close frequency_ewma_ns but signal more items */
+               seq_buf_printf(out, "  },\n");
+
+               seq_buf_printf(out, "  \"quantiles_ns\": [\n");
+               eytzinger0_for_each(i, NR_QUANTILES) {
+                       bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+                       u64 q = max(quantiles->entries[i].m, last_q);
+                       seq_buf_printf(out, "    %llu", q);
+                       if (!is_last)
+                               seq_buf_printf(out, ", ");
+                       last_q = q;
+               }
+               seq_buf_printf(out, "  ]\n");
+       } else {
+               /* close frequency_ewma_ns without dumping further */
+               seq_buf_printf(out, "  }\n");
+       }
+
+       seq_buf_printf(out, "}\n");
+}
+EXPORT_SYMBOL_GPL(time_stats_to_json);
+
+void time_stats_exit(struct time_stats *stats)
+{
+       free_percpu(stats->buffer);
+}
+EXPORT_SYMBOL_GPL(time_stats_exit);
+
+void time_stats_init(struct time_stats *stats)
+{
+       memset(stats, 0, sizeof(*stats));
+       stats->min_duration = U64_MAX;
+       stats->min_freq = U64_MAX;
+       stats->start_time = local_clock();
+       spin_lock_init(&stats->lock);
+}
+EXPORT_SYMBOL_GPL(time_stats_init);
+
+MODULE_AUTHOR("Kent Overstreet");
+MODULE_LICENSE("GPL");
index 336a029f5fae9632eba3e0520729bc1c511ed8f5..48148a8fd1b8da77488f2080c3d0cd4fb147f8bc 100644 (file)
@@ -61,7 +61,7 @@ impl BcachefsHandle {
     pub fn create_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
         let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string");
         self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
-            dirfd: libc::AT_FDCWD,
+            dirfd: libc::AT_FDCWD as u32,
             mode: 0o777,
             dst_ptr: dst.as_ptr() as u64,
             ..Default::default()
@@ -73,7 +73,7 @@ impl BcachefsHandle {
     pub fn delete_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
         let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string");
         self.ioctl(BcachefsIoctl::SubvolumeDestroy, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
-            dirfd: libc::AT_FDCWD,
+            dirfd: libc::AT_FDCWD as u32,
             mode: 0o777,
             dst_ptr: dst.as_ptr() as u64,
             ..Default::default()
@@ -88,7 +88,7 @@ impl BcachefsHandle {
 
         let res = self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
             flags: BCH_SUBVOL_SNAPSHOT_CREATE | extra_flags,
-            dirfd: libc::AT_FDCWD,
+            dirfd: libc::AT_FDCWD as u32,
             mode: 0o777,
             src_ptr: src.as_ref().map_or(0, |x| x.as_ptr() as u64),
             //src_ptr: if let Some(src) = src { src.as_ptr() } else { std::ptr::null() } as u64,