From f3f005c76eb5636542a8f5b137bd1904d57e8f86 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 23:09:25 -0500 Subject: [PATCH] Update bcachefs sources to 50847e296b34 bcachefs: Check subvol <-> inode pointers in check_inode() Signed-off-by: Kent Overstreet --- .bcachefs_revision | 2 +- Makefile | 19 +- c_src/cmd_format.c | 3 +- c_src/cmd_fs.c | 3 +- c_src/tools-util.h | 2 +- {libbcachefs => include/linux}/darray.h | 59 ++- include/linux/darray_types.h | 22 ++ {libbcachefs => include/linux}/eytzinger.h | 56 +-- .../linux}/mean_and_variance.h | 16 +- include/linux/mempool.h | 13 + include/linux/spinlock.h | 66 +--- include/linux/spinlock_types.h | 65 +++ include/linux/thread_with_file.h | 15 + include/linux/thread_with_file_types.h | 0 include/linux/time.h | 0 include/linux/time_stats.h | 167 ++++++++ include/linux/types.h | 5 + libbcachefs/alloc_foreground.c | 13 +- libbcachefs/bcachefs.h | 13 +- libbcachefs/bcachefs_format.h | 8 +- libbcachefs/bcachefs_ioctl.h | 2 +- libbcachefs/bset.c | 2 +- libbcachefs/btree_cache.c | 20 +- libbcachefs/btree_gc.c | 19 +- libbcachefs/btree_io.c | 12 +- libbcachefs/btree_iter.c | 14 +- libbcachefs/btree_journal_iter.c | 50 ++- libbcachefs/btree_journal_iter.h | 14 +- libbcachefs/btree_locking.h | 2 +- libbcachefs/btree_types.h | 2 +- libbcachefs/btree_update.c | 2 + libbcachefs/btree_update_interior.c | 10 +- libbcachefs/btree_write_buffer_types.h | 2 +- libbcachefs/buckets.c | 29 +- libbcachefs/chardev.c | 24 +- libbcachefs/compress.c | 14 +- libbcachefs/debug.c | 6 +- libbcachefs/dirent.c | 108 ++--- libbcachefs/dirent.h | 2 +- libbcachefs/ec.c | 4 +- libbcachefs/errcode.h | 2 + libbcachefs/error.c | 4 +- libbcachefs/fifo.h | 4 +- libbcachefs/fs-common.c | 19 +- libbcachefs/fs-ioctl.c | 4 +- libbcachefs/fs.c | 218 ++++++---- libbcachefs/fsck.c | 248 ++++++------ libbcachefs/inode.c | 30 +- libbcachefs/inode.h | 5 + libbcachefs/io_read.c | 4 +- libbcachefs/io_write.c | 4 +- libbcachefs/journal.c | 248 +++++++----- libbcachefs/journal.h | 7 +- libbcachefs/journal_io.c | 357 +++++++++-------- libbcachefs/journal_io.h | 35 +- libbcachefs/journal_reclaim.c | 29 +- libbcachefs/journal_sb.c | 2 +- libbcachefs/journal_seq_blacklist.c | 6 +- libbcachefs/journal_types.h | 36 +- libbcachefs/migrate.c | 8 +- libbcachefs/nocow_locking.c | 2 +- libbcachefs/replicas.c | 17 +- libbcachefs/replicas.h | 3 +- libbcachefs/sb-clean.c | 16 - libbcachefs/sb-downgrade.c | 3 +- libbcachefs/sb-errors_types.h | 7 +- libbcachefs/sb-members.h | 2 +- libbcachefs/str_hash.h | 37 +- libbcachefs/subvolume.c | 31 ++ libbcachefs/subvolume.h | 1 - libbcachefs/subvolume_types.h | 2 +- libbcachefs/super-io.h | 2 +- libbcachefs/super.c | 31 +- libbcachefs/sysfs.c | 4 +- libbcachefs/thread_with_file.c | 299 -------------- libbcachefs/thread_with_file.h | 41 -- libbcachefs/thread_with_file_types.h | 16 - libbcachefs/util.c | 373 ++---------------- libbcachefs/util.h | 151 +------ {libbcachefs => linux}/darray.c | 7 +- {libbcachefs => linux}/mean_and_variance.c | 26 +- linux/mempool.c | 13 + linux/sort.c | 368 +++++++++++++++++ linux/time_stats.c | 373 ++++++++++++++++++ src/wrappers/handle.rs | 6 +- 85 files changed, 2245 insertions(+), 1741 deletions(-) rename {libbcachefs => include/linux}/darray.h (66%) create mode 100644 include/linux/darray_types.h rename {libbcachefs => include/linux}/eytzinger.h (78%) rename {libbcachefs => include/linux}/mean_and_variance.h (94%) create mode 100644 include/linux/spinlock_types.h create mode 100644 include/linux/thread_with_file.h create mode 100644 include/linux/thread_with_file_types.h create mode 100644 include/linux/time.h create mode 100644 include/linux/time_stats.h delete mode 100644 libbcachefs/thread_with_file.c delete mode 100644 libbcachefs/thread_with_file.h delete mode 100644 libbcachefs/thread_with_file_types.h rename {libbcachefs => linux}/darray.c (68%) rename {libbcachefs => linux}/mean_and_variance.c (93%) create mode 100644 linux/sort.c create mode 100644 linux/time_stats.c diff --git a/.bcachefs_revision b/.bcachefs_revision index 797e133..d3c500b 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -481b5f34324809f47a58ed798d038fb17e5b7b0a +50847e296b34efabe199e408ec4d72f10a866c39 diff --git a/Makefile b/Makefile index 5cdb437..876efbb 100644 --- a/Makefile +++ b/Makefile @@ -273,11 +273,20 @@ update-bcachefs-sources: git add include/linux/kmemleak.h cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ git add linux/int_sqrt.c - git rm -f libbcachefs/mean_and_variance_test.c -# cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ -# git add linux/mean_and_variance.c -# cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ -# git add include/linux/mean_and_variance.h + cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ + git add linux/mean_and_variance.c + cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ + git add include/linux/mean_and_variance.h + cp $(LINUX_DIR)/lib/time_stats.c linux/ + git add linux/time_stats.c + cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/ + git add include/linux/time_stats.h + cp $(LINUX_DIR)/include/linux/darray.h include/linux/ + git add include/linux/darray.h + cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/ + git add include/linux/darray_types.h + cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/ + git add include/linux/eytzinger.h cp $(LINUX_DIR)/scripts/Makefile.compiler ./ git add Makefile.compiler $(RM) libbcachefs/*.mod.c diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c index 6b77763..3d29f41 100644 --- a/c_src/cmd_format.c +++ b/c_src/cmd_format.c @@ -23,12 +23,13 @@ #include "cmds.h" #include "libbcachefs.h" #include "crypto.h" -#include "libbcachefs/darray.h" #include "libbcachefs/errcode.h" #include "libbcachefs/opts.h" #include "libbcachefs/super-io.h" #include "libbcachefs/util.h" +#include "linux/darray.h" + #define OPTS \ x(0, replicas, required_argument) \ x(0, encrypted, no_argument) \ diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c index 1a5d144..d8542d6 100644 --- a/c_src/cmd_fs.c +++ b/c_src/cmd_fs.c @@ -9,13 +9,14 @@ #include "libbcachefs/bcachefs_ioctl.h" #include "libbcachefs/buckets.h" -#include "libbcachefs/darray.h" #include "libbcachefs/opts.h" #include "libbcachefs/super-io.h" #include "cmds.h" #include "libbcachefs.h" +#include "linux/darray.h" + static void __dev_usage_type_to_text(struct printbuf *out, enum bch_data_type type, unsigned bucket_size, diff --git a/c_src/tools-util.h b/c_src/tools-util.h index bff3bc6..4682406 100644 --- a/c_src/tools-util.h +++ b/c_src/tools-util.h @@ -20,7 +20,7 @@ #include #include "libbcachefs/bcachefs.h" #include "libbcachefs/bbpos.h" -#include "libbcachefs/darray.h" +#include "linux/darray.h" #define noreturn __attribute__((noreturn)) diff --git a/libbcachefs/darray.h b/include/linux/darray.h similarity index 66% rename from libbcachefs/darray.h rename to include/linux/darray.h index 4b340d1..ff167eb 100644 --- a/libbcachefs/darray.h +++ b/include/linux/darray.h @@ -1,34 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DARRAY_H -#define _BCACHEFS_DARRAY_H +/* + * (C) 2022-2024 Kent Overstreet + */ +#ifndef _LINUX_DARRAY_H +#define _LINUX_DARRAY_H /* - * Dynamic arrays: + * Dynamic arrays * * Inspired by CCAN's darray */ +#include #include -#define DARRAY_PREALLOCATED(_type, _nr) \ -struct { \ - size_t nr, size; \ - _type *data; \ - _type preallocated[_nr]; \ -} - -#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) - -typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; - -int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); +int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t); static inline int __darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { return unlikely(new_size > d->size) - ? __bch2_darray_resize(d, element_size, new_size, gfp) + ? __darray_resize_slowpath(d, element_size, new_size, gfp) : 0; } @@ -69,6 +61,28 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, #define darray_first(_d) ((_d).data[0]) #define darray_last(_d) ((_d).data[(_d).nr - 1]) +/* Insert/remove items into the middle of a darray: */ + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ +do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + #define darray_insert_item(_d, pos, _item) \ ({ \ size_t _pos = (pos); \ @@ -79,10 +93,15 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, _ret; \ }) +#define darray_remove_items(_d, _pos, _nr_to_remove) \ + array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove) + #define darray_remove_item(_d, _pos) \ - array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + darray_remove_items(_d, _pos, 1) + +/* Iteration: */ -#define __darray_for_each(_d, _i) \ +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -106,4 +125,4 @@ do { \ darray_init(_d); \ } while (0) -#endif /* _BCACHEFS_DARRAY_H */ +#endif /* _LINUX_DARRAY_H */ diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h new file mode 100644 index 0000000..a400a0c --- /dev/null +++ b/include/linux/darray_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2022-2024 Kent Overstreet + */ +#ifndef _LINUX_DARRAY_TYpES_H +#define _LINUX_DARRAY_TYpES_H + +#include + +#define DARRAY_PREALLOCATED(_type, _nr) \ +struct { \ + size_t nr, size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) + +typedef DARRAY(char) darray_char; +typedef DARRAY(char *) darray_str; + +#endif /* _LINUX_DARRAY_TYpES_H */ diff --git a/libbcachefs/eytzinger.h b/include/linux/eytzinger.h similarity index 78% rename from libbcachefs/eytzinger.h rename to include/linux/eytzinger.h index b04750d..9565a5c 100644 --- a/libbcachefs/eytzinger.h +++ b/include/linux/eytzinger.h @@ -1,27 +1,37 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _EYTZINGER_H -#define _EYTZINGER_H +#ifndef _LINUX_EYTZINGER_H +#define _LINUX_EYTZINGER_H #include #include -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. * - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. + * + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(search, base + i * size) >= 0); } while (n < nr); if (n & 1) { @@ -274,8 +282,8 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); -#endif /* _EYTZINGER_H */ +#endif /* _LINUX_EYTZINGER_H */ diff --git a/libbcachefs/mean_and_variance.h b/include/linux/mean_and_variance.h similarity index 94% rename from libbcachefs/mean_and_variance.h rename to include/linux/mean_and_variance.h index b2be565..4fcf062 100644 --- a/libbcachefs/mean_and_variance.h +++ b/include/linux/mean_and_variance.h @@ -17,7 +17,7 @@ * Rust and rustc has issues with u128. */ -#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) +#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) typedef struct { unsigned __int128 v; @@ -154,8 +154,6 @@ struct mean_and_variance { /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 weight; /* base 2 logarithim */ s64 mean; u64 variance; }; @@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s); u64 mean_and_variance_get_variance(struct mean_and_variance s1); u32 mean_and_variance_get_stddev(struct mean_and_variance s); -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 v, bool initted, u8 weight); -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 506da24..3732517 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -90,6 +90,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) (void *) size); } +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); +void mempool_kvfree(void *element, void *pool_data); + +static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + +static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) +{ + return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 6c4a623..28ce667 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -1,65 +1 @@ -#ifndef __TOOLS_LINUX_SPINLOCK_H -#define __TOOLS_LINUX_SPINLOCK_H - -#include -#include - -typedef struct { - pthread_mutex_t lock; -} raw_spinlock_t; - -#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER } - -static inline void raw_spin_lock_init(raw_spinlock_t *lock) -{ - pthread_mutex_init(&lock->lock, NULL); -} - -static inline bool raw_spin_trylock(raw_spinlock_t *lock) -{ - return !pthread_mutex_trylock(&lock->lock); -} - -static inline void raw_spin_lock(raw_spinlock_t *lock) -{ - pthread_mutex_lock(&lock->lock); -} - -static inline void raw_spin_unlock(raw_spinlock_t *lock) -{ - pthread_mutex_unlock(&lock->lock); -} - -#define raw_spin_lock_irq(lock) raw_spin_lock(lock) -#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock) - -#define raw_spin_lock_irqsave(lock, flags) \ -do { \ - flags = 0; \ - raw_spin_lock(lock); \ -} while (0) - -#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock) - -typedef raw_spinlock_t spinlock_t; - -#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name) - -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) - -#define spin_lock_init(lock) raw_spin_lock_init(lock) -#define spin_lock(lock) raw_spin_lock(lock) -#define spin_unlock(lock) raw_spin_unlock(lock) - -#define spin_lock_nested(lock, n) spin_lock(lock) - -#define spin_lock_bh(lock) raw_spin_lock(lock) -#define spin_unlock_bh(lock) raw_spin_unlock(lock) - -#define spin_lock_irq(lock) raw_spin_lock(lock) -#define spin_unlock_irq(lock) raw_spin_unlock(lock) - -#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags) -#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags) - -#endif /* __TOOLS_LINUX_SPINLOCK_H */ +#include "linux/spinlock_types.h" diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h new file mode 100644 index 0000000..6c4a623 --- /dev/null +++ b/include/linux/spinlock_types.h @@ -0,0 +1,65 @@ +#ifndef __TOOLS_LINUX_SPINLOCK_H +#define __TOOLS_LINUX_SPINLOCK_H + +#include +#include + +typedef struct { + pthread_mutex_t lock; +} raw_spinlock_t; + +#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER } + +static inline void raw_spin_lock_init(raw_spinlock_t *lock) +{ + pthread_mutex_init(&lock->lock, NULL); +} + +static inline bool raw_spin_trylock(raw_spinlock_t *lock) +{ + return !pthread_mutex_trylock(&lock->lock); +} + +static inline void raw_spin_lock(raw_spinlock_t *lock) +{ + pthread_mutex_lock(&lock->lock); +} + +static inline void raw_spin_unlock(raw_spinlock_t *lock) +{ + pthread_mutex_unlock(&lock->lock); +} + +#define raw_spin_lock_irq(lock) raw_spin_lock(lock) +#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock) + +#define raw_spin_lock_irqsave(lock, flags) \ +do { \ + flags = 0; \ + raw_spin_lock(lock); \ +} while (0) + +#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock) + +typedef raw_spinlock_t spinlock_t; + +#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) + +#define spin_lock_init(lock) raw_spin_lock_init(lock) +#define spin_lock(lock) raw_spin_lock(lock) +#define spin_unlock(lock) raw_spin_unlock(lock) + +#define spin_lock_nested(lock, n) spin_lock(lock) + +#define spin_lock_bh(lock) raw_spin_lock(lock) +#define spin_unlock_bh(lock) raw_spin_unlock(lock) + +#define spin_lock_irq(lock) raw_spin_lock(lock) +#define spin_unlock_irq(lock) raw_spin_unlock(lock) + +#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags) +#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags) + +#endif /* __TOOLS_LINUX_SPINLOCK_H */ diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h new file mode 100644 index 0000000..2a66e76 --- /dev/null +++ b/include/linux/thread_with_file.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2022-2024 Kent Overstreet + */ +#ifndef _LINUX_THREAD_WITH_FILE_H +#define _LINUX_THREAD_WITH_FILE_H + +struct stdio_redirect; + +__printf(3, 0) +static inline void stdio_redirect_vprintf(struct stdio_redirect *s, bool nonblocking, const char *msg, va_list args) {} +__printf(3, 4) +static inline void stdio_redirect_printf(struct stdio_redirect *s, bool nonblocking, const char *msg, ...) {} + +#endif /* _LINUX_THREAD_WITH_FILE_H */ diff --git a/include/linux/thread_with_file_types.h b/include/linux/thread_with_file_types.h new file mode 100644 index 0000000..e69de29 diff --git a/include/linux/time.h b/include/linux/time.h new file mode 100644 index 0000000..e69de29 diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h new file mode 100644 index 0000000..6df2b34 --- /dev/null +++ b/include/linux/time_stats.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * time_stats - collect statistics on events that have a duration, with nicely + * formatted textual output on demand + * + * - percpu buffering of event collection: cheap enough to shotgun + * everywhere without worrying about overhead + * + * tracks: + * - number of events + * - maximum event duration ever seen + * - sum of all event durations + * - average event duration, standard and weighted + * - standard deviation of event durations, standard and weighted + * and analagous statistics for the frequency of events + * + * We provide both mean and weighted mean (exponentially weighted), and standard + * deviation and weighted standard deviation, to give an efficient-to-compute + * view of current behaviour versus. average behaviour - "did this event source + * just become wonky, or is this typical?". + * + * Particularly useful for tracking down latency issues. + */ +#ifndef _LINUX_TIME_STATS_H +#define _LINUX_TIME_STATS_H + +#include +#include +#include +#include + +struct time_unit { + const char *name; + u64 nsecs; +}; + +/* + * given a nanosecond value, pick the preferred time units for printing: + */ +const struct time_unit *pick_time_units(u64 ns); + +/* + * quantiles - do not use: + * + * Only enabled if time_stats->quantiles_enabled has been manually set - don't + * use in new code. + */ + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[31]; +}; + +struct time_stats { + spinlock_t lock; + bool have_quantiles; + /* all fields are in nanoseconds */ + u64 min_duration; + u64 max_duration; + u64 total_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + u64 last_event_start; + + struct mean_and_variance duration_stats; + struct mean_and_variance freq_stats; + +/* default weight for weighted mean and variance calculations */ +#define TIME_STATS_MV_WEIGHT 8 + + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; + + u64 start_time; +}; + +struct time_stats_quantiles { + struct time_stats stats; + struct quantiles quantiles; +}; + +static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats) +{ + return stats->have_quantiles + ? &container_of(stats, struct time_stats_quantiles, stats)->quantiles + : NULL; +} + +void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *); +void __time_stats_update(struct time_stats *stats, u64, u64); + +/** + * time_stats_update - collect a new event being tracked + * + * @stats - time_stats to update + * @start - start time of event, recorded with local_clock() + * + * The end duration of the event will be the current time + */ +static inline void time_stats_update(struct time_stats *stats, u64 start) +{ + __time_stats_update(stats, start, local_clock()); +} + +/** + * track_event_change - track state change events + * + * @stats - time_stats to update + * @v - new state, true or false + * + * Use this when tracking time stats for state changes, i.e. resource X becoming + * blocked/unblocked. + */ +static inline bool track_event_change(struct time_stats *stats, bool v) +{ + if (v != !!stats->last_event_start) { + if (!v) { + time_stats_update(stats, stats->last_event_start); + stats->last_event_start = 0; + } else { + stats->last_event_start = local_clock() ?: 1; + return true; + } + } + + return false; +} + +#define TIME_STATS_PRINT_NO_ZEROES (1U << 0) /* print nothing if zero count */ +struct seq_buf; +void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *, + const char *epoch_name, unsigned int flags); +void time_stats_to_json(struct seq_buf *, struct time_stats *, + const char *epoch_name, unsigned int flags); + +void time_stats_exit(struct time_stats *); +void time_stats_init(struct time_stats *); + +static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq) +{ + time_stats_exit(&statq->stats); +} +static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq) +{ + time_stats_init(&statq->stats); + statq->stats.have_quantiles = true; + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); +} + +#endif /* _LINUX_TIME_STATS_H */ diff --git a/include/linux/types.h b/include/linux/types.h index ce454e2..6ae97c4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -8,6 +8,7 @@ #include #include #include +#include #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ #include @@ -77,6 +78,10 @@ typedef __u64 __bitwise __be64; typedef u64 sector_t; +typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv); +typedef void (*swap_func_t)(void *a, void *b, int size); + +typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); typedef unsigned int __bitwise slab_flags_t; diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 633d322..ca58193 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, false); - - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, false); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); spin_unlock(&c->freelist_lock); return ob; @@ -555,8 +551,7 @@ again: goto again; } - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index b80c6c9..7036949 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -200,6 +200,8 @@ #include #include #include +#include +#include #include #include #include @@ -465,7 +467,6 @@ enum bch_time_stats { #include "replicas_types.h" #include "subvolume_types.h" #include "super_types.h" -#include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -593,7 +594,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct bch2_time_stats io_latency[2]; + struct time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -640,8 +641,8 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { - struct bch2_time_stats duration; - struct bch2_time_stats lock_hold_times; + struct time_stats duration; + struct time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; unsigned journal_entries_size; @@ -919,8 +920,6 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; - u64 blocked_allocate; - u64 blocked_allocate_open_bucket; open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; @@ -1104,7 +1103,7 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; - struct bch2_time_stats times[BCH_TIME_STAT_NR]; + struct time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 0668b68..14f6136 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1275,7 +1275,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(dev_usage, 8) \ x(log, 9) \ x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(write_buffer_keys, 11) \ + x(datetime, 12) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1376,6 +1377,11 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +} __packed __aligned(8); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index cbad7a9..4b8fba7 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -379,7 +379,7 @@ struct bch_ioctl_disk_resize_journal { struct bch_ioctl_subvolume { __u32 flags; - __s32 dirfd; + __u32 dirfd; __u16 mode; __u16 pad[3]; __u64 dst_ptr; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 3fd1085..1d77aa5 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -9,12 +9,12 @@ #include "bcachefs.h" #include "btree_cache.h" #include "bset.h" -#include "eytzinger.h" #include "trace.h" #include "util.h" #include #include +#include #include #include diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index d7c81be..9b7ea12 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_buf_bytes(b), gfp); + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, c->opts.btree_node_size); + kvfree(c->verify_ondisk); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -648,7 +648,7 @@ out: bch2_btree_keys_init(b); set_btree_node_accessed(b); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); memalloc_nofs_restore(flags); @@ -711,6 +711,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; + trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -760,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (path) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + BUG_ON(!path); + + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -1096,7 +1100,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 1102995..eb92526 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -389,7 +389,8 @@ again: have_child = dropped_children = false; bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -478,7 +479,8 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -931,7 +933,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); bkey_init(&prev.k->k); @@ -963,7 +965,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; @@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); + kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; free_percpu(ca->usage_gc); @@ -1491,7 +1492,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { for_each_member_device(c, ca) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { @@ -1970,7 +1971,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); trace_and_count(c, gc_gens_end, c); err: for_each_member_device(c, ca) { diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index aa9b6cb..61b6093 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, size); + kvfree(p); } static void *btree_bounce_alloc(struct bch_fs *c, size_t size, @@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); /* Make sure we preserve bset journal_seq: */ @@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c, &dst->format, true); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); set_btree_bset_end(dst, dst->set); @@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, out: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); + time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || @@ -1323,7 +1323,7 @@ start: } } - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 5467a86..3aac6ed 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct bkey_s_c k; int ret = 0; - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); @@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path = &trans->paths[path_idx]; if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out; + goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); @@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } - +out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) @@ -2899,7 +2899,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && time_after64(now, trans->last_begin_time + 10)) - __bch2_time_stats_update(&btree_trans_stats(trans)->duration, + __time_stats_update(&btree_trans_stats(trans)->duration, trans->last_begin_time, now); if (!trans->restarted && @@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { kfree(s->max_paths_text); - bch2_time_stats_exit(&s->lock_hold_times); + time_stats_exit(&s->lock_hold_times); } if (c->btree_trans_barrier_initialized) @@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { - bch2_time_stats_init(&s->duration); - bch2_time_stats_init(&s->lock_hold_times); + time_stats_init(&s->duration); + time_stats_init(&s->lock_hold_times); mutex_init(&s->lock); } diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c index 719a94a..3da6556 100644 --- a/libbcachefs/btree_journal_iter.c +++ b/libbcachefs/btree_journal_iter.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bset.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "journal_io.h" @@ -334,9 +336,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) iter->pos = bpos_successor(iter->pos); } +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) + : (level > 1 ? 1 : 16); + + iter.prefetch = false; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k, ret; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); again: if (iter->at_end) return bkey_s_c_null; @@ -376,17 +407,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b, struct btree_node_iter node_iter, struct bpos pos) { memset(iter, 0, sizeof(*iter)); + iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; @@ -396,15 +428,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter * this version is used by btree_gc before filesystem has gone RW and * multithreaded, so uses the journal_iters list: */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b) { struct btree_node_iter node_iter; bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -415,9 +447,7 @@ void bch2_journal_entries_free(struct bch_fs *c) struct genradix_iter iter; genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); + kvfree(*i); genradix_free(&c->journal_entries); } diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h index 8ca4c10..c9d19da 100644 --- a/libbcachefs/btree_journal_iter.h +++ b/libbcachefs/btree_journal_iter.h @@ -15,6 +15,7 @@ struct journal_iter { */ struct btree_and_journal_iter { + struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -22,6 +23,7 @@ struct btree_and_journal_iter { struct journal_iter journal; struct bpos pos; bool at_end; + bool prefetch; }; struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, @@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); +int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, + struct btree_and_journal_iter *); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 4bd72c8..f2e2c58 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path *path, unsigned level) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, + __time_stats_update(&btree_trans_stats(trans)->lock_hold_times, path->l[level].lock_taken_time, local_clock()); #endif diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 4a5a644..0d5eecb 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -2,12 +2,12 @@ #ifndef _BCACHEFS_BTREE_TYPES_H #define _BCACHEFS_BTREE_TYPES_H +#include #include #include #include "btree_key_cache_types.h" #include "buckets_types.h" -#include "darray.h" #include "errcode.h" #include "journal_types.h" #include "replicas_types.h" diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index c3ff365..e519311 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -14,6 +14,8 @@ #include "snapshot.h" #include "trace.h" +#include + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 17a5938..030291c 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -516,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * bch2_disk_reservation_put(c, &as->disk_res); bch2_btree_reserve_put(as, trans); - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); mutex_lock(&c->btree_interior_update_lock); @@ -1038,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], start_time); } @@ -1629,7 +1629,7 @@ out: bch2_trans_verify_locks(trans); - bch2_time_stats_update(&c->times[n2 + time_stats_update(&c->times[n2 ? BCH_TIME_btree_node_split : BCH_TIME_btree_node_compact], start_time); @@ -1935,7 +1935,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_done(as, trans); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); + time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: if (new_path) @@ -2484,7 +2484,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h index 9b9433d..5f24887 100644 --- a/libbcachefs/btree_write_buffer_types.h +++ b/libbcachefs/btree_write_buffer_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#include "darray.h" +#include #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 54f7826..7dca10b 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1335,7 +1335,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -1345,16 +1345,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } if ((c->opts.buckets_nouse && - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { + !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; } @@ -1397,8 +1397,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvpfree(buckets_nouse, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); @@ -1407,27 +1406,21 @@ err: void bch2_dev_buckets_free(struct bch_dev *ca) { - unsigned i; - - kvpfree(ca->buckets_nouse, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), - sizeof(struct bucket_gens) + ca->mi.nbuckets); + kvfree(ca->buckets_nouse); + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) return -BCH_ERR_ENOMEM_usage_init; - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) return -BCH_ERR_ENOMEM_usage_init; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 226b39c..4cbda66 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -11,7 +11,6 @@ #include "replicas.h" #include "super.h" #include "super-io.h" -#include "thread_with_file.h" #include #include @@ -20,6 +19,7 @@ #include #include #include +#include #include __must_check @@ -155,17 +155,14 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static int bch2_fsck_offline_thread_fn(void *arg) +static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); if (!thr->thr.thr.ret) bch2_fs_stop(c); - - thread_with_stdio_done(&thr->thr); - return 0; } static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) @@ -220,7 +217,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = bch2_run_thread_with_stdio(&thr->thr, + ret = run_thread_with_stdio(&thr->thr, bch2_fsck_thread_exit, bch2_fsck_offline_thread_fn); err: @@ -425,7 +422,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) { struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - bch2_thread_with_file_exit(&ctx->thr); + thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -475,7 +472,7 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ret = bch2_run_thread_with_file(&ctx->thr, + ret = run_thread_with_file(&ctx->thr, &bcachefs_data_ops, bch2_data_thread); if (ret < 0) @@ -763,9 +760,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(void *arg) +static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; c->stdio_filter = current; @@ -793,11 +790,8 @@ static int bch2_fsck_online_thread_fn(void *arg) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - thread_with_stdio_done(&thr->thr); - up(&c->online_fsck_mutex); bch2_ro_ref_put(c); - return 0; } static long bch2_ioctl_fsck_online(struct bch_fs *c, @@ -840,7 +834,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = bch2_run_thread_with_stdio(&thr->thr, + ret = run_thread_with_stdio(&thr->thr, bch2_fsck_thread_exit, bch2_fsck_online_thread_fn); err: diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 33df8cf..1410365 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; if (!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_read_init; if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; @@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_initialized(&c->compress_workspace[i->type])) continue; - if (mempool_init_kvpmalloc_pool( + if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) return -BCH_ERR_ENOMEM_compression_workspace_init; } if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvpmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) + mempool_init_kvmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) return -BCH_ERR_ENOMEM_decompression_workspace_init; return 0; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 7bdba85..b1f147e 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_buf_bytes(b)); + kvfree(n_ondisk); percpu_ref_put(&ca->io_ref); } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index ae29ad0..97773cf 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -219,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.inode = dir; dirent->k.p.snapshot = snapshot; - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + zero_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; return ret; @@ -293,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + unsigned src_update_flags = 0; + bool delete_src, delete_dst; int ret = 0; - if (src_dir.subvol != dst_dir.subvol) - return -EXDEV; - memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); @@ -319,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - src_type = bkey_s_c_to_dirent(old_src).v->d_type; - - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) - return -EOPNOTSUPP; - - /* Lookup dst: */ if (mode == BCH_RENAME) { /* @@ -352,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_s_c_to_dirent(old_dst), dst_inum); if (ret) goto out; - - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; - - if (dst_type == DT_SUBVOL) - return -EOPNOTSUPP; } if (mode != BCH_RENAME_EXCHANGE) @@ -426,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans, } } + if (new_dst->v.d_type == DT_SUBVOL) + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); + + if ((mode == BCH_RENAME_EXCHANGE) && + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; out_set_src: - /* - * If we're deleting a subvolume, we need to really delete the dirent, - * not just emit a whiteout in the current snapshot: + * If we're deleting a subvolume we need to really delete the dirent, + * not just emit a whiteout in the current snapshot - there can only be + * single dirent that points to a given subvolume. + * + * IOW, we don't maintain multiple versions in different snapshots of + * dirents that point to subvolumes - dirents that point to subvolumes + * are only visible in one particular subvolume so it's not necessary, + * and it would be particularly confusing for fsck to have to deal with. */ - if (src_type == DT_SUBVOL) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter); + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && + new_src->k.p.snapshot != old_src.k->p.snapshot; + + delete_dst = old_dst.k && + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && + new_dst->k.p.snapshot != old_dst.k->p.snapshot; + + if (!delete_src || !bkey_deleted(&new_src->k)) { + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); if (ret) goto out; + } - new_src->k.p = src_iter.pos; - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + if (delete_src) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; } - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; + if (delete_dst) { + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; + } if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; @@ -458,41 +472,29 @@ out: return ret; } -int __bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) +int bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); if (ret) return ret; - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, inum); + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); if (ret > 0) ret = -ENOENT; err: if (ret) bch2_trans_iter_exit(trans, iter); - return ret; } @@ -504,7 +506,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, struct btree_iter iter = { NULL }; int ret = lockrestart_do(trans, - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 21ffeb7..f1dd720 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -62,7 +62,7 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, +int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index d503af2..b98e2c2 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); + kvfree(buf->data[i]); buf->data[i] = NULL; } } @@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 8c40c20..3fd33b3 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -176,6 +176,8 @@ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index d32c8be..70a1253 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -2,7 +2,7 @@ #include "bcachefs.h" #include "error.h" #include "super.h" -#include "thread_with_file.h" +#include #define FSCK_ERR_RATELIMIT_NR 10 @@ -105,7 +105,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) do { bch2_print(c, " (y,n, or Y,N for all errors of this type) "); - int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); + int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); if (r < 0) return YN_NO; buf[r] = '\0'; diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 66b945b..d8153fe 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -24,12 +24,12 @@ struct { \ (fifo)->mask = (fifo)->size \ ? roundup_pow_of_two((fifo)->size) - 1 \ : 0; \ - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + kvfree((fifo)->data); \ (fifo)->data = NULL; \ } while (0) diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 1c1ea0f..523507e 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -260,8 +260,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -410,6 +410,21 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + /* Can't move across subvolumes, unless it's a subvolume root: */ + if (src_dir.subvol != dst_dir.subvol && + (!src_inode_u->bi_subvol || + (dst_inum.inum && !dst_inode_u->bi_subvol))) { + ret = -EXDEV; + goto err; + } + + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + + if ((mode == BCH_RENAME_EXCHANGE) && + dst_inode_u->bi_parent_subvol) + dst_inode_u->bi_parent_subvol = src_dir.subvol; + src_inode_u->bi_dir = dst_dir_u->bi_inum; src_inode_u->bi_dir_offset = dst_offset; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 3a4c24c..3dc8630 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (IS_ERR(victim)) return PTR_ERR(victim); + dir = d_inode(path.dentry); if (victim->d_sb->s_fs_info != c) { ret = -EXDEV; goto err; @@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -ENOENT; goto err; } - dir = d_inode(path.dentry); ret = __bch2_unlink(dir, victim, true); if (!ret) { fsnotify_rmdir(dir, victim); d_delete(victim); } - inode_unlock(dir); err: + inode_unlock(dir); dput(victim); path_put(&path); return ret; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index ec419b8..77ea610 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked inode_u; - struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; + subvol_inum inum = inode_inum(inode); + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + if (unlikely(old != inode)) { + discard_new_inode(&inode->v); + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + return inode; +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_info *inode = + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, + to_bch_ei(new_inode(c->vfs_sb))); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + if (ret && inode) + discard_new_inode(&inode->v); + if (ret) + return ERR_PTR(ret); } - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); + return inode; +} - unlock_new_inode(&inode->v); +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + return &inode->v; - return &inode->v; + struct btree_trans *trans = bch2_trans_get(c); + + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (!ret) { + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); + } + bch2_trans_put(trans); + + return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; @@ -293,7 +336,6 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -304,36 +346,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... - */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -352,23 +365,78 @@ err_trans: /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + + int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + if (ret) + return ERR_PTR(ret); + + struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (bch2_err_matches(ret, ENOENT)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s points to missing inode", buf.buf); + printbuf_exit(&buf); + } + if (ret) + goto err; + + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); +out: + bch2_trans_iter_exit(trans, &dirent_iter); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, @@ -1371,6 +1439,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); if (BCH_SUBVOLUME_SNAP(subvol)) @@ -1571,7 +1640,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1582,10 +1650,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; @@ -1881,6 +1946,7 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + sb->s_uuid = c->sb.user_uuid; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 7e82c7b..e4a8a14 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -5,7 +5,6 @@ #include "btree_cache.h" #include "btree_update.h" #include "buckets.h" -#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -18,6 +17,7 @@ #include "xattr.h" #include +#include #include /* struct qstr */ /* @@ -100,8 +100,8 @@ err: } static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -123,17 +123,15 @@ err: return ret; } -static int __lookup_dirent(struct btree_trans *trans, +static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type) + u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; struct bkey_s_c_dirent d; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0); + int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); if (ret) return ret; @@ -144,34 +142,6 @@ static int __lookup_dirent(struct btree_trans *trans, return 0; } -static int __write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static int fsck_write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __write_inode(trans, inode, snapshot)); - bch_err_fn(trans->c, ret); - return ret; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -224,15 +194,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot); + u32 root_inode_snapshot = snapshot; + ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); bch_err_msg(c, ret, "looking up root inode"); if (ret) return ret; root_hash_info = bch2_hash_info_init(c, &root_inode); - ret = __lookup_dirent(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type); + ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type, snapshot); if (bch2_err_matches(ret, ENOENT)) goto create_lostfound; @@ -250,7 +221,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * shouldn't exist here: */ ret = lookup_inode(trans, inum, lostfound, &snapshot); - bch_err_msg(c, ret, "looking up lost+found"); + bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", + inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); return ret; create_lostfound: @@ -312,7 +284,7 @@ static int reattach_inode(struct btree_trans *trans, if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = __write_inode(trans, &lostfound, U32_MAX); + ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } @@ -334,7 +306,7 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return __write_inode(trans, inode, inode_snapshot); + return __bch2_fsck_write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -722,7 +694,7 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_snapshot(trans, desc, hash_info, + bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, @@ -861,7 +833,8 @@ static int check_inode(struct btree_trans *trans, u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -950,8 +923,33 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_subvol) { + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, inode_bi_subvol_missing, + "inode %llu:%u bi_subvol points to missing subvolume %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), + k.k->p.snapshot), + c, inode_bi_subvol_wrong, + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, + le64_to_cpu(s.inode), + le32_to_cpu(s.snapshot))) { + u.bi_subvol = 0; + u.bi_parent_subvol = 0; + do_update = true; + } + } + if (do_update) { - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -1032,7 +1030,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1481,7 +1479,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1491,16 +1489,15 @@ fsck_err: return ret ?: trans_was_restarted(trans, restart_count); } -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - u32 target_snapshot) +static int check_inode_backpointer(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) { struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; struct btree_iter bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; if (!target->bi_dir && @@ -1508,7 +1505,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1548,7 +1545,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_nlink++; target->bi_flags &= ~BCH_INODE_unlinked; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1566,11 +1563,34 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = check_inode_backpointer(trans, iter, d, target, target_snapshot); + if (ret) + goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), c, dirent_d_type_wrong, @@ -1614,15 +1634,65 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } -out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } +static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; + int ret = 0; + + ret = subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_subvol, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) + return __remove_dirent(trans, d.k->p); + + ret = lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + return ret; +fsck_err: + return ret; +} + static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bch_hash_info *hash_info, @@ -1707,50 +1777,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - struct bch_inode_unpacked subvol_root; - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; - - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) { - ret = __remove_dirent(trans, d.k->p); - goto err; - } - - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - ret = -EINVAL; - goto err; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __write_inode(trans, &subvol_root, target_snapshot); - if (ret) - goto err; - } - - ret = check_dirent_target(trans, iter, d, &subvol_root, - target_snapshot); + ret = check_subvol_dirent(trans, iter, d); if (ret) goto err; } else { @@ -1776,12 +1803,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - } - - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) - i->count++; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + } out: err: fsck_err: @@ -1919,7 +1945,7 @@ static int check_root_trans(struct btree_trans *trans) 0, NULL); root_inode.bi_inum = inum; - ret = __write_inode(trans, &root_inode, snapshot); + ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); bch_err_msg(c, ret, "writing root inode"); } err: @@ -2291,7 +2317,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __write_inode(trans, &u, k.k->p.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); } fsck_err: return ret; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 086f009..dbe37cc 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -static int bch2_inode_peek_nowarn(struct btree_trans *trans, +int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) @@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } +int __bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +int bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fsck_write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); + return ret; +} + struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index b63f312..9a9353c 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); +int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); @@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans, return bch2_inode_write_flags(trans, iter, inode, 0); } +int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); +int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); + void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); void bch2_inode_init_late(struct bch_inode_unpacked *, u64, diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 3c574d8..dce136c 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop) container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.op.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); promote_free(c, op); } @@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) static void bch2_rbio_done(struct bch_read_bio *rbio) { if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); bio_endio(&rbio->bio); } diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index ef3a53f..13b3514 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); + __time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif @@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl) EBUG_ON(op->open_buckets.nr); - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_MOVE)) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index bc89077..214c803 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -27,6 +27,26 @@ static const char * const bch2_journal_errors[] = { NULL }; +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); @@ -54,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + if (buf->write_done) + prt_printf(out, "write done\n"); + else if (buf->write_allocated) + prt_printf(out, "write allocated\n"); + else if (buf->write_started) + prt_printf(out, "write started\n"); + printbuf_indent_sub(out, 2); } @@ -66,26 +93,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); -} - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); } static inline struct journal_buf * @@ -174,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } +void bch2_journal_do_writes(struct journal *j) +{ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + idx; + + if (w->write_started && !w->write_allocated) + break; + if (w->write_started) + continue; + + if (!journal_state_count(j->reservations, idx)) { + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + + break; + } +} + /* * Final processing when the last reference of a journal buffer has been * dropped. Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +void bch2_journal_buf_put_final(struct journal *j, u64 seq) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + bch2_journal_do_writes(j); } /* @@ -380,11 +407,14 @@ static int journal_entry_open(struct journal *j) BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; buf->need_flush_to_write_buffer = true; + buf->write_started = false; + buf->write_allocated = false; + buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -418,9 +448,10 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(c->io_complete_wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); + if (nr_unwritten_journal_entries(j) == 1) + mod_delayed_work(j->wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) @@ -445,20 +476,16 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - long delta; spin_lock(&j->lock); - if (!__journal_entry_is_open(j->reservations)) - goto unlock; - - delta = journal_cur_buf(j)->expires - jiffies; + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; - if (delta > 0) - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -unlock: + if (delta > 0) + mod_delayed_work(j->wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } spin_unlock(&j->lock); } @@ -473,33 +500,32 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + ret = JOURNAL_ERR_journal_full; + can_discard = j->can_discard; + goto out; + } - spin_lock(&j->lock); + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; - /* check once more in case somebody else shut things down... */ - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); + if (bch2_journal_error(j)) return -BCH_ERR_erofs_journal_err; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { + ret = JOURNAL_ERR_max_in_flight; + goto out; } + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { - spin_unlock(&j->lock); - return 0; - } - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - /* - * Don't want to close current journal entry, just need to - * invoke reclaim: - */ - ret = JOURNAL_ERR_journal_full; + ret = 0; goto unlock; } @@ -515,30 +541,30 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j); - - if (ret == JOURNAL_ERR_max_in_flight) { - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true); - if (trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_journal_bufs_to_text(&buf, j); - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - } - count_event(c, journal_entry_full); - } + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); - - if (!ret) +out: + if (ret == JOURNAL_ERR_retry) goto retry; + if (!ret) + return 0; + if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; + if (ret == JOURNAL_ERR_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -727,7 +753,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); if (!ret) - bch2_time_stats_update(j->flush_seq_time, start_time); + time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? ret2 : 0; } @@ -1157,7 +1183,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - unsigned ptr; u64 last_seq = cur_seq, nr, seq; genradix_for_each_reverse(&c->journal_entries, iter, _i) { @@ -1211,8 +1236,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) p = journal_seq_pin(j, seq); p->devs.nr = 0; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) + bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } @@ -1240,13 +1265,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) void bch2_dev_journal_exit(struct bch_dev *ca) { - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + kfree(ja->bio[i]); + ja->bio[i] = NULL; + } - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; + kfree(ja->buckets); + kfree(ja->bucket_seq); + ja->buckets = NULL; + ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) @@ -1256,14 +1285,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); - unsigned i, nr_bvecs; ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -1273,13 +1301,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -BCH_ERR_ENOMEM_dev_journal_init; - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!ca->journal.bio) - return -BCH_ERR_ENOMEM_dev_journal_init; + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); + if (!ja->bio[i]) + return -BCH_ERR_ENOMEM_dev_journal_init; - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) @@ -1287,14 +1320,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned j, dst = 0; + unsigned dst = 0; - for (i = 0; i < nr; i++) - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + for (unsigned i = 0; i < nr; i++) + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { - for (i = 0; i < ja->nr; i++) + for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } @@ -1303,19 +1336,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned i; + if (j->wq) + destroy_workqueue(j->wq); darray_exit(&j->early_journal_entries); - for (i = 0; i < ARRAY_SIZE(j->buf); i++) - kvpfree(j->buf[i].data, j->buf[i].buf_size); + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + kvfree(j->buf[i].data); free_fifo(&j->pin); } int bch2_fs_journal_init(struct journal *j) { static struct lock_class_key res_key; - unsigned i; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); @@ -1336,14 +1369,20 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; + j->buf[i].idx = i; } j->pin.front = j->pin.back = 1; + + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) + return -BCH_ERR_ENOMEM_fs_other_alloc; return 0; } @@ -1455,7 +1494,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1473,7 +1511,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) list_for_each_entry(pin, &pin_list->list[i], list) { prt_printf(out, "\t%px %ps", pin, pin->flush); prt_newline(out); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 4544ce2..7c7528f 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u } bool bch2_journal_entry_close(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64, bool); +void bch2_journal_do_writes(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { @@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); } } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index bfd6585..e31e215 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -17,6 +17,38 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + u64 offset; + + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); + + if (i != j->ptrs.data) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + i->dev, i->bucket, i->bucket_offset, i->sector); + } +} + +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + + bch2_journal_ptrs_to_text(out, c, j); + + struct jset_entry *entry; + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); + break; + } +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -52,8 +84,7 @@ static void __journal_replay_free(struct bch_fs *c, BUG_ON(*p != i); *p = NULL; - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + kvfree(i); } static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) @@ -84,9 +115,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; - struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; /* Is this entry older than the range we need? */ @@ -131,72 +162,61 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes)) { - i = dup; - goto found; - } + bool identical = bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes); + bool not_identical = !identical && + entry_ptr.csum_good && + dup->csum_good; + + bool same_device = false; + darray_for_each(dup->ptrs, ptr) + if (ptr->dev == ca->dev_idx) + same_device = true; + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) + goto out; - if (!entry_ptr.csum_good) { - i = dup; - goto found; - } + bch2_journal_replay_to_text(&buf, c, dup); + + fsck_err_on(same_device, + c, journal_entry_dup_same_device, + "duplicate journal entry on same device\n %s", + buf.buf); - if (!dup->csum_good) + fsck_err_on(not_identical, + c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries\n %s", + buf.buf); + + if (entry_ptr.csum_good && !identical) goto replace; - fsck_err(c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - i = dup; - goto found; + goto out; } replace: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; - i->nr_ptrs = 0; + darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; - } - /* The first ptr should represent the jset we kept: */ - memcpy(i->ptrs + i->nr_ptrs, - dup->ptrs, - sizeof(dup->ptrs[0]) * dup->nr_ptrs); - i->nr_ptrs += dup->nr_ptrs; + darray_for_each(dup->ptrs, ptr) + darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); + } else { + darray_push(&i->ptrs, entry_ptr); } *_i = i; - return 0; -found: - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { - if (ptr->dev == ca->dev_idx) { - bch_err(c, "duplicate journal entry %llu on same device", - le64_to_cpu(i->j.seq)); - goto out; - } - } - - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - goto out; - } - - i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: + printbuf_exit(&buf); return ret; } @@ -741,6 +761,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 16; + int ret = 0; + + if (journal_entry_err_on(vstruct_bytes(entry) < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } +fsck_err: + return ret; +} + +static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -913,11 +964,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); + n = kvmalloc(new_size, GFP_KERNEL); if (!n) return -BCH_ERR_ENOMEM_journal_read_buf_realloc; - kvpfree(b->data, b->size); + kvfree(b->data); b->data = n; b->size = new_size; return 0; @@ -1102,16 +1153,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!r) continue; - for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + darray_for_each(r->ptrs, i) + if (i->dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, i->sector) + vstruct_sectors(&r->j, c->block_bits); - ja->cur_idx = r->ptrs[i].bucket; + ja->cur_idx = i->bucket; ja->sectors_free = ca->mi.bucket_size - wrote; goto found; } - } } found: mutex_unlock(&jlist->lock); @@ -1144,7 +1194,7 @@ found: ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvpfree(buf.data, buf.size); + kvfree(buf.data); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -1155,27 +1205,6 @@ err: goto out; } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - unsigned i; - - for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); - u64 offset; - - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); - - if (i) - prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - j->ptrs[i].dev, - j->ptrs[i].bucket, - j->ptrs[i].bucket_offset, - j->ptrs[i].sector); - } -} - int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, @@ -1353,32 +1382,31 @@ int bch2_journal_read(struct bch_fs *c, .e.data_type = BCH_DATA_journal, .e.nr_required = 1, }; - unsigned ptr; i = *_i; if (!i || i->ignore) continue; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (!i->ptrs[ptr].csum_good) - bch_err_dev_offset(ca, i->ptrs[ptr].sector, + if (!ptr->csum_good) + bch_err_dev_offset(ca, ptr->sector, "invalid journal checksum, seq %llu%s", le64_to_cpu(i->j.seq), i->csum_good ? " (had good copy on another device)" : ""); } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs[0].dev), + bch_dev_bkey_exists(c, i->ptrs.data[0].dev), &i->j, - i->ptrs[0].sector, + i->ptrs.data[0].sector, READ); if (ret) goto err; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + darray_for_each(i->ptrs, ptr) + replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; bch2_replicas_entry_sort(&replicas.e); @@ -1545,7 +1573,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1556,7 +1584,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) swap(buf->buf_size, new_size); spin_unlock(&j->lock); - kvpfree(new_buf, new_size); + kvfree(new_buf); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) @@ -1566,17 +1594,17 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) static CLOSURE_CALLBACK(journal_write_done) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq; + u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; - bch2_time_stats_update(!JSET_NO_FLUSH(w->data) - ? j->flush_write_time - : j->noflush_write_time, j->write_start_time); + time_stats_update(!JSET_NO_FLUSH(w->data) + ? j->flush_write_time + : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); @@ -1591,63 +1619,68 @@ static CLOSURE_CALLBACK(journal_write_done) if (err) bch2_fatal_error(c); - spin_lock(&j->lock); - seq = le64_to_cpu(w->data->seq); + closure_debug_destroy(cl); + spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + w->write_done = true; + + bool completed = false; - if (!err) { - if (!JSET_NO_FLUSH(w->data)) { + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + w = j->buf + (seq & JOURNAL_BUF_MASK); + if (!w->write_done) + break; + + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; bch2_do_discards(c); closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); } - } else if (!j->err_seq || seq < j->err_seq) - j->err_seq = seq; - j->seq_ondisk = seq; + j->seq_ondisk = seq; - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + completed = true; + } - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); + if (completed) { + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - closure_wake_up(&w->wait); - journal_wake(j); + closure_wake_up(&w->wait); + journal_wake(j); + } - if (!journal_state_count(new, new.unwritten_idx) && - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { - spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1657,46 +1690,46 @@ static CLOSURE_CALLBACK(journal_write_done) * previous entries still in flight - the current journal entry * might want to be written now: */ - - spin_unlock(&j->lock); - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); - } else { - spin_unlock(&j->lock); + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) { - struct bch_dev *ca = bio->bi_private; + struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); + struct bch_dev *ca = jbio->ca; struct journal *j = &ca->fs->journal; - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; + struct journal_buf *w = j->buf + jbio->buf_idx; if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { + unsigned long flags; + spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } - closure_put(&j->io); + closure_put(&w->io); percpu_ref_put(&ca->io_ref); } static CLOSURE_CALLBACK(do_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct journal_device *ja = &ca->journal; + if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); @@ -1706,7 +1739,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - bio = ca->journal.bio; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1725,11 +1758,10 @@ static CLOSURE_CALLBACK(do_journal_write) trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = - le64_to_cpu(w->data->seq); + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) @@ -1802,6 +1834,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + struct jset_entry_datetime *d = + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); + d->entry.type = BCH_JSET_ENTRY_datetime; + d->seconds = cpu_to_le64(ktime_get_real_seconds()); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1901,16 +1938,16 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * CLOSURE_CALLBACK(bch2_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; - struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(w->write_allocated); j->write_start_time = local_clock(); @@ -1954,12 +1991,14 @@ CLOSURE_CALLBACK(bch2_journal_write) * bch2_journal_space_available(): */ w->sectors = 0; + w->write_allocated = true; /* * journal entry has been compacted and allocated, recalculate space * available: */ bch2_journal_space_available(j); + bch2_journal_do_writes(j); spin_unlock(&j->lock); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); @@ -1983,25 +2022,29 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (!JSET_NO_FLUSH(w->data)) + closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq)); + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); - bio = ca->journal.bio; + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_PREFLUSH); + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); } } - continue_at(cl, do_journal_write, c->io_complete_wq); + continue_at(cl, do_journal_write, j->wq); return; no_io: - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); return; err: bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index c035e7c..f18b900 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -2,19 +2,22 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H +#include + +struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration */ struct journal_replay { - struct journal_ptr { - bool csum_good; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; - } ptrs[BCH_REPLICAS_MAX]; - unsigned nr_ptrs; + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; bool ignore; @@ -62,4 +65,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); +static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 820d25e..f29fd39 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j) ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], - &j->low_on_space_start, low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j, return; } - reclaim = __journal_pin_drop(j, dst); + bool reclaim = __journal_pin_drop(j, dst); bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); BUG_ON(seq < journal_last_seq(j)); - reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + + spin_unlock(&j->lock); } /** diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index ae4fb8c..156691c 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -2,8 +2,8 @@ #include "bcachefs.h" #include "journal_sb.h" -#include "darray.h" +#include #include /* BCH_SB_FIELD_journal: */ diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index 0200e29..024c9b1 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -2,10 +2,11 @@ #include "bcachefs.h" #include "btree_iter.h" -#include "eytzinger.h" #include "journal_seq_blacklist.h" #include "super-io.h" +#include + /* * journal_seq_blacklist machinery: * @@ -119,8 +120,7 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 38817c7..011f7a0 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -18,6 +18,7 @@ * the journal that are being staged or in flight. */ struct journal_buf { + struct closure io; struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); @@ -33,10 +34,14 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; - bool noflush; /* write has already been kicked off, and was noflush */ - bool must_flush; /* something wants a flush */ - bool separate_flush; - bool need_flush_to_write_buffer; + bool noflush:1; /* write has already been kicked off, and was noflush */ + bool must_flush:1; /* something wants a flush */ + bool separate_flush:1; + bool need_flush_to_write_buffer:1; + bool write_started:1; + bool write_allocated:1; + bool write_done:1; + u8 idx; }; /* @@ -134,6 +139,7 @@ enum journal_flags { /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ + x(retry) \ x(blocked) \ x(max_in_flight) \ x(journal_full) \ @@ -149,6 +155,13 @@ enum journal_errors { typedef DARRAY(u64) darray_u64; +struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; + + struct bio bio; +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -203,8 +216,8 @@ struct journal { wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure io; struct delayed_work write_work; + struct workqueue_struct *wq; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; @@ -274,14 +287,9 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - u64 low_on_space_start; - u64 low_on_pin_start; - u64 max_in_flight_start; - u64 write_buffer_full_start; - - struct bch2_time_stats *flush_write_time; - struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *flush_seq_time; + struct time_stats *flush_write_time; + struct time_stats *noflush_write_time; + struct time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; @@ -313,7 +321,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio; + struct journal_bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 5623cee..69098ee 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -EINVAL; + return -BCH_ERR_remove_would_lose_data; return 0; } @@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -EINVAL; + return -BCH_ERR_remove_with_metadata_missing_unimplemented; trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -132,10 +132,8 @@ retry: ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); - if (ret) { - bch_err(c, "Cannot drop device without losing data"); + if (ret) break; - } ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c index 3c21981..181efa4 100644 --- a/libbcachefs/nocow_locking.c +++ b/libbcachefs/nocow_locking.c @@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, u64 start_time = local_clock(); __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); } } diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index cc2672c..75fdce3 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -6,12 +6,15 @@ #include "replicas.h" #include "super-io.h" +#include + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 654a4b2..983cce7 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -3,9 +3,10 @@ #define _BCACHEFS_REPLICAS_H #include "bkey.h" -#include "eytzinger.h" #include "replicas_types.h" +#include + void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry_v1 *); diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c index b6bf0eb..5980ba2 100644 --- a/libbcachefs/sb-clean.c +++ b/libbcachefs/sb-clean.c @@ -171,22 +171,6 @@ fsck_err: return ERR_PTR(ret); } -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index 441dcb1..626eaae 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -6,12 +6,13 @@ */ #include "bcachefs.h" -#include "darray.h" #include "recovery.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" +#include + #define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) /* diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h index c08aacd..63f18c7 100644 --- a/libbcachefs/sb-errors_types.h +++ b/libbcachefs/sb-errors_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SB_ERRORS_TYPES_H #define _BCACHEFS_SB_ERRORS_TYPES_H -#include "darray.h" +#include #define BCH_SB_ERRS() \ x(clean_but_journal_not_empty, 0) \ @@ -250,7 +250,10 @@ x(hash_table_key_duplicate, 242) \ x(hash_table_key_wrong_offset, 243) \ x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index be0a941..e4d4d84 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H -#include "darray.h" +#include extern char * const bch2_member_error_strs[]; diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 89fdb7c..3976f80 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s } static __always_inline int -bch2_hash_lookup(struct btree_trans *trans, +bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + unsigned flags, u32 snapshot) { struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), @@ -194,6 +189,19 @@ bch2_hash_lookup(struct btree_trans *trans, return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; } +static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); +} + static __always_inline int bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, @@ -251,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_snapshot(struct btree_trans *trans, +int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, @@ -320,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans, struct bkey_i *insert, bch_str_hash_flags_t str_hash_flags) { - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - insert->k.p.inode = inum.inum; - return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_set_in_snapshot(trans, desc, info, inum, + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 7c67c28..e7ee52c 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -42,6 +42,36 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + struct bch_inode_unpacked inode; + struct btree_iter inode_iter = {}; + ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, + 0); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode_iter.k.p.snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_tree; @@ -73,6 +103,7 @@ static int check_subvol(struct btree_trans *trans, } } +err: fsck_err: return ret; } diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index a6f56f6..3ca1d18 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -2,7 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H -#include "darray.h" #include "subvolume_types.h" enum bkey_invalid_flags; diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index ae644ad..40f16e3 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" +#include typedef DARRAY(u32) snapshot_id_list; diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 95e80e0..f376209 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -3,12 +3,12 @@ #define _BCACHEFS_SUPER_IO_H #include "extents.h" -#include "eytzinger.h" #include "super_types.h" #include "super.h" #include "sb-members.h" #include +#include static inline bool bch2_version_compatible(u16 version) { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index da8697c..68704a8 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -67,6 +67,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -95,16 +96,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) if (likely(!stdio)) { vprintk(fmt, args); } else { - unsigned long flags; - if (fmt[0] == KERN_SOH[0]) fmt += 2; - spin_lock_irqsave(&stdio->output_lock, flags); - prt_vprintf(&stdio->output_buf, fmt, args); - spin_unlock_irqrestore(&stdio->output_lock, flags); - - wake_up(&stdio->output_wait); + stdio_redirect_vprintf(stdio, true, fmt, args); } va_end(args); } @@ -520,7 +515,7 @@ static void __bch2_fs_free(struct bch_fs *c) unsigned i; for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_exit(&c->times[i]); + time_stats_exit(&c->times[i]); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); @@ -576,7 +571,7 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); - kvpfree(c, sizeof(*c)); + kvfree(c); module_put(THIS_MODULE); } @@ -715,7 +710,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; @@ -753,7 +748,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); + time_stats_init(&c->times[i]); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); @@ -882,8 +877,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -1124,7 +1119,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_newline(&buf); prt_bdevname(&buf, fs->bdev); - prt_str(&buf, "believes seq of "); + prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); @@ -1168,8 +1163,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); - bch2_time_stats_exit(&ca->io_latency[WRITE]); - bch2_time_stats_exit(&ca->io_latency[READ]); + time_stats_quantiles_exit(&ca->io_latency[WRITE]); + time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1260,8 +1255,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - bch2_time_stats_init(&ca->io_latency[READ]); - bch2_time_stats_init(&ca->io_latency[WRITE]); + time_stats_quantiles_init(&ca->io_latency[READ]); + time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index cee80c4..c86a93a 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -930,10 +930,10 @@ SHOW(bch2_dev) sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ]); + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c deleted file mode 100644 index b1c867a..0000000 --- a/libbcachefs/thread_with_file.c +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "printbuf.h" -#include "thread_with_file.h" - -#include -#include -#include -#include -#include - -void bch2_thread_with_file_exit(struct thread_with_file *thr) -{ - if (thr->task) { - kthread_stop(thr->task); - put_task_struct(thr->task); - } -} - -int bch2_run_thread_with_file(struct thread_with_file *thr, - const struct file_operations *fops, - int (*fn)(void *)) -{ - struct file *file = NULL; - int ret, fd = -1; - unsigned fd_flags = O_CLOEXEC; - - if (fops->read && fops->write) - fd_flags |= O_RDWR; - else if (fops->read) - fd_flags |= O_RDONLY; - else if (fops->write) - fd_flags |= O_WRONLY; - - char name[TASK_COMM_LEN]; - get_task_comm(name, current); - - thr->ret = 0; - thr->task = kthread_create(fn, thr, "%s", name); - ret = PTR_ERR_OR_ZERO(thr->task); - if (ret) - return ret; - - ret = get_unused_fd_flags(fd_flags); - if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile(name, fops, thr, fd_flags); - ret = PTR_ERR_OR_ZERO(file); - if (ret) - goto err; - - fd_install(fd, file); - get_task_struct(thr->task); - wake_up_process(thr->task); - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (thr->task) - kthread_stop(thr->task); - return ret; -} - -static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) -{ - return thr->stdio.output_buf.pos || - thr->output2.nr || - thr->thr.done; -} - -static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - size_t copied = 0, b; - int ret = 0; - - if ((file->f_flags & O_NONBLOCK) && - !thread_with_stdio_has_output(thr)) - return -EAGAIN; - - ret = wait_event_interruptible(thr->stdio.output_wait, - thread_with_stdio_has_output(thr)); - if (ret) - return ret; - - if (thr->thr.done) - return 0; - - while (len) { - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); - if (ret) - break; - - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); - - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); - memmove(thr->stdio.output_buf.buf, - thr->stdio.output_buf.buf + b, - thr->stdio.output_buf.pos - b); - - thr->output2.nr += b; - thr->stdio.output_buf.pos -= b; - spin_unlock_irq(&thr->stdio.output_lock); - - b = min(len, thr->output2.nr); - if (!b) - break; - - b -= copy_to_user(buf, thr->output2.data, b); - if (!b) { - ret = -EFAULT; - break; - } - - copied += b; - buf += b; - len -= b; - - memmove(thr->output2.data, - thr->output2.data + b, - thr->output2.nr - b); - thr->output2.nr -= b; - } - - return copied ?: ret; -} - -static int thread_with_stdio_release(struct inode *inode, struct file *file) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - bch2_thread_with_file_exit(&thr->thr); - printbuf_exit(&thr->stdio.input_buf); - printbuf_exit(&thr->stdio.output_buf); - darray_exit(&thr->output2); - thr->exit(thr); - return 0; -} - -#define WRITE_BUFFER 4096 - -static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) -{ - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; -} - -static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.input_buf; - size_t copied = 0; - ssize_t ret = 0; - - while (len) { - if (thr->thr.done) { - ret = -EPIPE; - break; - } - - size_t b = len - fault_in_readable(ubuf, len); - if (!b) { - ret = -EFAULT; - break; - } - - spin_lock(&thr->stdio.input_lock); - if (buf->pos < WRITE_BUFFER) - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); - b = min(len, printbuf_remaining_size(buf)); - - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { - ubuf += b; - len -= b; - copied += b; - buf->pos += b; - } - spin_unlock(&thr->stdio.input_lock); - - if (b) { - wake_up(&thr->stdio.input_wait); - } else { - if ((file->f_flags & O_NONBLOCK)) { - ret = -EAGAIN; - break; - } - - ret = wait_event_interruptible(thr->stdio.input_wait, - thread_with_stdio_has_input_space(thr)); - if (ret) - break; - } - } - - return copied ?: ret; -} - -static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output_wait, wait); - poll_wait(file, &thr->stdio.input_wait, wait); - - __poll_t mask = 0; - - if (thread_with_stdio_has_output(thr)) - mask |= EPOLLIN; - if (thread_with_stdio_has_input_space(thr)) - mask |= EPOLLOUT; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static const struct file_operations thread_with_stdio_fops = { - .release = thread_with_stdio_release, - .read = thread_with_stdio_read, - .write = thread_with_stdio_write, - .poll = thread_with_stdio_poll, - .llseek = no_llseek, -}; - -int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)) -{ - thr->stdio.input_buf = PRINTBUF; - thr->stdio.input_buf.atomic++; - spin_lock_init(&thr->stdio.input_lock); - init_waitqueue_head(&thr->stdio.input_wait); - - thr->stdio.output_buf = PRINTBUF; - thr->stdio.output_buf.atomic++; - spin_lock_init(&thr->stdio.output_lock); - init_waitqueue_head(&thr->stdio.output_wait); - - darray_init(&thr->output2); - thr->exit = exit; - - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); -} - -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) -{ - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); - - if (stdio->done) - return -1; - - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); - return ret; -} - -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) -{ - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); - - if (stdio->done) - return -1; - - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - char *n = memchr(stdio->input_buf.buf, '\n', ret); - if (n) - ret = min(ret, n + 1 - stdio->input_buf.buf); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); - return ret; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/thread_with_file.h b/libbcachefs/thread_with_file.h deleted file mode 100644 index 05879c5..0000000 --- a/libbcachefs/thread_with_file.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_H -#define _BCACHEFS_THREAD_WITH_FILE_H - -#include "thread_with_file_types.h" - -struct task_struct; - -struct thread_with_file { - struct task_struct *task; - int ret; - bool done; -}; - -void bch2_thread_with_file_exit(struct thread_with_file *); -int bch2_run_thread_with_file(struct thread_with_file *, - const struct file_operations *, - int (*fn)(void *)); - -struct thread_with_stdio { - struct thread_with_file thr; - struct stdio_redirect stdio; - DARRAY(char) output2; - void (*exit)(struct thread_with_stdio *); -}; - -static inline void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input_wait); - wake_up(&thr->stdio.output_wait); -} - -int bch2_run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)); -int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); -int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); - -#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/libbcachefs/thread_with_file_types.h b/libbcachefs/thread_with_file_types.h deleted file mode 100644 index 90b5e64..0000000 --- a/libbcachefs/thread_with_file_types.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H -#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H - -struct stdio_redirect { - spinlock_t output_lock; - wait_queue_head_t output_wait; - struct printbuf output_buf; - - spinlock_t input_lock; - wait_queue_head_t input_wait; - struct printbuf input_buf; - bool done; -}; - -#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 56b815f..5397350 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -22,9 +23,8 @@ #include #include #include +#include -#include "eytzinger.h" -#include "mean_and_variance.h" #include "util.h" static const char si_units[] = "?kMGTPEZY"; @@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) } #endif -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - void bch2_pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -370,120 +344,6 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns) prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -/* time stats: */ - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - - if (time_after64(end, start)) { - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - bch2_quantiles_update(&stats->quantiles, duration); - } - - if (time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - stats->last_event = end; - } -} - -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - for (struct bch2_time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - bch2_time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct bch2_time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct bch2_time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - bch2_time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -503,19 +363,18 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 #define TABSTOP_SIZE 12 -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) +void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) { - const struct time_unit *u; + struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; - int i; + u64 f_stddev = 0, d_stddev = 0; if (stats->buffer) { int cpu; spin_lock_irq(&stats->lock); for_each_possible_cpu(cpu) - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); spin_unlock_irq(&stats->lock); } @@ -570,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -593,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); printbuf_tabstops_reset(out); - i = eytzinger0_first(NR_QUANTILES); - u = pick_time_units(stats->quantiles.entries[i].m); - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu ", - div_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; + if (quantiles) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + pick_time_units(quantiles->entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } } } -#else -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} -#endif - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - free_percpu(stats->buffer); -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} /* ratelimit: */ @@ -863,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - -static void mempool_free_vp(void *element, void *pool_data) -{ - size_t size = (size_t) pool_data; - - vpfree(element, size); -} - -static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - - return vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return size < PAGE_SIZE - ? mempool_init_kmalloc_pool(pool, min_nr, size) - : mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} - #if 0 void eytzinger1_test(void) { diff --git a/libbcachefs/util.h b/libbcachefs/util.h index b414736..1b3aced 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -5,22 +5,21 @@ #include #include #include +#include #include #include #include -#include #include #include #include #include #include +#include #include +#include #include #include - -#include "mean_and_variance.h" - -#include "darray.h" +#include struct closure; @@ -53,38 +52,6 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void vpfree(void *p, size_t size) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - free_pages((unsigned long) p, get_order(size)); -} - -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -{ - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); -} - -static inline void kvpfree(void *p, size_t size) -{ - if (size < PAGE_SIZE) - kfree(p); - else - vpfree(p, size); -} - -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -{ - return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); - #define HEAP(type) \ struct { \ size_t size, used; \ @@ -97,13 +64,13 @@ struct { \ ({ \ (heap)->used = 0; \ (heap)->size = (_size); \ - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ }) #define free_heap(heap) \ do { \ - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + kvfree((heap)->data); \ (heap)->data = NULL; \ } while (0) @@ -361,83 +328,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct bch2_quantiles { - struct bch2_quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct bch2_time_stat_buffer { - unsigned nr; - struct bch2_time_stat_buffer_entry { - u64 start; - u64 end; - } entries[32]; -}; - -struct bch2_time_stats { - spinlock_t lock; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - struct bch2_quantiles quantiles; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct bch2_time_stat_buffer __percpu *buffer; -}; - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - if (v != !!*start) { - if (!v) { - bch2_time_stats_update(stats, *start); - *start = 0; - } else { - *start = local_clock() ?: 1; - return true; - } - } - - return false; -} -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - bool ret = v && !*start; - *start = v; - return ret; -} -#endif - -void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); - -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); +void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); #define ewma_add(ewma, val, weight) \ ({ \ @@ -738,34 +629,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - -/* just the memmove, doesn't update @_nr */ -#define __array_insert_item(_array, _nr, _pos) \ - memmove(&(_array)[(_pos) + 1], \ - &(_array)[(_pos)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))) - -#define array_insert_item(_array, _nr, _pos, _new_item) \ -do { \ - __array_insert_item(_array, _nr, _pos); \ - (_nr)++; \ - (_array)[(_pos)] = (_new_item); \ -} while (0) - -#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -do { \ - (_nr) -= (_nr_to_remove); \ - memmove(&(_array)[(_pos)], \ - &(_array)[(_pos) + (_nr_to_remove)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))); \ -} while (0) - -#define array_remove_item(_array, _nr, _pos) \ - array_remove_items(_array, _nr, _pos, 1) - static inline void __move_gap(void *array, size_t element_size, size_t nr, size_t size, size_t old_gap, size_t new_gap) diff --git a/libbcachefs/darray.c b/linux/darray.c similarity index 68% rename from libbcachefs/darray.c rename to linux/darray.c index ac35b8b..80e7795 100644 --- a/libbcachefs/darray.c +++ b/linux/darray.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2022-2024 Kent Overstreet + */ +#include #include #include -#include "darray.h" -int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); diff --git a/libbcachefs/mean_and_variance.c b/linux/mean_and_variance.c similarity index 93% rename from libbcachefs/mean_and_variance.c rename to linux/mean_and_variance.c index bf0ef66..b93d150 100644 --- a/libbcachefs/mean_and_variance.c +++ b/linux/mean_and_variance.c @@ -40,10 +40,9 @@ #include #include #include +#include #include -#include "mean_and_variance.h" - u128_u u128_div(u128_u n, u64 d) { u128_u r; @@ -107,10 +106,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 x, bool initted, u8 weight) { // previous weighted variance. - u8 w = s->weight; + u8 w = weight; u64 var_w0 = s->variance; // new value weighted. s64 x_w = x << w; @@ -119,14 +119,13 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 // new mean weighted. s64 u_w1 = s->mean + diff; - if (!s->init) { + if (!initted) { s->mean = x_w; s->variance = 0; } else { s->mean = u_w1; s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); @@ -134,9 +133,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight) { - return fast_divpow2(s.mean, s.weight); + return fast_divpow2(s.mean, weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); @@ -144,10 +144,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight) { // always positive don't need fast divpow2 - return s.variance >> s.weight; + return s.variance >> weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); @@ -155,9 +156,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight) { - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); diff --git a/linux/mempool.c b/linux/mempool.c index 74d4fbb..74ed17b 100644 --- a/linux/mempool.c +++ b/linux/mempool.c @@ -522,6 +522,19 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)pool_data; + return kvmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kvmalloc); + +void mempool_kvfree(void *element, void *pool_data) +{ + kvfree(element); +} +EXPORT_SYMBOL(mempool_kvfree); + /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. diff --git a/linux/sort.c b/linux/sort.c new file mode 100644 index 0000000..ffa4817 --- /dev/null +++ b/linux/sort.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A fast, small, non-recursive O(n log n) sort for the Linux kernel + * + * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, + * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. + * + * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n + * better) at the expense of stack usage and much larger code to avoid + * quicksort's O(n^2) worst case. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap; +}; + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) +{ + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap(a, b, (int)size); + return; + } + + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size, priv); +} + +#define _CMP_WRAPPER ((cmp_r_func_t)0L) + +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) +{ + if (cmp == _CMP_WRAPPER) + return ((const struct wrapper *)priv)->cmp(a, b); + return cmp(a, b, priv); +} + +/** + * parent - given the offset of the child, find the offset of the parent. + * @i: the offset of the heap element whose parent is sought. Non-zero. + * @lsbit: a precomputed 1-bit mask, equal to "size & -size" + * @size: size of each element + * + * In terms of array indexes, the parent of element j = @i/@size is simply + * (j-1)/2. But when working in byte offsets, we can't use implicit + * truncation of integer divides. + * + * Fortunately, we only need one bit of the quotient, not the full divide. + * @size has a least significant bit. That bit will be clear if @i is + * an even multiple of @size, and set if it's an odd multiple. + * + * Logically, we're doing "if (i & lsbit) i -= size;", but since the + * branch is unpredictable, it's done with a bit of clever branch-free + * code instead. + */ +__attribute_const__ __always_inline +static size_t parent(size_t i, unsigned int lsbit, size_t size) +{ + i -= size; + i -= size & -(i & lsbit); + return i / 2; +} + +/** + * sort_r - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * @priv: third argument passed to comparison function + * + * This function does a heapsort on the given array. You may provide + * a swap_func function if you need to do something more than a memory + * copy (e.g. fix up pointers or auxiliary data), but the built-in swap + * avoids a slow retpoline and so is significantly faster. + * + * Sorting time is O(n log n) both on average and worst-case. While + * quicksort is slightly faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. + */ +void sort_r(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + /* pre-scale counters for performance */ + size_t n = num * size, a = (num/2) * size; + const unsigned int lsbit = size & -size; /* Used to find parent */ + + if (!a) /* num < 2 || size == 0 */ + return; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* + * Loop invariants: + * 1. elements [a,n) satisfy the heap property (compare greater than + * all of their children), + * 2. elements [n,num*size) are sorted, and + * 3. a <= b <= c <= d <= n (whenever they are valid). + */ + for (;;) { + size_t b, c, d; + + if (a) /* Building heap: sift down --a */ + a -= size; + else if (n -= size) /* Sorting: Extract root to --n */ + do_swap(base, base + n, size, swap_func, priv); + else /* Sort complete */ + break; + + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + size, (d = c + size) < n;) + b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0) + b = parent(b, lsbit, size); + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = parent(b, lsbit, size); + do_swap(base + b, base + c, size, swap_func, priv); + } + } +} +EXPORT_SYMBOL(sort_r); + +#include + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, c, r; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } +} +EXPORT_SYMBOL_GPL(eytzinger0_sort_r); + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} +EXPORT_SYMBOL_GPL(eytzinger0_sort); diff --git a/linux/time_stats.c b/linux/time_stats.c new file mode 100644 index 0000000..0b90c80 --- /dev/null +++ b/linux/time_stats.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ + { "eon", U64_MAX }, +}; + +const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} +EXPORT_SYMBOL_GPL(pick_time_units); + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + bool initted = stats->last_event != 0; + + if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (quantiles) + quantiles_update(quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __time_stats_clear_buffer(struct time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} +EXPORT_SYMBOL_GPL(__time_stats_clear_buffer); + +static noinline void time_stats_clear_buffer(struct time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __time_stats_update(struct time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} +EXPORT_SYMBOL_GPL(__time_stats_update); + +#include + +static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name); +} + +static inline u64 time_stats_lifetime(const struct time_stats *stats) +{ + return local_clock() - stats->start_time; +} + +void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, + const char *epoch_name, unsigned int flags) +{ + struct quantiles *quantiles = time_stats_to_quantiles(stats); + s64 f_mean = 0, d_mean = 0; + u64 f_stddev = 0, d_stddev = 0; + u64 lifetime = time_stats_lifetime(stats); + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + + if (stats->freq_stats.n) { + /* avoid divide by zero */ + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } else if (flags & TIME_STATS_PRINT_NO_ZEROES) { + /* unless we didn't want zeroes anyway */ + return; + } + + seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n); + seq_buf_printf(out, "lifetime: "); + seq_buf_time_units_aligned(out, lifetime); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " since %-12s recent\n", epoch_name); + + seq_buf_printf(out, "duration of events\n"); + + seq_buf_printf(out, " min: "); + seq_buf_time_units_aligned(out, stats->min_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " max: "); + seq_buf_time_units_aligned(out, stats->max_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " total: "); + seq_buf_time_units_aligned(out, stats->total_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " mean: "); + seq_buf_time_units_aligned(out, d_mean); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " stddev: "); + seq_buf_time_units_aligned(out, d_stddev); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, "time between events\n"); + + seq_buf_printf(out, " min: "); + seq_buf_time_units_aligned(out, stats->min_freq); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " max: "); + seq_buf_time_units_aligned(out, stats->max_freq); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " mean: "); + seq_buf_time_units_aligned(out, f_mean); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " stddev: "); + seq_buf_time_units_aligned(out, f_stddev); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); + seq_buf_printf(out, "\n"); + + if (quantiles) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + pick_time_units(quantiles->entries[i].m); + u64 last_q = 0; + + seq_buf_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + seq_buf_printf(out, "\n"); + last_q = q; + } + } +} +EXPORT_SYMBOL_GPL(time_stats_to_seq_buf); + +void time_stats_to_json(struct seq_buf *out, struct time_stats *stats, + const char *epoch_name, unsigned int flags) +{ + struct quantiles *quantiles = time_stats_to_quantiles(stats); + s64 f_mean = 0, d_mean = 0; + u64 f_stddev = 0, d_stddev = 0; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + + if (stats->freq_stats.n) { + /* avoid divide by zero */ + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } else if (flags & TIME_STATS_PRINT_NO_ZEROES) { + /* unless we didn't want zeroes anyway */ + return; + } + + seq_buf_printf(out, "{\n"); + seq_buf_printf(out, " \"epoch\": \"%s\",\n", epoch_name); + seq_buf_printf(out, " \"count\": %llu,\n", stats->duration_stats.n); + + seq_buf_printf(out, " \"duration_ns\": {\n"); + seq_buf_printf(out, " \"min\": %llu,\n", stats->min_duration); + seq_buf_printf(out, " \"max\": %llu,\n", stats->max_duration); + seq_buf_printf(out, " \"total\": %llu,\n", stats->total_duration); + seq_buf_printf(out, " \"mean\": %llu,\n", d_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev); + seq_buf_printf(out, " },\n"); + + d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT); + d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT); + + seq_buf_printf(out, " \"duration_ewma_ns\": {\n"); + seq_buf_printf(out, " \"mean\": %llu,\n", d_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev); + seq_buf_printf(out, " },\n"); + + seq_buf_printf(out, " \"frequency_ns\": {\n"); + seq_buf_printf(out, " \"min\": %llu,\n", stats->min_freq); + seq_buf_printf(out, " \"max\": %llu,\n", stats->max_freq); + seq_buf_printf(out, " \"mean\": %llu,\n", f_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev); + seq_buf_printf(out, " },\n"); + + f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT); + f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT); + + seq_buf_printf(out, " \"frequency_ewma_ns\": {\n"); + seq_buf_printf(out, " \"mean\": %llu,\n", f_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev); + + if (quantiles) { + u64 last_q = 0; + + /* close frequency_ewma_ns but signal more items */ + seq_buf_printf(out, " },\n"); + + seq_buf_printf(out, " \"quantiles_ns\": [\n"); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + seq_buf_printf(out, " %llu", q); + if (!is_last) + seq_buf_printf(out, ", "); + last_q = q; + } + seq_buf_printf(out, " ]\n"); + } else { + /* close frequency_ewma_ns without dumping further */ + seq_buf_printf(out, " }\n"); + } + + seq_buf_printf(out, "}\n"); +} +EXPORT_SYMBOL_GPL(time_stats_to_json); + +void time_stats_exit(struct time_stats *stats) +{ + free_percpu(stats->buffer); +} +EXPORT_SYMBOL_GPL(time_stats_exit); + +void time_stats_init(struct time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + stats->start_time = local_clock(); + spin_lock_init(&stats->lock); +} +EXPORT_SYMBOL_GPL(time_stats_init); + +MODULE_AUTHOR("Kent Overstreet"); +MODULE_LICENSE("GPL"); diff --git a/src/wrappers/handle.rs b/src/wrappers/handle.rs index 336a029..48148a8 100644 --- a/src/wrappers/handle.rs +++ b/src/wrappers/handle.rs @@ -61,7 +61,7 @@ impl BcachefsHandle { pub fn create_subvolume>(&self, dst: P) -> Result<(), Errno> { let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string"); self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume { - dirfd: libc::AT_FDCWD, + dirfd: libc::AT_FDCWD as u32, mode: 0o777, dst_ptr: dst.as_ptr() as u64, ..Default::default() @@ -73,7 +73,7 @@ impl BcachefsHandle { pub fn delete_subvolume>(&self, dst: P) -> Result<(), Errno> { let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string"); self.ioctl(BcachefsIoctl::SubvolumeDestroy, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume { - dirfd: libc::AT_FDCWD, + dirfd: libc::AT_FDCWD as u32, mode: 0o777, dst_ptr: dst.as_ptr() as u64, ..Default::default() @@ -88,7 +88,7 @@ impl BcachefsHandle { let res = self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume { flags: BCH_SUBVOL_SNAPSHOT_CREATE | extra_flags, - dirfd: libc::AT_FDCWD, + dirfd: libc::AT_FDCWD as u32, mode: 0o777, src_ptr: src.as_ref().map_or(0, |x| x.as_ptr() as u64), //src_ptr: if let Some(src) = src { src.as_ptr() } else { std::ptr::null() } as u64, -- 2.39.2