From a2094890a90a2f865e49f94e8448deca7e5852ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Mar 2021 17:38:28 -0400 Subject: [PATCH] Update bcachefs sources to 18686af684 bcachefs: Inode backpointers --- .bcachefs_revision | 2 +- Makefile | 4 + cmd_debug.c | 4 +- include/linux/list_nulls.h | 145 ++++ include/linux/overflow.h | 346 ++++++++ include/linux/poison.h | 85 ++ include/linux/random.h | 1 + include/linux/rcupdate.h | 28 + include/linux/rhashtable-types.h | 135 +++ include/linux/rhashtable.h | 1205 ++++++++++++++++++++++----- include/linux/six.h | 1 + include/linux/slab.h | 1 + include/linux/types.h | 2 + libbcachefs/bcachefs_format.h | 31 +- libbcachefs/bkey.c | 25 +- libbcachefs/bkey.h | 78 +- libbcachefs/bkey_methods.c | 46 +- libbcachefs/bkey_sort.c | 2 +- libbcachefs/bset.c | 84 +- libbcachefs/bset.h | 22 +- libbcachefs/btree_cache.c | 12 +- libbcachefs/btree_gc.c | 28 +- libbcachefs/btree_gc.h | 10 +- libbcachefs/btree_io.c | 32 +- libbcachefs/btree_io.h | 30 +- libbcachefs/btree_iter.c | 103 ++- libbcachefs/btree_iter.h | 4 + libbcachefs/btree_key_cache.c | 185 ++-- libbcachefs/btree_key_cache.h | 8 +- libbcachefs/btree_types.h | 32 +- libbcachefs/btree_update.h | 2 + libbcachefs/btree_update_interior.c | 79 +- libbcachefs/btree_update_leaf.c | 45 +- libbcachefs/debug.c | 10 +- libbcachefs/dirent.c | 18 +- libbcachefs/dirent.h | 6 +- libbcachefs/ec.c | 1 + libbcachefs/extents.c | 7 +- libbcachefs/extents.h | 18 + libbcachefs/fs-common.c | 68 +- libbcachefs/fsck.c | 43 + libbcachefs/inode.c | 19 +- libbcachefs/inode.h | 3 +- libbcachefs/io.c | 5 + libbcachefs/journal.c | 11 +- libbcachefs/journal_io.c | 2 +- libbcachefs/journal_reclaim.c | 4 +- libbcachefs/recovery.c | 24 +- libbcachefs/tests.c | 29 +- linux/rhashtable.c | 1081 ++++++++++++++++++++---- linux/six.c | 35 +- 51 files changed, 3435 insertions(+), 766 deletions(-) create mode 100644 include/linux/list_nulls.h create mode 100644 include/linux/overflow.h create mode 100644 include/linux/poison.h create mode 100644 include/linux/rhashtable-types.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 976139a..385c19f 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -ad68801b939cdda0530f54cd07b3212e98fe1d75 +18686af68412ebfad9c2adc6ee976ffdb9e1b886 diff --git a/Makefile b/Makefile index 6999b93..3fe9604 100644 --- a/Makefile +++ b/Makefile @@ -156,6 +156,10 @@ update-bcachefs-sources: git add linux/six.c cp $(LINUX_DIR)/include/linux/six.h include/linux/ git add include/linux/six.h + cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/ + git add include/linux/list_nulls.h + cp $(LINUX_DIR)/include/linux/poison.h include/linux/ + git add include/linux/poison.h $(RM) libbcachefs/*.mod.c git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision git add .bcachefs_revision diff --git a/cmd_debug.c b/cmd_debug.c index 3baa697..4938ec0 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -323,9 +323,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) le64_to_cpu(i->journal_seq)); offset += sectors; - for (k = i->start; - k != vstruct_last(i); - k = bkey_next_skip_noops(k, vstruct_last(i))) { + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) { struct bkey u; char buf[4096]; diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h new file mode 100644 index 0000000..fa6e847 --- /dev/null +++ b/include/linux/list_nulls.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIST_NULLS_H +#define _LINUX_LIST_NULLS_H + +#include 
+#include + +/* + * Special version of lists, where end of list is not a NULL pointer, + * but a 'nulls' marker, which can have many different values. + * (up to 2^31 different values guaranteed on all platforms) + * + * In the standard hlist, termination of a list is the NULL pointer. + * In this special 'nulls' variant, we use the fact that objects stored in + * a list are aligned on a word (4 or 8 bytes alignment). + * We therefore use the last significant bit of 'ptr' : + * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) + * Set to 0 : This is a pointer to some object (ptr) + */ + +struct hlist_nulls_head { + struct hlist_nulls_node *first; +}; + +struct hlist_nulls_node { + struct hlist_nulls_node *next, **pprev; +}; +#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) +#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ + ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) + +#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_nulls_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ + }) +/** + * ptr_is_a_nulls - Test if a ptr is a nulls + * @ptr: ptr to be tested + * + */ +static inline int is_a_nulls(const struct hlist_nulls_node *ptr) +{ + return ((unsigned long)ptr & 1); +} + +/** + * get_nulls_value - Get the 'nulls' value of the end of chain + * @ptr: end of chain + * + * Should be called only if is_a_nulls(ptr); + */ +static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) +{ + return ((unsigned long)ptr) >> 1; +} + +/** + * hlist_nulls_unhashed - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. + */ +static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) +{ + return !h->pprev; +} + +/** + * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this + * function may be used locklessly. + */ +static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) +{ + return !READ_ONCE(h->pprev); +} + +static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) +{ + return is_a_nulls(READ_ONCE(h->first)); +} + +static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *first = h->first; + + n->next = first; + WRITE_ONCE(n->pprev, &h->first); + h->first = n; + if (!is_a_nulls(first)) + WRITE_ONCE(first->pprev, &n->next); +} + +static inline void __hlist_nulls_del(struct hlist_nulls_node *n) +{ + struct hlist_nulls_node *next = n->next; + struct hlist_nulls_node **pprev = n->pprev; + + WRITE_ONCE(*pprev, next); + if (!is_a_nulls(next)) + WRITE_ONCE(next->pprev, pprev); +} + +static inline void hlist_nulls_del(struct hlist_nulls_node *n) +{ + __hlist_nulls_del(n); + WRITE_ONCE(n->pprev, LIST_POISON2); +} + +/** + * hlist_nulls_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. 
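A minimal usage sketch of the nulls-list primitives above, assuming a made-up struct cache_obj, an arbitrary nulls value of 42, and the tree's BUG_ON() shim:

/* Hypothetical object embedding an hlist_nulls_node. */
struct cache_obj {
	int			id;
	struct hlist_nulls_node	node;
};

static int nulls_list_demo(struct cache_obj *obj)
{
	struct hlist_nulls_head head;
	struct hlist_nulls_node *pos;
	int sum = 0;

	/* An empty list's ->first is the encoded marker (42 << 1) | 1, not NULL: */
	INIT_HLIST_NULLS_HEAD(&head, 42);
	BUG_ON(!hlist_nulls_empty(&head));

	hlist_nulls_add_head(&obj->node, &head);

	/* Walk until we hit a pointer with the low bit set: */
	for (pos = head.first; !is_a_nulls(pos); pos = pos->next)
		sum += hlist_nulls_entry(pos, struct cache_obj, node)->id;

	/* The terminating marker still carries the value the head was initialized with: */
	BUG_ON(get_nulls_value(pos) != 42);

	hlist_nulls_del(&obj->node);
	return sum;
}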
+ * @member: the name of the hlist_node within the struct. + * + */ +#define hlist_nulls_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + * + */ +#define hlist_nulls_for_each_entry_from(tpos, pos, member) \ + for (; (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +#endif diff --git a/include/linux/overflow.h b/include/linux/overflow.h new file mode 100644 index 0000000..ef74051 --- /dev/null +++ b/include/linux/overflow.h @@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#include +#include + +/* + * In the fallback code below, we need to compute the minimum and + * maximum values representable in a given type. These macros may also + * be useful elsewhere, so we provide them outside the + * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. + * + * It would seem more obvious to do something like + * + * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) + * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) + * + * Unfortunately, the middle expressions, strictly speaking, have + * undefined behaviour, and at least some versions of gcc warn about + * the type_max expression (but not if -fsanitize=undefined is in + * effect; in that case, the warning is deferred to runtime...). + * + * The slightly excessive casting in type_min is to make sure the + * macros also produce sensible values for the exotic type _Bool. [The + * overflow checkers only almost work for _Bool, but that's + * a-feature-not-a-bug, since people shouldn't be doing arithmetic on + * _Bools. Besides, the gcc builtins don't allow _Bool* as third + * argument.] + * + * Idea stolen from + * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - + * credit to Christian Biere. + */ +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + +/* + * Avoids triggering -Wtype-limits compilation warning, + * while using unsigned data types to check a < 0. + */ +#define is_non_negative(a) ((a) > 0 || (a) == 0) +#define is_negative(a) (!(is_non_negative(a))) + +/* + * Allows for effectively applying __must_check to a macro so we can have + * both the type-agnostic benefits of the macros while also being able to + * enforce that the return value is, in fact, checked. + */ +static inline bool __must_check __must_check_overflow(bool overflow) +{ + return unlikely(overflow); +} + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +/* + * For simplicity and code hygiene, the fallback code below insists on + * a, b and *d having the same type (similar to the min() and max() + * macros), whereas gcc's type-generic overflow checkers accept + * different types. Hence we don't just make check_add_overflow an + * alias for __builtin_add_overflow, but add type checks similar to + * below. 
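A brief sketch of what is_signed_type()/type_min()/type_max() evaluate to, assuming the tree's BUILD_BUG_ON() shim and the usual s8/u8/u64 typedefs; the checks are purely illustrative:

static void type_limits_demo(void)
{
	BUILD_BUG_ON(!is_signed_type(s8));
	BUILD_BUG_ON(is_signed_type(u8));

	/* These macros operate on the type itself, not a value: */
	BUILD_BUG_ON(type_max(u8)  != 255);
	BUILD_BUG_ON(type_min(u8)  != 0);
	BUILD_BUG_ON(type_max(s8)  != 127);
	BUILD_BUG_ON(type_min(s8)  != -128);
	BUILD_BUG_ON(type_max(u64) != ~0ULL);
}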
+ */ +#define check_add_overflow(a, b, d) __must_check_overflow(({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_add_overflow(__a, __b, __d); \ +})) + +#define check_sub_overflow(a, b, d) __must_check_overflow(({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_sub_overflow(__a, __b, __d); \ +})) + +#define check_mul_overflow(a, b, d) __must_check_overflow(({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +})) + +#else + + +/* Checking for unsigned overflow is relatively easy without causing UB. */ +#define __unsigned_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a + __b; \ + *__d < __a; \ +}) +#define __unsigned_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a - __b; \ + __a < __b; \ +}) +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * For signed types, detecting overflow is much harder, especially if + * we want to avoid UB. But the interface of these macros is such that + * we must provide a result in *d, and in fact we must produce the + * result promised by gcc's builtins, which is simply the possibly + * wrapped-around value. Fortunately, we can just formally do the + * operations in the widest relevant unsigned type (u64) and then + * truncate the result - gcc is smart enough to generate the same code + * with and without the (u64) casts. + */ + +/* + * Adding two signed integers can overflow only if they have the same + * sign, and overflow has happened iff the result has the opposite + * sign. + */ +#define __signed_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a + (u64)__b; \ + (((~(__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Subtraction is similar, except that overflow can now happen only + * when the signs are opposite. In this case, overflow has happened if + * the result has the opposite sign of a. + */ +#define __signed_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a - (u64)__b; \ + ((((__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. 
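A small sketch of the calling convention shared by all three checkers: the (possibly wrapped) result is always stored through the third argument, and only the return value says whether it overflowed. U32_MAX and EOVERFLOW are assumed to be available from the tree's headers.

static int saturating_add_u32(u32 a, u32 b, u32 *res)
{
	/* *res receives the wrapped sum either way; the return value of
	 * check_add_overflow() is what must be checked. */
	if (check_add_overflow(a, b, res)) {
		*res = U32_MAX;		/* saturate instead of wrapping */
		return -EOVERFLOW;
	}
	return 0;
}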
This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. + */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a * (u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + + +#define check_add_overflow(a, b, d) __must_check_overflow( \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_add_overflow(a, b, d), \ + __unsigned_add_overflow(a, b, d))) + +#define check_sub_overflow(a, b, d) __must_check_overflow( \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_sub_overflow(a, b, d), \ + __unsigned_sub_overflow(a, b, d))) + +#define check_mul_overflow(a, b, d) __must_check_overflow( \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d))) + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +/** check_shl_overflow() - Calculate a left-shifted value and check overflow + * + * @a: Value to be shifted + * @s: How many bits left to shift + * @d: Pointer to where to store the result + * + * Computes *@d = (@a << @s) + * + * Returns true if '*d' cannot hold the result or when 'a << s' doesn't + * make sense. Example conditions: + * - 'a << s' causes bits to be lost when stored in *d. + * - 's' is garbage (e.g. negative) or so large that the result of + * 'a << s' is guaranteed to be 0. + * - 'a' is negative. + * - 'a << s' sets the sign bit, if any, in '*d'. + * + * '*d' will hold the results of the attempted shift, but is not + * considered "safe for use" if false is returned. + */ +#define check_shl_overflow(a, s, d) __must_check_overflow(({ \ + typeof(a) _a = a; \ + typeof(s) _s = s; \ + typeof(d) _d = d; \ + u64 _a_full = _a; \ + unsigned int _to_shift = \ + is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ + *_d = (_a_full << _to_shift); \ + (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ + (*_d >> _to_shift) != _a); \ +})) + +/** + * array_size() - Calculate size of 2-dimensional array. + * + * @a: dimension one + * @b: dimension two + * + * Calculates size of 2-dimensional array: @a * @b. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array_size(size_t a, size_t b) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * array3_size() - Calculate size of 3-dimensional array. + * + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. 
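A sketch of how the shift and array-sizing helpers are typically used; blocks_to_bytes() and checked_buf_bytes() are invented names, and SIZE_MAX/EOVERFLOW are assumed available here:

/* Size a buffer of nmemb elements of 'esize' bytes without wraparound;
 * returns 0 on overflow so the caller can refuse the allocation. */
static size_t checked_buf_bytes(size_t nmemb, size_t esize)
{
	size_t bytes = array_size(nmemb, esize);

	return bytes == SIZE_MAX ? 0 : bytes;
}

/* check_shl_overflow() example: convert a block count to bytes given a
 * block-size shift, refusing values that do not fit in u64. */
static int blocks_to_bytes(u64 nr_blocks, unsigned int shift, u64 *bytes)
{
	return check_shl_overflow(nr_blocks, shift, bytes) ? -EOVERFLOW : 0;
}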
+ */ +static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_mul_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/* + * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for + * struct_size() below. + */ +static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_add_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * struct_size() - Calculate size of structure with trailing array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @count: Number of elements in the array. + * + * Calculates size of memory needed for structure @p followed by an + * array of @count number of @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, count) \ + __ab_c_size(count, \ + sizeof(*(p)->member) + __must_be_array((p)->member),\ + sizeof(*(p))) + +/** + * flex_array_size() - Calculate size of a flexible array member + * within an enclosing structure. + * + * @p: Pointer to the structure. + * @member: Name of the flexible array member. + * @count: Number of elements in the array. + * + * Calculates size of a flexible array of @count number of @member + * elements, at the end of structure @p. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define flex_array_size(p, member, count) \ + array_size(count, \ + sizeof(*(p)->member) + __must_be_array((p)->member)) + +#endif /* __LINUX_OVERFLOW_H */ diff --git a/include/linux/poison.h b/include/linux/poison.h new file mode 100644 index 0000000..dc8ae5d --- /dev/null +++ b/include/linux/poison.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_POISON_H +#define _LINUX_POISON_H + +/********** include/linux/list.h **********/ + +/* + * Architectures might want to move the poison pointer offset + * into some well-recognized area such as 0xdead000000000000, + * that is also not mappable by user-space exploits: + */ +#ifdef CONFIG_ILLEGAL_POINTER_VALUE +# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL) +#else +# define POISON_POINTER_DELTA 0 +#endif + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA) + +/********** include/linux/timer.h **********/ +#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) + +/********** mm/page_poison.c **********/ +#ifdef CONFIG_PAGE_POISONING_ZERO +#define PAGE_POISON 0x00 +#else +#define PAGE_POISON 0xaa +#endif + +/********** mm/page_alloc.c ************/ + +#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA) + +/********** mm/slab.c **********/ +/* + * Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. 
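A sketch of the intended pairing of struct_size() with a trailing flexible array member; struct blob is hypothetical, plain malloc() is used since this is the userspace shim tree, and __must_be_array() is assumed to be provided as the header expects:

#include <stdlib.h>

/* Hypothetical structure with a trailing flexible array member. */
struct blob {
	u32	nr;
	u8	data[];
};

static struct blob *blob_alloc(u32 nr)
{
	/* struct_size() == sizeof(struct blob) + nr * sizeof(u8), or SIZE_MAX
	 * on overflow - an allocation malloc() will refuse. */
	struct blob *b = malloc(struct_size(b, data, nr));

	if (b)
		b->nr = nr;
	return b;
}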
+ */ +#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ +#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ + +#define SLUB_RED_INACTIVE 0xbb +#define SLUB_RED_ACTIVE 0xcc + +/* ...and for poisoning */ +#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ +#define POISON_FREE 0x6b /* for use-after-free poisoning */ +#define POISON_END 0xa5 /* end-byte of poisoning */ + +/********** arch/$ARCH/mm/init.c **********/ +#define POISON_FREE_INITMEM 0xcc + +/********** arch/ia64/hp/common/sba_iommu.c **********/ +/* + * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a + * value of "SBAIOMMU POISON\0" for spill-over poisoning. + */ + +/********** fs/jbd/journal.c **********/ +#define JBD_POISON_FREE 0x5b +#define JBD2_POISON_FREE 0x5c + +/********** drivers/base/dmapool.c **********/ +#define POOL_POISON_FREED 0xa7 /* !inuse */ +#define POOL_POISON_ALLOCATED 0xa9 /* !initted */ + +/********** drivers/atm/ **********/ +#define ATM_POISON_FREE 0x12 +#define ATM_POISON 0xdeadbeef + +/********** kernel/mutexes **********/ +#define MUTEX_DEBUG_INIT 0x11 +#define MUTEX_DEBUG_FREE 0x22 +#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA) + +/********** security/ **********/ +#define KEY_DESTROY 0xbd + +#endif diff --git a/include/linux/random.h b/include/linux/random.h index c38ae46..28c595a 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -45,6 +45,7 @@ static inline type get_random_##type(void) \ get_random_type(int); get_random_type(long); +get_random_type(u32); get_random_type(u64); #endif /* _LINUX_RANDOM_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c99d78a..ae29224 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -13,4 +13,32 @@ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) +/* Has the specified rcu_head structure been handed to call_rcu()? */ + +/** + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() + * @rhp: The rcu_head structure to initialize. + * + * If you intend to invoke rcu_head_after_call_rcu() to test whether a + * given rcu_head structure has already been passed to call_rcu(), then + * you must also invoke this rcu_head_init() function on it just after + * allocating that structure. Calls to this function must not race with + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. + */ +static inline void rcu_head_init(struct rcu_head *rhp) +{ + rhp->func = (void *)~0L; +} + +static inline bool +rcu_head_after_call_rcu(struct rcu_head *rhp, + void (*f)(struct rcu_head *head)) +{ + void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func); + + if (func == f) + return true; + return false; +} + #endif /* __TOOLS_LINUX_RCUPDATE_H */ diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h new file mode 100644 index 0000000..57467cb --- /dev/null +++ b/include/linux/rhashtable-types.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Simple structures that might be needed in include + * files. 
+ */ + +#ifndef _LINUX_RHASHTABLE_TYPES_H +#define _LINUX_RHASHTABLE_TYPES_H + +#include +#include +#include +#include + +struct rhash_head { + struct rhash_head __rcu *next; +}; + +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct bucket_table; + +/** + * struct rhashtable_compare_arg - Key for the function rhashtable_compare + * @ht: Hash table + * @key: Key to compare against + */ +struct rhashtable_compare_arg { + struct rhashtable *ht; + const void *key; +}; + +typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); +typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); +typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, + const void *obj); + +/** + * struct rhashtable_params - Hash table construction parameters + * @nelem_hint: Hint on number of elements, should be 75% of desired size + * @key_len: Length of key + * @key_offset: Offset of key in struct to be hashed + * @head_offset: Offset of rhash_head in struct to be hashed + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking + * @automatic_shrinking: Enable automatic shrinking of tables + * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) + * @obj_hashfn: Function to hash object + * @obj_cmpfn: Function to compare key with object + */ +struct rhashtable_params { + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; + unsigned int max_size; + u16 min_size; + bool automatic_shrinking; + rht_hashfn_t hashfn; + rht_obj_hashfn_t obj_hashfn; + rht_obj_cmpfn_t obj_cmpfn; +}; + +/** + * struct rhashtable - Hash table handle + * @tbl: Bucket table + * @key_len: Key length for hashfn + * @max_elems: Maximum number of elements in table + * @p: Configuration parameters + * @rhlist: True if this is an rhltable + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table + */ +struct rhashtable { + struct bucket_table __rcu *tbl; + unsigned int key_len; + unsigned int max_elems; + struct rhashtable_params p; + bool rhlist; + struct work_struct run_work; + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; +}; + +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @tbl: The table that we were walking over + */ +struct rhashtable_walker { + struct list_head list; + struct bucket_table *tbl; +}; + +/** + * struct rhashtable_iter - Hash table iterator + * @ht: Table to iterate through + * @p: Current pointer + * @list: Current hash list pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhlist_head *list; + struct rhashtable_walker walker; + unsigned int slot; + unsigned int skip; + bool end_of_table; +}; + +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); + +#endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8dbe153..6cf8c25 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,7 +1,8 @@ 
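A sketch of how a user of this API fills in rhashtable_params and initializes a table; struct cached_inode and its fields are invented for the example:

/* Hypothetical hashed object: keyed by a u64 inode number. */
struct cached_inode {
	u64			inum;
	struct rhash_head	hash;
};

static const struct rhashtable_params cached_inode_params = {
	.head_offset		= offsetof(struct cached_inode, hash),
	.key_offset		= offsetof(struct cached_inode, inum),
	.key_len		= sizeof(u64),
	.automatic_shrinking	= true,
};

static int cached_inode_table_init(struct rhashtable *ht)
{
	/* hashfn/obj_hashfn left NULL: jhash2() is the default since key_len % 4 == 0 */
	return rhashtable_init(ht, &cached_inode_params);
}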
+/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * - * Copyright (c) 2015 Herbert Xu + * Copyright (c) 2015-2016 Herbert Xu * Copyright (c) 2014-2015 Thomas Graf * Copyright (c) 2008-2014 Patrick McHardy * @@ -17,92 +18,93 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include -#include -#include #include #include #include -#include -#include -#include +#include #include +#include +#include +#include -#define RHT_BASE_BITS 4 -#define RHT_HASH_BITS 27 -#define RHT_BASE_SHIFT RHT_HASH_BITS -#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) +#define BIT(nr) (1UL << (nr)) -struct rhash_head { - struct rhash_head __rcu *next; -}; +#include +/* + * Objects in an rhashtable have an embedded struct rhash_head + * which is linked into as hash chain from the hash table - or one + * of two or more hash tables when the rhashtable is being resized. + * The end of the chain is marked with a special nulls marks which has + * the least significant bit set but otherwise stores the address of + * the hash bucket. This allows us to be sure we've found the end + * of the right list. + * The value stored in the hash bucket has BIT(0) used as a lock bit. + * This bit must be atomically set before any changes are made to + * the chain. To avoid dereferencing this pointer without clearing + * the bit first, we use an opaque 'struct rhash_lock_head *' for the + * pointer stored in the bucket. This struct needs to be defined so + * that rcu_dereference() works on it, but it has no content so a + * cast is needed for it to be useful. This ensures it isn't + * used by mistake with clearing the lock bit first. + */ +struct rhash_lock_head {}; +/* Maximum chain length before rehash + * + * The maximum (not average) chain length grows with the size of the hash + * table, at a rate of (log N)/(log log N). + * + * The value of 16 is selected so that even if the hash table grew to + * 2^32 you would not expect the maximum chain length to exceed it + * unless we are under attack (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need to set it to a + * lower value as you'd need the chain length to vastly exceed 16 to have + * any real effect on the system. + */ +#define RHT_ELASTICITY 16u + +/** + * struct bucket_table - Table of hash buckets + * @size: Number of hash buckets + * @nest: Number of bits of first-level nested table. + * @rehash: Current bucket being rehashed + * @hash_rnd: Random seed to fold into hash + * @walkers: List of active walkers + * @rcu: RCU structure for freeing the table + * @future_tbl: Table under construction during rehashing + * @ntbl: Nested table used when out of memory. 
+ * @buckets: size * hash buckets + */ struct bucket_table { unsigned int size; - unsigned int rehash; + unsigned int nest; u32 hash_rnd; - unsigned int locks_mask; - spinlock_t *locks; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; -}; - -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; + struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); -typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, - const void *obj); - -struct rhashtable_params { - size_t nelem_hint; - size_t key_len; - size_t key_offset; - size_t head_offset; - unsigned int insecure_max_entries; - unsigned int max_size; - unsigned int min_size; - u32 nulls_base; - bool insecure_elasticity; - bool automatic_shrinking; - size_t locks_mul; - rht_hashfn_t hashfn; - rht_obj_hashfn_t obj_hashfn; - rht_obj_cmpfn_t obj_cmpfn; -}; - -struct rhashtable { - struct bucket_table __rcu *tbl; - atomic_t nelems; - unsigned int key_len; - unsigned int elasticity; - struct rhashtable_params p; - struct work_struct run_work; - struct mutex mutex; - spinlock_t lock; -}; - -struct rhashtable_walker { - struct list_head list; - struct bucket_table *tbl; -}; - -#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) - -static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) -{ - return NULLS_MARKER(ht->p.nulls_base + hash); -} - -#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ - ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) +/* + * NULLS_MARKER() expects a hash value with the low + * bits mostly likely to be significant, and it discards + * the msb. + * We give it an address, in which the bottom bit is + * always 0, and the msb might be significant. + * So we shift the address down one bit to align with + * expectations and avoid losing a significant bit. + * + * We never store the NULLS_MARKER in the hash table + * itself as we need the lsb for locking. + * Instead we store a NULL + */ +#define RHT_NULLS_MARKER(ptr) \ + ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) +#define INIT_RHT_NULLS_HEAD(ptr) \ + ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { @@ -118,37 +120,45 @@ static inline void *rht_obj(const struct rhashtable *ht, static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { - return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1); + return hash & (tbl->size - 1); } -static inline unsigned int rht_key_hashfn( - struct rhashtable *ht, const struct bucket_table *tbl, - const void *key, const struct rhashtable_params params) +static inline unsigned int rht_key_get_hash(struct rhashtable *ht, + const void *key, const struct rhashtable_params params, + unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. 
*/ if (!__builtin_constant_p(params.key_len)) - hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd); + hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) - hash = params.hashfn(key, key_len, tbl->hash_rnd); + hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) - hash = jhash(key, key_len, tbl->hash_rnd); + hash = jhash(key, key_len, hash_rnd); else - hash = jhash2(key, key_len / sizeof(u32), - tbl->hash_rnd); + hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) - hash = params.hashfn(key, key_len, tbl->hash_rnd); + hash = params.hashfn(key, key_len, hash_rnd); else - hash = jhash(key, key_len, tbl->hash_rnd); + hash = jhash(key, key_len, hash_rnd); } + return hash; +} + +static inline unsigned int rht_key_hashfn( + struct rhashtable *ht, const struct bucket_table *tbl, + const void *key, const struct rhashtable_params params) +{ + unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); + return rht_bucket_index(tbl, hash); } @@ -165,6 +175,11 @@ static inline unsigned int rht_head_hashfn( rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } +/** + * rht_grow_above_75 - returns true if nelems > 0.75 * table-size + * @ht: hash table + * @tbl: current table + */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { @@ -173,6 +188,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht, (!ht->p.max_size || tbl->size < ht->p.max_size); } +/** + * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size + * @ht: hash table + * @tbl: current table + */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { @@ -181,6 +201,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht, tbl->size > ht->p.min_size; } +/** + * rht_grow_above_100 - returns true if nelems > table-size + * @ht: hash table + * @tbl: current table + */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { @@ -188,62 +213,353 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht, (!ht->p.max_size || tbl->size < ht->p.max_size); } +/** + * rht_grow_above_max - returns true if table is above maximum + * @ht: hash table + * @tbl: current table + */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { - return ht->p.insecure_max_entries && - atomic_read(&ht->nelems) >= ht->p.insecure_max_entries; + return atomic_read(&ht->nelems) >= ht->max_elems; } -static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl, - unsigned int hash) +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rht_mutex_is_held(struct rhashtable *ht); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); +#else +static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { - return &tbl->locks[hash & tbl->locks_mask]; + return 1; } -int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *, - const void *, - struct rhash_head *, - struct bucket_table *); +static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, + u32 hash) +{ + return 1; +} +#endif /* CONFIG_PROVE_LOCKING */ + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj); -int 
rhashtable_init(struct rhashtable *, const struct rhashtable_params *); -void rhashtable_destroy(struct rhashtable *); +void rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter); +void rhashtable_walk_exit(struct rhashtable_iter *iter); +int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); -#define rht_dereference(p, ht) rcu_dereference(p) -#define rht_dereference_rcu(p, ht) rcu_dereference(p) -#define rht_dereference_bucket(p, tbl, hash) rcu_dereference(p) -#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p) +static inline void rhashtable_walk_start(struct rhashtable_iter *iter) +{ + (void)rhashtable_walk_start_check(iter); +} + +void *rhashtable_walk_next(struct rhashtable_iter *iter); +void *rhashtable_walk_peek(struct rhashtable_iter *iter); +void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); + +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg); +void rhashtable_destroy(struct rhashtable *ht); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); + +#define rht_dereference(p, ht) \ + rcu_dereference(p) + +#define rht_dereference_rcu(p, ht) \ + rcu_dereference(p) + +#define rht_dereference_bucket(p, tbl, hash) \ + rcu_dereference(p) + +#define rht_dereference_bucket_rcu(p, tbl, hash) \ + rcu_dereference(p) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) -#define rht_for_each_continue(pos, head, tbl, hash) \ - for (pos = rht_dereference_bucket(head, tbl, hash); \ - !rht_is_a_nulls(pos); \ +static inline struct rhash_lock_head __rcu *const *rht_bucket( + const struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_lock_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_lock_head __rcu **rht_bucket_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : + &tbl->buckets[hash]; +} + +/* + * We lock a bucket by setting BIT(0) in the pointer - this is always + * zero in real pointers. The NULLS mark is never stored in the bucket, + * rather we store NULL if the bucket is empty. + * bit_spin_locks do not handle contention well, but the whole point + * of the hashtable design is to achieve minimum per-bucket contention. + * A nested hash table might not have a bucket pointer. In that case + * we cannot get a lock. For remove and replace the bucket cannot be + * interesting and doesn't need locking. + * For insert we allocate the bucket if this is the last bucket_table, + * and then take the lock. + * Sometimes we unlock a bucket by writing a new pointer there. In that + * case we don't need to unlock, but we do need to reset state such as + * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() + * provides the same release semantics that bit_spin_unlock() provides, + * this is safe. 
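A sketch of the walk API declared above, following the usual enter/start/next/stop/exit sequence; cached_inode is the same invented type as in the earlier example, and ERR_PTR/IS_ERR come from the tree's err.h shim:

/* Count the table's entries with the walker API; safe against concurrent
 * resizes, though entries may be seen twice or missed while a resize runs. */
static int count_cached_inodes(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct cached_inode *c;
	int nr = 0;

	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	while ((c = rhashtable_walk_next(&iter))) {
		if (c == ERR_PTR(-EAGAIN))
			continue;	/* a resize interfered; keep walking */
		if (IS_ERR(c))
			break;
		nr++;
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
	return nr;
}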
+ * When we write to a bucket without unlocking, we use rht_assign_locked(). + */ + +static inline void rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) +{ + bit_spin_lock(0, (unsigned long *)bkt); +} + +static inline void rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) +{ + bit_spin_lock(0, (unsigned long *)bucket); +} + +static inline void rht_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) +{ + bit_spin_unlock(0, (unsigned long *)bkt); +} + +static inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) +{ + return (struct rhash_head *) + ((unsigned long)p & ~BIT(0) ?: + (unsigned long)RHT_NULLS_MARKER(bkt)); +} + +/* + * Where 'bkt' is a bucket and might be locked: + * rht_ptr_rcu() dereferences that pointer and clears the lock bit. + * rht_ptr() dereferences in a context where the bucket is locked. + * rht_ptr_exclusive() dereferences in a context where exclusive + * access is guaranteed, such as when destroying the table. + */ +static inline struct rhash_head *rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt) +{ + return __rht_ptr(rcu_dereference(*bkt), bkt); +} + +static inline struct rhash_head *rht_ptr( + struct rhash_lock_head __rcu *const *bkt, + struct bucket_table *tbl, + unsigned int hash) +{ + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); +} + +static inline struct rhash_head *rht_ptr_exclusive( + struct rhash_lock_head __rcu *const *bkt) +{ + return __rht_ptr(rcu_dereference(*bkt), bkt); +} + +static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) +{ + if (rht_is_a_nulls(obj)) + obj = NULL; + rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); +} + +static inline void rht_assign_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) +{ + if (rht_is_a_nulls(obj)) + obj = NULL; + rcu_assign_pointer(*bkt, (void *)obj); + preempt_enable(); + __release(bitlock); +} + +/** + * rht_for_each_from - iterate over hash chain from given head + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each_from(pos, head, tbl, hash) \ + for (pos = head; \ + !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) +/** + * rht_for_each - iterate over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash) + +/** + * rht_for_each_entry_from - iterate over hash chain from given head + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. 
+ */ +#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ + for (pos = head; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) -#define rht_for_each_rcu_continue(pos, head, tbl, hash) \ +/** + * rht_for_each_entry - iterate over hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + */ +#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_from(tpos, pos, \ + rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash, member) + +/** + * rht_for_each_entry_safe - safely iterate over hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @next: the &struct rhash_head to use as next in loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive allows for the looped code to + * remove the loop cursor from the list. + */ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL) + +/** + * rht_for_each_rcu_from - iterate over rcu hash chain from given head + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ - pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) -#define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) +/** + * rht_for_each_rcu - iterate over rcu hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu(pos, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ + !rht_is_a_nulls(pos); \ + pos = rcu_dereference_raw(pos->next)) -#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ +/** + * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. 
+ * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ - pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ - tbl, hash, member) +/** + * rht_for_each_entry_rcu - iterate over rcu hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_from(tpos, pos, \ + rht_ptr_rcu(rht_bucket(tbl, hash)), \ + tbl, hash, member) + +/** + * rhl_for_each_rcu - iterate over rcu hash table list + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_rcu(pos, list) \ + for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + +/** + * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * @member: name of the &struct rlist_head within the hashable struct. + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) @@ -254,7 +570,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } -static inline void *rhashtable_lookup_fast( +/* Internal function, do not use. */ +static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -262,23 +579,27 @@ static inline void *rhashtable_lookup_fast( .ht = ht, .key = key, }; - const struct bucket_table *tbl; + struct rhash_lock_head __rcu *const *bkt; + struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; - rcu_read_lock(); - tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); - rht_for_each_rcu(he, tbl, hash) { - if (params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, he)) : - rhashtable_compare(&arg, rht_obj(ht, he))) - continue; - rcu_read_unlock(); - return rht_obj(ht, he); - } + bkt = rht_bucket(tbl, hash); + do { + rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { + if (params.obj_cmpfn ? 
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he; + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); @@ -286,149 +607,593 @@ restart: tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; - rcu_read_unlock(); return NULL; } -static inline int __rhashtable_insert_fast( - struct rhashtable *ht, const void *key, struct rhash_head *obj, +/** + * rhashtable_lookup - search hash table + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * This must only be called under the RCU read lock. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params); + + return he ? rht_obj(ht, he) : NULL; +} + +/** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * Only use this function when you have other mechanisms guaranteeing + * that the object won't go away after the RCU read lock is released. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup_fast( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + void *obj; + + rcu_read_lock(); + obj = rhashtable_lookup(ht, key, params); + rcu_read_unlock(); + + return obj; +} + +/** + * rhltable_lookup - search hash list table + * @hlt: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. All matching entries are returned + * in a list. + * + * This must only be called under the RCU read lock. + * + * Returns the list of entries that match the given key. + */ +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; +} + +/* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes in there is a clash, + * otherwise it returns an error via ERR_PTR(). 
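A lookup sketch for the same invented cached_inode table; rhashtable_lookup() requires the caller to hold the RCU read lock, while rhashtable_lookup_fast() takes it internally:

static struct cached_inode *cached_inode_find(struct rhashtable *ht, u64 inum)
{
	/* The key pointer must reference key_len bytes laid out as at key_offset: */
	return rhashtable_lookup_fast(ht, &inum, cached_inode_params);
}

static bool cached_inode_exists(struct rhashtable *ht, u64 inum)
{
	bool ret;

	rcu_read_lock();
	ret = rhashtable_lookup(ht, &inum, cached_inode_params) != NULL;
	rcu_read_unlock();

	return ret;
}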
+ */ +static inline void *__rhashtable_insert_fast( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; - struct bucket_table *tbl, *new_tbl; + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct bucket_table *tbl; struct rhash_head *head; - spinlock_t *lock; - unsigned int elasticity; unsigned int hash; - int err; + int elasticity; + void *data; -restart: rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); + hash = rht_head_hashfn(ht, tbl, obj, params); + elasticity = RHT_ELASTICITY; + bkt = rht_bucket_insert(ht, tbl, hash); + data = ERR_PTR(-ENOMEM); + if (!bkt) + goto out; + pprev = NULL; + rht_lock(tbl, bkt); - /* All insertions must grab the oldest table containing - * the hashed bucket that is yet to be rehashed. - */ - for (;;) { - hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); - spin_lock_bh(lock); + if (unlikely(rcu_access_pointer(tbl->future_tbl))) { +slow_path: + rht_unlock(tbl, bkt); + rcu_read_unlock(); + return rhashtable_insert_slow(ht, key, obj); + } - if (tbl->rehash <= hash) - break; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *plist; + struct rhlist_head *list; - spin_unlock_bh(lock); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); - } + elasticity--; + if (!key || + (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); - if (!IS_ERR_OR_NULL(tbl)) - goto slow_path; + data = rht_obj(ht, head); - err = PTR_ERR(tbl); - goto out; - } + if (!rhlist) + goto out_unlock; - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto out; - if (unlikely(rht_grow_above_100(ht, tbl))) { -slow_path: - spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht, tbl); - rcu_read_unlock(); - if (err) - return err; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - goto restart; + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(tbl, bkt); + } else + rht_assign_unlock(tbl, bkt, obj); + data = NULL; + goto out; } - err = -EEXIST; - elasticity = ht->elasticity; - rht_for_each(head, tbl, hash) { - if (key && - unlikely(!(params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) - goto out; - if (!--elasticity) - goto slow_path; - } + if (elasticity <= 0) + goto slow_path; + + data = ERR_PTR(-E2BIG); + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out_unlock; - err = 0; + if (unlikely(rht_grow_above_100(ht, tbl))) + goto slow_path; - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + /* Inserting at head of list makes unlocking free. 
*/ + head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (rhlist) { + struct rhlist_head *list; - rcu_assign_pointer(tbl->buckets[hash], obj); + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } atomic_inc(&ht->nelems); + rht_assign_unlock(tbl, bkt, obj); + if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); + data = NULL; out: - spin_unlock_bh(lock); rcu_read_unlock(); - return err; + return data; + +out_unlock: + rht_unlock(tbl, bkt); + goto out; } +/** + * rhashtable_insert_fast - insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhashtable_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + void *ret; + + ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhltable_insert_key - insert object into hash list table + * @hlt: hash list table + * @key: the pointer to the key + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params, true)); +} + +/** + * rhltable_insert - insert object into hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhltable_insert( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + const char *key = rht_obj(&hlt->ht, &list->rhead); + + key += params.key_offset; + + return rhltable_insert_key(hlt, key, list, params); +} + +/** + * rhashtable_lookup_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * This lookup function may only be used for fixed key hash table (key_len + * parameter set). It will BUG() if used inappropriately. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. 
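An insertion sketch for the same invented table, using the plain insert helper shown above:

static int cached_inode_add(struct rhashtable *ht, struct cached_inode *c)
{
	/* Duplicate keys are allowed by rhashtable_insert_fast(); use
	 * rhashtable_lookup_insert_fast() instead to reject them. */
	return rhashtable_insert_fast(ht, &c->hash, cached_inode_params);
}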
+ */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); + void *ret; BUG_ON(ht->p.obj_hashfn); - return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, - params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; } -static inline int __rhashtable_remove_fast( +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key = rht_obj(ht, obj); + + BUG_ON(ht->p.obj_hashfn); + + return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); +} + +/** + * rhashtable_lookup_insert_key - search and insert object to hash table + * with explicit key + * @ht: hash table + * @key: key + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Lookups may occur in parallel with hashtable mutations and resizing. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + * + * Returns zero on success. + */ +static inline int rhashtable_lookup_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) +{ + void *ret; + + BUG_ON(!ht->p.obj_hashfn || !key); + + ret = __rhashtable_insert_fast(ht, key, obj, params, false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhashtable_lookup_get_insert_key - lookup and insert object into hash table + * @ht: hash table + * @key: key + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_key(), but this function returns the + * object if it exists, NULL if it does not and the insertion was successful, + * and an ERR_PTR otherwise. 
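The *_get_insert_* variants documented above allow a one-shot "find or create" without a separate lookup: the return value is the pre-existing object, NULL if the new object was inserted, or an ERR_PTR() on failure. A sketch using rhashtable_lookup_get_insert_fast(), reusing the hypothetical struct test_obj / test_params setup from the earlier example:

/* Returns whichever object is now hashed under obj->id: obj itself if it
 * was inserted, or the already-present object (obj is freed in that case).
 * Illustrative only.
 */
static struct test_obj *test_get_or_insert(struct rhashtable *ht,
                                           struct test_obj *obj)
{
        void *old;

        old = rhashtable_lookup_get_insert_fast(ht, &obj->node, test_params);
        if (IS_ERR(old))
                return old;             /* e.g. -ENOMEM or -E2BIG */
        if (!old)
                return obj;             /* no existing entry, obj inserted */

        kfree(obj);
        return old;                     /* existing object with the same id */
}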
+ */ +static inline void *rhashtable_lookup_get_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) +{ + BUG_ON(!ht->p.obj_hashfn || !key); + + return __rhashtable_insert_fast(ht, key, obj, params, false); +} + +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj, const struct rhashtable_params params) + struct rhash_head *obj, const struct rhashtable_params params, + bool rhlist) { + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; - spinlock_t * lock; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + pprev = NULL; + rht_lock(tbl, bkt); - spin_lock_bh(lock); + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + + list = container_of(he, struct rhlist_head, rhead); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { if (he != obj) { + struct rhlist_head __rcu **lpprev; + pprev = &he->next; - continue; + + if (!rhlist) + continue; + + do { + lpprev = &list->next; + list = rht_dereference_bucket(list->next, + tbl, hash); + } while (list && obj != &list->rhead); + + if (!list) + continue; + + list = rht_dereference_bucket(list->next, tbl, hash); + RCU_INIT_POINTER(*lpprev, list); + err = 0; + break; } - rcu_assign_pointer(*pprev, obj->next); + obj = rht_dereference_bucket(obj->next, tbl, hash); + err = 1; + + if (rhlist) { + list = rht_dereference_bucket(list->next, tbl, hash); + if (list) { + RCU_INIT_POINTER(list->rhead.next, obj); + obj = &list->rhead; + err = 0; + } + } + + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(tbl, bkt); + } else { + rht_assign_unlock(tbl, bkt, obj); + } + goto unlocked; + } + + rht_unlock(tbl, bkt); +unlocked: + if (err > 0) { + atomic_dec(&ht->nelems); + if (unlikely(ht->p.automatic_shrinking && + rht_shrink_below_30(ht, tbl))) + schedule_work(&ht->run_work); err = 0; - break; } - spin_unlock_bh(lock); + return err; +} + +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params, bool rhlist) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, + rhlist)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); return err; } +/** + * rhashtable_remove_fast - remove object from hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table if permitted when residency drops + * below 30%. + * + * Returns zero on success, -ENOENT if the entry could not be found. 
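Lookups walk the bucket chains under RCU, so an object taken out with rhashtable_remove_fast() must not be freed until readers can no longer see it. A sketch continuing the hypothetical test_obj example, additionally assuming the struct carries a struct rcu_head rcu member:

static int test_remove(struct rhashtable *ht, struct test_obj *obj)
{
        int ret = rhashtable_remove_fast(ht, &obj->node, test_params);

        /* 0 on success, -ENOENT if obj was not hashed (already removed) */
        if (!ret)
                kfree_rcu(obj, rcu);    /* free only after a grace period */
        return ret;
}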
+ */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(ht, obj, params, false); +} + +/** + * rhltable_remove - remove object from hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table if permitted when residency drops + * below 30% + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); +} + +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct rhash_head *he; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + + pprev = NULL; + rht_lock(tbl, bkt); + + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + if (pprev) { + rcu_assign_pointer(*pprev, obj_new); + rht_unlock(tbl, bkt); + } else { + rht_assign_unlock(tbl, bkt, obj_new); + } + err = 0; + goto unlocked; + } + + rht_unlock(tbl, bkt); + +unlocked: + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. + */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) { struct bucket_table *tbl; int err; @@ -442,22 +1207,62 @@ static inline int rhashtable_remove_fast( * visible then that guarantees the entry to still be in * the old tbl if it exists. 
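rhashtable_replace_fast(), documented above, swaps one object for another that hashes to the same key, so the element count and residency are unchanged. A sketch in the same hypothetical test_obj setting (obj_new must carry the same id, otherwise the call fails with -EINVAL):

static int test_replace(struct rhashtable *ht, struct test_obj *obj_old,
                        struct test_obj *obj_new)
{
        int ret;

        obj_new->id = obj_old->id;      /* keys must hash identically */

        ret = rhashtable_replace_fast(ht, &obj_old->node, &obj_new->node,
                                      test_params);
        if (!ret)
                kfree_rcu(obj_old, rcu);        /* unreachable after a grace period */
        return ret;
}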
*/ - while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) && + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; - if (err) - goto out; - - atomic_dec(&ht->nelems); - if (unlikely(ht->p.automatic_shrinking && - rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); - -out: rcu_read_unlock(); return err; } +/** + * rhltable_walk_enter - Initialise an iterator + * @hlt: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + return rhashtable_walk_enter(&hlt->ht, iter); +} + +/** + * rhltable_free_and_destroy - free elements and destroy hash list table + * @hlt: the hash list table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * See documentation for rhashtable_free_and_destroy. + */ +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + return rhltable_free_and_destroy(hlt, NULL, NULL); +} + #endif /* _LINUX_RHASHTABLE_H */ diff --git a/include/linux/six.h b/include/linux/six.h index 0e6df05..477c33e 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -196,6 +196,7 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); +void six_lock_pcpu_free_rcu(struct six_lock *); void six_lock_pcpu_free(struct six_lock *); void six_lock_pcpu_alloc(struct six_lock *); diff --git a/include/linux/slab.h b/include/linux/slab.h index b8a1235..775b7e3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -66,6 +66,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) #define kzfree(p) free(p) #define kvmalloc(size, flags) kmalloc(size, flags) +#define kvzalloc(size, flags) kzalloc(size, flags) #define kvfree(p) kfree(p) static inline struct page *alloc_pages(gfp_t flags, unsigned int order) diff --git a/include/linux/types.h b/include/linux/types.h index 1e12555..c9886cb 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -11,6 +11,8 @@ #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ #include +#include + #define BITS_PER_LONG __BITS_PER_LONG struct page; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 532f23b..cb22595 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -138,19 +138,18 @@ struct bpos { #define KEY_SNAPSHOT_MAX ((__u32)~0U) #define KEY_SIZE_MAX ((__u32)~0U) -static inline struct bpos POS(__u64 inode, __u64 offset) +static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) { - struct bpos ret; - - ret.inode = inode; - 
ret.offset = offset; - ret.snapshot = 0; - - return ret; + return (struct bpos) { + .inode = inode, + .offset = offset, + .snapshot = snapshot, + }; } -#define POS_MIN POS(0, 0) -#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) +#define POS_MIN SPOS(0, 0, 0) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS(_inode, _offset) SPOS(_inode, _offset, 0) /* Empty placeholder struct, for container_of() */ struct bch_val { @@ -707,7 +706,9 @@ struct bch_inode_generation { x(bi_foreground_target, 16) \ x(bi_background_target, 16) \ x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -743,6 +744,7 @@ enum { __BCH_INODE_I_SIZE_DIRTY= 5, __BCH_INODE_I_SECTORS_DIRTY= 6, __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, /* bits 20+ reserved for packed fields below: */ }; @@ -755,6 +757,7 @@ enum { #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) +#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -1204,7 +1207,9 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_new_versioning = 10, bcachefs_metadata_version_bkey_renumber = 10, bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_max = 12, + bcachefs_metadata_version_snapshot = 12, + bcachefs_metadata_version_inode_backpointers = 13, + bcachefs_metadata_version_max = 14, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1736,7 +1741,7 @@ struct btree_node { /* Closed interval: */ struct bpos min_key; struct bpos max_key; - struct bch_extent_ptr ptr; + struct bch_extent_ptr _ptr; /* not used anymore */ struct bkey_format format; union { diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index e1906f2..3af5606 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -614,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) return "incorrect number of fields"; for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > 64) + if (f->bits_per_field[i] > unpacked_bits) return "field too large"; - if (field_offset && - (f->bits_per_field[i] == 64 || - (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < - field_offset))) + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return "offset + bits overflow"; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) return "offset + bits overflow"; bits += f->bits_per_field[i]; @@ -1045,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, high_word(f, r), b->nr_key_bits); - EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), bkey_unpack_pos(b, r))); return ret; } @@ -1055,7 +1059,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, const struct bkey_packed *l, const struct bpos *r) { - return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); + return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); } __pure 
__flatten @@ -1076,7 +1080,7 @@ int bch2_bkey_cmp_packed(const struct btree *b, r = (void*) &unpacked; } - return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten @@ -1087,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b, const struct bkey *l_unpacked; return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bkey_cmp(l_unpacked->p, *r) + ? bpos_cmp(l_unpacked->p, *r) : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } @@ -1123,11 +1127,12 @@ void bch2_bkey_pack_test(void) struct bkey_packed p; struct bkey_format test_format = { - .key_u64s = 2, + .key_u64s = 3, .nr_fields = BKEY_NR_FIELDS, .bits_per_field = { 13, 64, + 32, }, }; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 629288a..2e45d88 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -33,16 +33,6 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) -static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, - struct bkey_packed *end) -{ - k = bkey_next(k); - - while (k != end && !k->u64s) - k = (void *) ((u64 *) k + 1); - return k; -} - #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) @@ -150,29 +140,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -#if 1 +static __always_inline int bpos_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset) ?: + cmp_int(l.snapshot, r.snapshot); +} + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { - if (l.inode != r.inode) - return l.inode < r.inode ? -1 : 1; - if (l.offset != r.offset) - return l.offset < r.offset ? -1 : 1; - if (l.snapshot != r.snapshot) - return l.snapshot < r.snapshot ? -1 : 1; - return 0; + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset); } -#else -int bkey_cmp(struct bpos l, struct bpos r); -#endif static inline struct bpos bpos_min(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) < 0 ? l : r; + return bpos_cmp(l, r) < 0 ? l : r; } static inline struct bpos bpos_max(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) > 0 ? l : r; + return bpos_cmp(l, r) > 0 ? 
l : r; } #define sbb(a, b, borrow) \ @@ -200,7 +188,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b) static inline struct bpos bpos_diff(struct bpos l, struct bpos r) { - if (bkey_cmp(l, r) > 0) + if (bpos_cmp(l, r) > 0) swap(l, r); return bpos_sub(r, l); @@ -262,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format) format->bits_per_field[BKEY_FIELD_SNAPSHOT]; } -static inline struct bpos bkey_successor(struct bpos p) +static inline struct bpos bpos_successor(struct bpos p) { - struct bpos ret = p; + if (!++p.snapshot && + !++p.offset && + !++p.inode) + BUG(); - if (!++ret.offset) - BUG_ON(!++ret.inode); + return p; +} - return ret; +static inline struct bpos bpos_predecessor(struct bpos p) +{ + if (!p.snapshot-- && + !p.offset-- && + !p.inode--) + BUG(); + + return p; } -static inline struct bpos bkey_predecessor(struct bpos p) +static inline struct bpos bpos_nosnap_successor(struct bpos p) { - struct bpos ret = p; + p.snapshot = 0; - if (!ret.offset--) - BUG_ON(!ret.inode--); + if (!++p.offset && + !++p.inode) + BUG(); - return ret; + return p; +} + +static inline struct bpos bpos_nosnap_predecessor(struct bpos p) +{ + p.snapshot = 0; + + if (!p.offset-- && + !p.inode--) + BUG(); + + return p; } static inline u64 bkey_start_offset(const struct bkey *k) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 641169e..6fe95b8 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -119,9 +119,16 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return "nonzero size field"; } - if (k.k->p.snapshot) + if (type != BKEY_TYPE_btree && + !btree_type_has_snapshots(type) && + k.k->p.snapshot) return "nonzero snapshot"; + if (type != BKEY_TYPE_btree && + btree_type_has_snapshots(type) && + k.k->p.snapshot != U32_MAX) + return "invalid snapshot field"; + if (type != BKEY_TYPE_btree && !bkey_cmp(k.k->p, POS_MAX)) return "POS_MAX key"; @@ -138,10 +145,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) { - if (bkey_cmp(k.k->p, b->data->min_key) < 0) + if (bpos_cmp(k.k->p, b->data->min_key) < 0) return "key before start of btree node"; - if (bkey_cmp(k.k->p, b->data->max_key) > 0) + if (bpos_cmp(k.k->p, b->data->max_key) > 0) return "key past end of btree node"; return NULL; @@ -165,9 +172,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - if (!bkey_cmp(pos, POS_MIN)) + if (!bpos_cmp(pos, POS_MIN)) pr_buf(out, "POS_MIN"); - else if (!bkey_cmp(pos, POS_MAX)) + else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); else { if (pos.inode == U64_MAX) @@ -256,7 +263,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || - bkey_cmp(l.k->p, bkey_start_pos(r.k))) + bpos_cmp(l.k->p, bkey_start_pos(r.k))) return BCH_MERGE_NOMERGE; ret = ops->key_merge(c, l, r); @@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, const struct bkey_ops *ops; struct bkey uk; struct bkey_s u; + unsigned nr_compat = 5; int i; /* * Do these operations in reverse order in the write path: */ - for (i = 0; i < 4; i++) - switch (!write ? i : 3 - i) { + for (i = 0; i < nr_compat; i++) + switch (!write ? 
i : nr_compat - 1 - i) { case 0: if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_key(f, k); @@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; case 3: + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + struct bkey_i *u = packed_to_bkey(k); + + if (u) { + u->k.p.snapshot = write + ? 0 : U32_MAX; + } else { + u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + uk = __bch2_bkey_unpack_key(f, k); + uk.p.snapshot = write + ? min_packed : min_t(u64, U32_MAX, max_packed); + + BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); + } + } + + break; + case 4: if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index f250707..537ab79 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(!iter->used); - i->k = bkey_next_skip_noops(i->k, i->end); + i->k = bkey_next(i->k); BUG_ON(i->k > i->end); diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 87f951e..3fb9a9e 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, for (_k = i->start; _k < vstruct_last(i); _k = _n) { - _n = bkey_next_skip_noops(_k, vstruct_last(i)); + _n = bkey_next(_k); k = bkey_disassemble(b, _k, &uk); if (c) @@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, n = bkey_unpack_key(b, _n); - if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { + if (bpos_cmp(n.p, k.k->p) < 0) { printk(KERN_ERR "Key skipped backwards\n"); continue; } if (!bkey_deleted(k.k) && - !bkey_cmp(n.p, k.k->p)) + !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } } @@ -534,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, goto start; while (1) { if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, + BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, bkey_unpack_pos(b, k))); start: if (++j == t->size) @@ -544,7 +544,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + k = bkey_next(k); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -686,16 +686,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (is_power_of_2(j) && !min_key->u64s) { - k = (void *) min_key; - bkey_init(&k->k); - k->k.p = b->data->min_key; + if (!bkey_pack_pos(min_key, b->data->min_key, b)) { + k = (void *) min_key; + bkey_init(&k->k); + k->k.p = b->data->min_key; + } } if (is_power_of_2(j + 1) && !max_key->u64s) { - k = (void *) max_key; - bkey_init(&k->k); - k->k.p = t->max_key; + if (!bkey_pack_pos(max_key, b->data->max_key, b)) { + k = (void *) max_key; + bkey_init(&k->k); + k->k.p = t->max_key; + } } __make_bfloat(b, t, j, min_key, max_key); @@ -759,7 +763,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -776,14 +780,19 @@ retry: } while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); t->max_key = bkey_unpack_pos(b, prev); - bkey_init(&min_key.k); - min_key.k.p = 
b->data->min_key; - bkey_init(&max_key.k); - max_key.k.p = t->max_key; + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + } + + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); + max_key.k.p = t->max_key; + } /* Then we build the tree */ eytzinger1_for_each(j, t->size) @@ -911,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next_skip_noops(i, k)) + for (i = p; i != k; i = bkey_next(i)) if (i->type >= min_key_type) ret = i; @@ -922,10 +931,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, BUG_ON(ret >= orig_k); for (i = ret - ? bkey_next_skip_noops(ret, orig_k) + ? bkey_next(ret) : btree_bkey_first(b, t); i != orig_k; - i = bkey_next_skip_noops(i, orig_k)) + i = bkey_next(i)) BUG_ON(i->type >= min_key_type); } @@ -960,7 +969,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { + if (bkey_next(k) == btree_bkey_last(b, t)) { t->max_key = bkey_unpack_pos(b, k); for (j = 1; j < t->size; j = j * 2 + 1) @@ -1084,7 +1093,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next_skip_noops(k, end); + k = bkey_next(k); if (k == end) break; @@ -1170,15 +1179,14 @@ void bch2_bset_delete(struct btree *b, __flatten static struct bkey_packed *bset_search_write_set(const struct btree *b, struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *packed_search) + struct bpos *search) { unsigned l = 0, r = t->size; while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) l = m; else r = m; @@ -1238,9 +1246,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, prefetch(&base->f[n << 4]); f = &base->f[n]; - - if (!unlikely(packed_search)) - goto slowpath; if (unlikely(f->exponent >= BFLOAT_FAILED)) goto slowpath; @@ -1304,7 +1309,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_NO_AUX_TREE: return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search, lossy_packed_search); + return bset_search_write_set(b, t, search); case BSET_RO_AUX_TREE: /* * Each node in the auxiliary search tree covers a certain range @@ -1313,7 +1318,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, * start and end - handle that here: */ - if (bkey_cmp(*search, t->max_key) > 0) + if (bpos_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); return bset_search_tree(b, t, search, lossy_packed_search); @@ -1334,12 +1339,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1403,16 +1408,15 @@ noinline __flatten __attribute__((cold)) static void 
btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; + struct bkey_packed *k; trace_bkey_pack_pos_fail(search); - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, NULL, NULL), - btree_bkey_last(b, t)); + bch2_btree_node_iter_init_from_start(iter, b); - bch2_btree_node_iter_sort(iter, b); + while ((k = bch2_btree_node_iter_peek(iter, b)) && + bkey_iter_pos_cmp(b, k, search) < 0) + bch2_btree_node_iter_advance(iter, b); } /** @@ -1446,7 +1450,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * to the search key is going to have 0 sectors after the search key. * * But this does mean that we can't just search for - * bkey_successor(start_of_range) to get the first extent that overlaps with + * bpos_successor(start_of_range) to get the first extent that overlaps with * the range we want - if we're unlucky and there's an extent that ends * exactly where we searched, then there could be a deleted key at the same * position and we'd get that when we search instead of the preceding extent @@ -1464,7 +1468,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct bkey_packed *k[MAX_BSETS]; unsigned i; - EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 54b364c..506da4e 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ _k != btree_bkey_last(_b, _t); \ - _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + _k = bkey_next(_k)) static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { @@ -378,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, EBUG_ON(r_packed && !bkey_packed(r_packed)); if (unlikely(!bkey_packed(l))) - return bkey_cmp(packed_to_bkey_c(l)->p, *r); + return bpos_cmp(packed_to_bkey_c(l)->p, *r); if (likely(r_packed)) return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); @@ -403,24 +403,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) return bch2_bkey_prev_filter(b, t, k, 1); } -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; - - return (cmp1 << 1) + cmp2; -} - /* Btree key iteration */ void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index fc76e78..8a4667b 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -149,7 +149,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, if (level) six_lock_pcpu_alloc(&b->c.lock); else - six_lock_pcpu_free(&b->c.lock); + six_lock_pcpu_free_rcu(&b->c.lock); mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); @@ -814,9 +814,9 @@ lock_node: EBUG_ON(b->c.btree_id != iter->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + 
EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bkey_cmp(b->data->min_key, + bpos_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; @@ -897,9 +897,9 @@ lock_node: EBUG_ON(b->c.btree_id != btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bkey_cmp(b->data->min_key, + bpos_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1011,7 +1011,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - if (bkey_cmp(bkey_successor(n1->key.k.p), + if (bpos_cmp(bpos_successor(n1->key.k.p), n2->data->min_key)) { char buf1[200], buf2[200]; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 6d5ed77..88c549c 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, struct bpos node_end = b->data->max_key; struct bpos expected_start = bkey_deleted(&prev->k->k) ? node_start - : bkey_successor(prev->k->k.p); + : bpos_successor(prev->k->k.p); char buf1[200], buf2[200]; bool update_min = false; bool update_max = false; @@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); } - if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, + if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " cur %s", @@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, } if (fsck_err_on(is_last && - bkey_cmp(cur.k->k.p, node_end), c, + bpos_cmp(cur.k->k.p, node_end), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", @@ -470,8 +470,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, k, &max_stale, true); @@ -560,13 +560,13 @@ static int bch2_gc_btree_init(struct bch_fs *c, return 0; six_lock_read(&b->c.lock, NULL, NULL); - if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { BUG(); } - if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, + if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, "btree root with incorrect max_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { BUG(); @@ -1148,7 +1148,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -1171,6 +1173,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_btree_iter_advance(iter); } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1271,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs 
*c, struct btree_iter *iter, /* Find a format that all keys in @old_nodes can pack into */ bch2_bkey_format_init(&format_state); + /* + * XXX: this won't correctly take it account the new min/max keys: + */ for (i = 0; i < nr_old_nodes; i++) __bch2_btree_calc_format(&format_state, old_nodes[i]); @@ -1333,7 +1339,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next_skip_noops(k, vstruct_last(s2))) { + k = bkey_next(k)) { last = k; u64s += k->u64s; } @@ -1362,7 +1368,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = bkey_successor(n1->data->max_key); + n2->data->min_key = bpos_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); @@ -1405,7 +1411,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, unsigned j; for (j = 0; j < nr_new_nodes; j++) - if (!bkey_cmp(old_nodes[i]->key.k.p, + if (!bpos_cmp(old_nodes[i]->key.k.p, new_nodes[j]->key.k.p)) goto next; diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index c3d02f5..b1362a9 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - if (l.phase != r.phase) - return l.phase < r.phase ? -1 : 1; - if (bkey_cmp(l.pos, r.pos)) - return bkey_cmp(l.pos, r.pos); - if (l.level != r.level) - return l.level < r.level ? -1 : 1; - return 0; + return cmp_int(l.phase, r.phase) ?: + bpos_cmp(l.pos, r.pos) ?: + cmp_int(l.level, r.level); } static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 9b74e79..b43d446 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -32,13 +32,13 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next_skip_noops(start, end); + for (p = start, k = bkey_next(start); k != end; - p = k, k = bkey_next_skip_noops(k, end)) { + p = k, k = bkey_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); } #endif } @@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; - k != vstruct_last(i); - k = bkey_next_skip_noops(k, vstruct_last(i))) + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) k->needs_whiteout = v; } @@ -213,7 +211,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next_skip_noops(k, end); + n = bkey_next(k); if (!bkey_deleted(k)) { bkey_copy(out, k); @@ -614,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect level"); - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &bn->ptr; - - *p = swab64(*p); - } - if (!write) compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); @@ -633,14 +625,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, b->data->max_key = b->key.k.p; } - btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, 
NULL, "incorrect min_key: got %s should be %s", (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); } - btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); @@ -754,7 +746,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, } prev = k; - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } fsck_err: return ret; @@ -947,7 +939,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bp.v->mem_ptr = 0; } - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } bch2_bset_build_aux_tree(b, b->set, false); @@ -1327,8 +1319,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) return -1; - ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: - validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + validate_bset(c, NULL, b, i, sectors, WRITE, false); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -1481,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 16ce6df..9c14cd3 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bkey_format *f) + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) { if (version < bcachefs_metadata_version_inode_btree_change && btree_id == BTREE_ID_inodes) { @@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, swap(f->field_offset[BKEY_FIELD_INODE], f->field_offset[BKEY_FIELD_OFFSET]); } + + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + u64 max_packed = + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 
0 + : U32_MAX - max_packed; + } } static inline void compat_bpos(unsigned level, enum btree_id btree_id, @@ -220,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, { if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && write) - bn->min_key = bkey_predecessor(bn->min_key); + bn->min_key = bpos_nosnap_predecessor(bn->min_key); + + if (version < bcachefs_metadata_version_snapshot && + write) + bn->max_key.snapshot = 0; compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + if (version < bcachefs_metadata_version_snapshot && + !write) + bn->max_key.snapshot = U32_MAX; + if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && !write) - bn->min_key = bkey_successor(bn->min_key); + bn->min_key = bpos_nosnap_successor(bn->min_key); } #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 459d27c..8190e73 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -18,6 +18,36 @@ static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); + } else { + p = bpos_nosnap_successor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? 
*/ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); + } else { + p = bpos_nosnap_predecessor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && @@ -30,20 +60,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_IS_EXTENTS) && bkey_cmp(pos, POS_MAX)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); return pos; } static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(iter->real_pos, b->data->min_key) < 0; + return bpos_cmp(iter->real_pos, b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(b->key.k.p, iter->real_pos) < 0; + return bpos_cmp(b->key.k.p, iter->real_pos) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -285,7 +315,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && - bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, btree_iter_type(linked))) <= 0) { deadlock_iter = linked; reason = 7; @@ -583,10 +613,24 @@ err: static void bch2_btree_iter_verify(struct btree_iter *iter) { + enum btree_iter_type type = btree_iter_type(iter); unsigned i; EBUG_ON(iter->btree_id >= BTREE_ID_NR); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type == BTREE_ITER_NODES && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type != BTREE_ITER_NODES && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + bch2_btree_iter_verify_locks(iter); for (i = 0; i < BTREE_MAX_DEPTH; i++) @@ -597,6 +641,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { enum btree_iter_type type = btree_iter_type(iter); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + BUG_ON((type == BTREE_ITER_KEYS || type == BTREE_ITER_CACHED) && (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || @@ -1384,7 +1431,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (!b) return NULL; - BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); iter->pos = iter->real_pos = b->key.k.p; @@ -1421,12 +1468,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!b) return NULL; - if (bkey_cmp(iter->pos, b->key.k.p) < 0) { + if (bpos_cmp(iter->pos, b->key.k.p) < 0) { /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ - btree_iter_set_search_pos(iter, bkey_successor(iter->pos)); + btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); /* Unlock to avoid screwing up our lock invariants: */ btree_node_unlock(iter, iter->level); @@ -1453,7 +1500,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { - int cmp = bkey_cmp(new_pos, iter->real_pos); + int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; if (!cmp) @@ -1497,10 +1544,10 @@ out: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - 
bool ret = bkey_cmp(pos, POS_MAX) != 0; + bool ret = bpos_cmp(pos, POS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; } @@ -1508,10 +1555,10 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = bkey_cmp(pos, POS_MIN) != 0; + bool ret = bpos_cmp(pos, POS_MIN) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_predecessor(pos); + pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; } @@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bkey_cmp(next_pos, POS_MAX) != 0; + bool ret = bpos_cmp(next_pos, POS_MAX) != 0; /* * Typically, we don't want to modify iter->pos here, since that @@ -1527,7 +1574,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) * btree, in that case we want iter->pos to reflect that: */ if (ret) - btree_iter_set_search_pos(iter, bkey_successor(next_pos)); + btree_iter_set_search_pos(iter, bpos_successor(next_pos)); else bch2_btree_iter_set_pos(iter, POS_MAX); @@ -1537,10 +1584,10 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->data->min_key; - bool ret = bkey_cmp(next_pos, POS_MIN) != 0; + bool ret = bpos_cmp(next_pos, POS_MIN) != 0; if (ret) - btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); + btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); else bch2_btree_iter_set_pos(iter, POS_MIN); @@ -1586,13 +1633,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi k = btree_iter_level_peek(iter, &iter->l[0]); if (next_update && - bkey_cmp(next_update->k.p, iter->real_pos) <= 0) + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) k = bkey_i_to_s_c(next_update); if (likely(k.k)) { if (bkey_deleted(k.k)) { btree_iter_set_search_pos(iter, - bkey_successor(k.k->p)); + bkey_successor(iter, k.k->p)); continue; } @@ -1731,7 +1778,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); + bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); } pos = iter->pos; @@ -1965,6 +2012,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, { struct btree_iter *iter, *best = NULL; + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) + pos.snapshot = btree_type_has_snapshots(btree_id) + ? 
U32_MAX : 0; + /* We always want a fresh iterator for node iterators: */ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) goto alloc_iter; @@ -1999,11 +2054,14 @@ alloc_iter: if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS)) + !(flags & BTREE_ITER_NOT_EXTENTS) && + !(flags & BTREE_ITER_ALL_SNAPSHOTS)) flags |= BTREE_ITER_IS_EXTENTS; iter->flags = flags; + iter->snapshot = pos.snapshot; + if (!(iter->flags & BTREE_ITER_INTENT)) bch2_btree_iter_downgrade(iter); else if (!iter->locks_want) @@ -2026,6 +2084,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, __bch2_trans_get_iter(trans, btree_id, pos, BTREE_ITER_NODES| BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| flags); unsigned i; @@ -2127,6 +2186,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->nr_updates2 = 0; trans->mem_top = 0; + trans->hooks = NULL; trans->extra_journal_entries = NULL; trans->extra_journal_entry_u64s = 0; @@ -2137,7 +2197,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } - bch2_trans_cond_resched(trans); + if (!(flags & TRANS_RESET_NOUNLOCK)) + bch2_trans_cond_resched(trans); if (!(flags & TRANS_RESET_NOTRAVERSE)) bch2_btree_iter_traverse_all(trans); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 8768f4c..7585f98 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *); static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + bkey_init(&iter->k); iter->k.p = iter->pos = new_pos; } @@ -303,6 +306,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr } #define TRANS_RESET_NOTRAVERSE (1 << 0) +#define TRANS_RESET_NOUNLOCK (1 << 1) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 0b35456..04354f5 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached_key *key = arg->key; return cmp_int(ck->key.btree_id, key->btree_id) ?: - bkey_cmp(ck->key.pos, key->pos); + bpos_cmp(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c, bch2_btree_key_cache_params)); memset(&ck->key, ~0, sizeof(ck->key)); - c->nr_keys--; + atomic_long_dec(&c->nr_keys); } static void bkey_cached_free(struct btree_key_cache *bc, @@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry_reverse(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) { - c->nr_freed--; - return ck; - } - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); @@ -114,11 +108,39 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - list_for_each_entry(ck, &c->clean, list) + return NULL; +} + +static struct bkey_cached * +bkey_cached_reuse(struct btree_key_cache *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct bkey_cached *ck; + unsigned i; + + mutex_lock(&c->lock); + list_for_each_entry_reverse(ck, &c->freed, list) if 
(bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); + c->nr_freed--; + list_del(&ck->list); + mutex_unlock(&c->lock); return ck; } + mutex_unlock(&c->lock); + + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + rcu_read_unlock(); + return ck; + } + } + rcu_read_unlock(); return NULL; } @@ -129,10 +151,17 @@ btree_key_cache_create(struct btree_key_cache *c, struct bpos pos) { struct bkey_cached *ck; + bool was_new = true; ck = bkey_cached_alloc(c); - if (!ck) - return ERR_PTR(-ENOMEM); + + if (unlikely(!ck)) { + ck = bkey_cached_reuse(c); + if (unlikely(!ck)) + return ERR_PTR(-ENOMEM); + + was_new = false; + } ck->c.level = 0; ck->c.btree_id = btree_id; @@ -141,17 +170,26 @@ btree_key_cache_create(struct btree_key_cache *c, ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)) { + bch2_btree_key_cache_params))) { /* We raced with another fill: */ - bkey_cached_free(c, ck); + + if (likely(was_new)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { + mutex_lock(&c->lock); + bkey_cached_free(c, ck); + mutex_unlock(&c->lock); + } + return NULL; } - c->nr_keys++; + atomic_long_inc(&c->nr_keys); - list_move(&ck->list, &c->clean); six_unlock_write(&ck->c.lock); return ck; @@ -213,7 +251,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) const struct btree_iter *iter = p; return ck->key.btree_id == iter->btree_id && - !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; + !bpos_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } __flatten @@ -238,11 +276,8 @@ retry: return 0; } - mutex_lock(&c->btree_key_cache.lock); ck = btree_key_cache_create(&c->btree_key_cache, iter->btree_id, iter->pos); - mutex_unlock(&c->btree_key_cache.lock); - ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; @@ -257,7 +292,7 @@ retry: if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { goto retry; } @@ -267,7 +302,7 @@ retry: } if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } @@ -370,15 +405,13 @@ err: bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); + BUG_ON(!btree_node_locked(c_iter, 0)); + if (!evict) { - mutex_lock(&c->btree_key_cache.lock); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty--; + atomic_long_dec(&c->btree_key_cache.nr_dirty); } - - list_move_tail(&ck->list, &c->btree_key_cache.clean); - mutex_unlock(&c->btree_key_cache.lock); } else { evict: BUG_ON(!btree_node_intent_locked(c_iter, 0)); @@ -388,13 +421,14 @@ evict: six_lock_write(&ck->c.lock, NULL, NULL); - mutex_lock(&c->btree_key_cache.lock); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty--; + atomic_long_dec(&c->btree_key_cache.nr_dirty); } bkey_cached_evict(&c->btree_key_cache, ck); + + mutex_lock(&c->btree_key_cache.lock); bkey_cached_free(&c->btree_key_cache, ck); mutex_unlock(&c->btree_key_cache.lock); } @@ -475,16 +509,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - mutex_lock(&c->btree_key_cache.lock); - list_move(&ck->list, &c->btree_key_cache.dirty); - set_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty++; + atomic_long_inc(&c->btree_key_cache.nr_dirty); if (bch2_nr_btree_keys_need_flush(c)) kick_reclaim = true; - - mutex_unlock(&c->btree_key_cache.lock); } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, @@ -509,9 +538,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = container_of(shrink, struct bch_fs, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; + struct bucket_table *tbl; struct bkey_cached *ck, *t; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned flags; + unsigned start, flags; + int srcu_idx; /* Return -1 if we can't do anything right now */ if (sc->gfp_mask & __GFP_FS) @@ -519,6 +550,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, else if (!mutex_trylock(&bc->lock)) return -1; + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); /* @@ -540,23 +572,40 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (scanned >= nr) goto out; - list_for_each_entry_safe(ck, t, &bc->clean, list) { - if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - else if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); - } + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + start = bc->shrink_iter; - scanned++; - if (scanned >= nr) { - if (&t->list != 
&bc->clean) - list_move_tail(&bc->clean, &t->list); - goto out; + do { + struct rhash_head *pos, *next; + + rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + continue; + + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) + break; } - } + + bc->shrink_iter++; + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + } while (scanned < nr && bc->shrink_iter != start); + + rcu_read_unlock(); out: memalloc_nofs_restore(flags); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); mutex_unlock(&bc->lock); return freed; @@ -569,41 +618,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; - return bc->nr_keys; + return atomic_long_read(&bc->nr_keys); } void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bucket_table *tbl; struct bkey_cached *ck, *n; + struct rhash_head *pos; + unsigned i; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); - list_splice(&bc->dirty, &bc->clean); - list_for_each_entry_safe(ck, n, &bc->clean, list) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } + rcu_read_unlock(); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); bch2_journal_preres_put(&c->journal, &ck->res); - kfree(ck->k); list_del(&ck->list); + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); - bc->nr_keys--; } - BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); - BUG_ON(bc->nr_keys); - - list_for_each_entry_safe(ck, n, &bc->freed, list) { - cond_resched(); + BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); + BUG_ON(atomic_long_read(&bc->nr_keys)); - list_del(&ck->list); - kmem_cache_free(bch2_key_cache, ck); - } mutex_unlock(&bc->lock); if (bc->table_init_done) @@ -614,8 +667,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); INIT_LIST_HEAD(&c->freed); - INIT_LIST_HEAD(&c->clean); - INIT_LIST_HEAD(&c->dirty); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) @@ -641,8 +692,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); - pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); - pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); + pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); + pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); } void bch2_btree_key_cache_exit(void) diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index 2f8b552..02715cd 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -3,8 +3,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { - size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = 
atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 1024 + nr_keys / 2; return max_t(ssize_t, 0, nr_dirty - max_dirty); @@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) { - size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; return nr_dirty > max_dirty && diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 5999044..1941616 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -216,6 +216,7 @@ enum btree_iter_type { #define BTREE_ITER_CACHED_NOFILL (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 10) #define BTREE_ITER_NOT_EXTENTS (1 << 11) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -245,6 +246,8 @@ struct btree_iter { /* what we're searching for/what the iterator actually points to: */ struct bpos real_pos; struct bpos pos_after_commit; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; u16 flags; u8 idx; @@ -292,13 +295,12 @@ struct btree_key_cache { struct rhashtable table; bool table_init_done; struct list_head freed; - struct list_head clean; - struct list_head dirty; struct shrinker shrink; + unsigned shrink_iter; size_t nr_freed; - size_t nr_keys; - size_t nr_dirty; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; }; struct bkey_cached_key { @@ -330,7 +332,7 @@ struct bkey_cached { struct btree_insert_entry { unsigned trigger_flags; u8 bkey_type; - u8 btree_id; + enum btree_id btree_id:8; u8 level; unsigned trans_triggers_run:1; unsigned is_extent:1; @@ -344,6 +346,14 @@ struct btree_insert_entry { #define BTREE_ITER_MAX 32 #endif +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + struct btree_trans { struct bch_fs *c; #ifdef CONFIG_BCACHEFS_DEBUG @@ -378,6 +388,7 @@ struct btree_trans { struct btree_insert_entry *updates2; /* update path: */ + struct btree_trans_commit_hook *hooks; struct jset_entry *extra_journal_entries; unsigned extra_journal_entry_u64s; struct journal_entry_pin *journal_pin; @@ -600,6 +611,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) + +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +} + enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index a251380..4ce12ae 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_trigger_flags); +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); /** 
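The btree_types.h and btree_update.h hunks above introduce a transaction commit-hook interface: a caller embeds a struct btree_trans_commit_hook, points its ->fn at a callback, and registers it with bch2_trans_commit_hook(); the registered hooks are then walked from the commit path (see the btree_update_leaf.c hunk further down), and a nonzero return from a hook fails the commit. A minimal illustrative sketch of a caller follows, not part of this patch; my_hook, my_commit_check and my_register are hypothetical names:

/*
 * Illustrative sketch only, not from this patch. Assumes the hook API
 * declared above: ->fn runs from the commit path before the updates are
 * applied, and a nonzero return aborts the transaction commit.
 */
struct my_hook {
	struct btree_trans_commit_hook	h;	/* must outlive the commit */
	u64				inum;	/* example payload */
};

static int my_commit_check(struct btree_trans *trans,
			   struct btree_trans_commit_hook *h)
{
	struct my_hook *hook = container_of(h, struct my_hook, h);

	/* nonzero return here fails the commit */
	return hook->inum ? 0 : -EINVAL;
}

static void my_register(struct btree_trans *trans, struct my_hook *hook)
{
	hook->h.fn = my_commit_check;
	bch2_trans_commit_hook(trans, &hook->h);	/* links onto trans->hooks */
}
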
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a661bc0..19dfc32 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - if (bkey_cmp(next_node, bp.v->min_key)) { + if (bpos_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); panic("expected next min_key %s got %s\n", (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), @@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - if (bkey_cmp(k.k->p, b->key.k.p)) { + if (bpos_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); panic("expected end %s got %s\n", (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), @@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; } - next_node = bkey_successor(k.k->p); + next_node = bpos_successor(k.k->p); } #endif } @@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) struct bset_tree *t; struct bkey uk; - bch2_bkey_format_add_pos(s, b->data->min_key); - for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) { @@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b) struct bkey_format_state s; bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, b->data->min_key); + bch2_bkey_format_add_pos(&s, b->data->max_key); __bch2_btree_calc_format(&s, b); return bch2_bkey_format_done(&s); @@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); @@ -1095,10 +1094,12 @@ static struct btree *__btree_split_node(struct btree_update *as, struct btree *n1, struct btree_iter *iter) { + struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; struct btree *n2; struct bset *set1, *set2; - struct bkey_packed *k, *prev = NULL; + struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; + struct bpos n1_pos; n2 = bch2_btree_node_alloc(as, n1->c.level); bch2_btree_update_add_new_node(as, n2); @@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as, SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; - btree_node_set_format(n2, n2->data->format); - set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1119,7 +1118,7 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + struct bkey_packed *n = bkey_next(k); if (n == vstruct_last(set1)) break; @@ -1136,33 +1135,53 @@ static struct btree *__btree_split_node(struct btree_update *as, } BUG_ON(!prev); + set2_start = k; + set2_end = vstruct_last(set1); - btree_set_max(n1, bkey_unpack_pos(n1, prev)); - btree_set_min(n2, bkey_successor(n1->key.k.p)); - - set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); - set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); - + set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); set_btree_bset_end(n1, n1->set); - 
set_btree_bset_end(n2, n2->set); - - n2->nr.live_u64s = le16_to_cpu(set2->u64s); - n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); - n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; - n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; n1->nr.live_u64s = le16_to_cpu(set1->u64s); n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); n1->nr.packed_keys = nr_packed; n1->nr.unpacked_keys = nr_unpacked; + n1_pos = bkey_unpack_pos(n1, prev); + if (as->c->sb.version < bcachefs_metadata_version_snapshot) + n1_pos.snapshot = U32_MAX; + + btree_set_max(n1, n1_pos); + btree_set_min(n2, bpos_successor(n1->key.k.p)); + + bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, n2->data->min_key); + bch2_bkey_format_add_pos(&s, n2->data->max_key); + + for (k = set2_start; k != set2_end; k = bkey_next(k)) { + struct bkey uk = bkey_unpack_key(n1, k); + bch2_bkey_format_add_key(&s, &uk); + } + + n2->data->format = bch2_bkey_format_done(&s); + btree_node_set_format(n2, n2->data->format); + + out = set2->start; + memset(&n2->nr, 0, sizeof(n2->nr)); + + for (k = set2_start; k != set2_end; k = bkey_next(k)) { + BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) + ? &n1->format : &bch2_bkey_format_current, k)); + out->format = KEY_FORMAT_LOCAL_BTREE; + btree_keys_account_key_add(&n2->nr, 0, out); + out = bkey_next(out); + } + + set2->u64s = cpu_to_le16((u64 *) out - set2->_data); + set_btree_bset_end(n2, n2->set); + BUG_ON(!set1->u64s); BUG_ON(!set2->u64s); - memcpy_u64s(set2->start, - vstruct_end(set1), - le16_to_cpu(set2->u64s)); - btree_node_reset_sib_u64s(n1); btree_node_reset_sib_u64s(n2); @@ -1216,7 +1235,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, i = btree_bset_first(b); src = dst = i->start; while (src != vstruct_last(i)) { - n = bkey_next_skip_noops(src, vstruct_last(i)); + n = bkey_next(src); if (!bkey_deleted(src)) { memmove_u64s_down(dst, src, src->u64s); dst = bkey_next(dst); @@ -1563,8 +1582,10 @@ retry: } bch2_bkey_format_init(&new_s); - __bch2_btree_calc_format(&new_s, b); - __bch2_btree_calc_format(&new_s, m); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); + __bch2_btree_calc_format(&new_s, prev); + __bch2_btree_calc_format(&new_s, next); + bch2_bkey_format_add_pos(&new_s, next->data->max_key); new_f = bch2_bkey_format_done(&new_s); sib_u64s = btree_node_u64s_with_format(b, &new_f) + diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index d9308bd..67a2c65 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, { return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p); + bpos_cmp(l->k->k.p, r->k->k.p); } static inline bool same_leaf_as_prev(struct btree_trans *trans, @@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); - EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->trans->c, b)); EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); @@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, { struct bch_fs 
*c = trans->c; - BUG_ON(bch2_debug_check_bkeys && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); - BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos)); + if (bch2_debug_check_bkeys) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + panic("invalid bkey %s on insert: %s\n", buf, invalid); + } + } + BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); } @@ -369,6 +377,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; + struct btree_trans_commit_hook *h; unsigned u64s = 0; bool marking = false; int ret; @@ -386,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + return ret; + h = h->next; + } + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) @@ -556,6 +573,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; + if (!(trans->flags & BTREE_INSERT_NOUNLOCK)) trans_for_each_update2(trans, i) if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && !same_leaf_as_prev(trans, i)) @@ -826,7 +844,7 @@ int __bch2_trans_commit(struct btree_trans *trans) struct btree_insert_entry *i = NULL; struct btree_iter *iter; bool trans_trigger_run; - unsigned u64s; + unsigned u64s, reset_flags = 0; int ret = 0; if (!trans->nr_updates) @@ -940,7 +958,11 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_reset: - bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); + if (!ret) + reset_flags |= TRANS_RESET_NOTRAVERSE; + if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) + reset_flags |= TRANS_RESET_NOUNLOCK; + bch2_trans_reset(trans, reset_flags); return ret; err: @@ -1053,6 +1075,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, return 0; } +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index c6d49f4..acf6003 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); k = bch2_btree_iter_peek(iter); while (k.k && !(err = bkey_err(k))) { @@ -273,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (err) return err; - if (!i->size || !bkey_cmp(POS_MAX, i->from)) + if (!i->size || !bpos_cmp(POS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); @@ -289,8 +291,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, * can't easily correctly restart a btree node traversal across * all nodes, meh */ - i->from = bkey_cmp(POS_MAX, b->key.k.p) - ? bkey_successor(b->key.k.p) + i->from = bpos_cmp(POS_MAX, b->key.k.p) + ? 
bpos_successor(b->key.k.p) : b->key.k.p; if (!i->size) diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 592dd80..cf4ce2e 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, int bch2_dirent_create(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - int flags) + u64 *dir_offset, int flags) { struct bkey_i_dirent *dirent; int ret; @@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans, if (ret) return ret; - return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); + *dir_offset = dirent->k.p.offset; + + return ret; } static void dirent_copy_target(struct bkey_i_dirent *dst, @@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, int bch2_dirent_rename(struct btree_trans *trans, u64 src_dir, struct bch_hash_info *src_hash, u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, - const struct qstr *dst_name, u64 *dst_inum, + const struct qstr *src_name, u64 *src_inum, u64 *src_offset, + const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { struct btree_iter *src_iter = NULL, *dst_iter = NULL; @@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans, new_dst->k.p = src_iter->pos; bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); - goto out; + goto out_set_offset; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans, bch2_trans_update(trans, src_iter, &new_src->k_i, 0); bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); +out_set_offset: + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; out: bch2_trans_iter_put(trans, src_iter); bch2_trans_iter_put(trans, dst_iter); diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 3476937..e1d8ce3 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_create(struct btree_trans *, u64, const struct bch_hash_info *, u8, - const struct qstr *, u64, int); + const struct qstr *, u64, u64 *, int); int bch2_dirent_delete_at(struct btree_trans *, const struct bch_hash_info *, @@ -46,8 +46,8 @@ enum bch_rename_mode { int bch2_dirent_rename(struct btree_trans *, u64, struct bch_hash_info *, u64, struct bch_hash_info *, - const struct qstr *, u64 *, - const struct qstr *, u64 *, + const struct qstr *, u64 *, u64 *, + const struct qstr *, u64 *, u64 *, enum bch_rename_mode); struct btree_iter * diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 1dba7e9..f712f68 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, if (ret) break; } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index a7e0408..b07d395 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -180,7 +180,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - if (bp.v->min_key.snapshot) + if (c->sb.version < 
bcachefs_metadata_version_snapshot && + bp.v->min_key.snapshot) return "invalid min_key.snapshot"; return bch2_bkey_ptrs_invalid(c, k); @@ -212,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, btree_node_type_is_extents(btree_id) && bkey_cmp(bp.v->min_key, POS_MIN)) bp.v->min_key = write - ? bkey_predecessor(bp.v->min_key) - : bkey_successor(bp.v->min_key); + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); } /* KEY_TYPE_extent: */ diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index c8069df..ccee43a 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_cmp(k->p, m->p) < 0; + int cmp2 = bkey_cmp(bkey_start_pos(k), + bkey_start_pos(m)) > 0; + + return (cmp1 << 1) + cmp2; +} + int bch2_cut_front_s(struct bpos, struct bkey_s); int bch2_cut_back_s(struct bpos, struct bkey_s); diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 503ce19..83c2168 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, { struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL; + struct btree_iter *inode_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); - u64 now = bch2_current_time(trans->c); + u64 now = bch2_current_time(c); + u64 dir_offset = 0; int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); @@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, new_inode); + inode_iter = bch2_inode_create(trans, new_inode); + ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) goto err; @@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(new_inode->bi_mode), name, new_inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; } + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, inode_iter, new_inode); err: + bch2_trans_iter_put(trans, inode_iter); bch2_trans_iter_put(trans, dir_iter); return ret; } @@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name) { + struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; - u64 now = bch2_current_time(trans->c); + u64 now = bch2_current_time(c); + u64 dir_offset = 0; int ret; inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); @@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); + inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ret = PTR_ERR_OR_ZERO(dir_iter); if (ret) @@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans 
*trans, u64 dir_inum, dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_hash = bch2_hash_info_init(trans->c, dir_u); + dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(inode_u->bi_mode), - name, inum, BCH_HASH_SET_MUST_CREATE) ?: - bch2_inode_write(trans, dir_iter, dir_u) ?: + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + inode_u->bi_dir = dir_inum; + inode_u->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); err: bch2_trans_iter_put(trans, dir_iter); @@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, const struct qstr *name) { + struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(trans->c); + u64 inum, now = bch2_current_time(c); struct bkey_s_c k; int ret; @@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - dir_hash = bch2_hash_info_init(trans->c, dir_u); + dir_hash = bch2_hash_info_init(c, dir_u); dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, name, BTREE_ITER_INTENT); @@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans, const struct qstr *dst_name, enum bch_rename_mode mode) { + struct bch_fs *c = trans->c; struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + u64 src_inode, src_offset, dst_inode, dst_offset; + u64 now = bch2_current_time(c); int ret; src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, @@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - src_hash = bch2_hash_info_init(trans->c, src_dir_u); + src_hash = bch2_hash_info_init(c, src_dir_u); if (dst_dir != src_dir) { dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, @@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + dst_hash = bch2_hash_info_init(c, dst_dir_u); } else { dst_dir_u = src_dir_u; dst_hash = src_hash; @@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, - dst_name, &dst_inode, + src_name, &src_inode, &src_offset, + dst_name, &dst_inode, &dst_offset, mode); if (ret) goto err; @@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; + + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } + } + if (mode == BCH_RENAME_OVERWRITE) { if (S_ISDIR(src_inode_u->bi_mode) != S_ISDIR(dst_inode_u->bi_mode)) { diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 9dc162f..62788ae 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -675,6 +675,39 @@ retry: continue; } + if (!target.bi_nlink && + !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + 
(target.bi_dir != k.k->p.inode || + target.bi_dir_offset != k.k->p.offset) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset) || + c->opts.version_upgrade)) { + struct bkey_inode_buf p; + + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + bch2_trans_unlock(&trans); + + bch2_inode_pack(c, &p, &target); + + ret = bch2_btree_insert(c, BTREE_ID_inodes, + &p.inode.k_i, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error in fsck: error %i updating inode", ret); + goto err; + } + continue; + } + if (fsck_err_on(have_target && d.v->d_type != mode_to_type(target.bi_mode), c, @@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (!S_ISDIR(u.bi_mode) && + u.bi_nlink && + !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || + c->opts.version_upgrade)) { + u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + do_update = true; + } + if (do_update) { struct bkey_inode_buf p; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 4559e77..f1665ca 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(trans->c, inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -469,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u) +struct btree_iter *bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u) { struct bch_fs *c = trans->c; - struct bkey_inode_buf *inode_p; struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, *hint; @@ -493,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans, if (start >= max || start < min) start = min; - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); again: for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { @@ -520,7 +516,7 @@ again: bch2_trans_iter_put(trans, iter); if (ret) - return ret; + return ERR_PTR(ret); if (start != min) { /* Retry from start */ @@ -528,15 +524,12 @@ again: goto again; } - return -ENOSPC; + return ERR_PTR(-ENOSPC); found_slot: *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - - ret = bch2_inode_write(trans, iter, inode_u); - bch2_trans_iter_put(trans, iter); - return ret; + return iter; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 1caf036..6bad6df 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); +struct btree_iter *bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64, bool); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 284d398..36b10cb 
100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -322,6 +322,9 @@ int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta || new_i_size) { bch2_inode_pack(trans->c, &inode_p, &inode_u); + + inode_p.inode.k.p.snapshot = iter->snapshot; + bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } @@ -437,6 +440,8 @@ int bch2_write_index_default(struct bch_write_op *op) k = bch2_keylist_front(keys); + k->k.p.snapshot = iter->snapshot; + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 1f26139..69c553a 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -914,14 +914,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) if (dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; + /* 1/128th of the device by default: */ + nr = ca->mi.nbuckets >> 7; + /* - * clamp journal size to 1024 buckets or 512MB (in sectors), whichever + * clamp journal size to 8192 buckets or 8GB (in sectors), whichever * is smaller: */ - nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, + nr = clamp_t(unsigned, nr, BCH_JOURNAL_BUCKETS_MIN, - min(1 << 10, - (1 << 20) / ca->mi.bucket_size)); + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 54f2e20..c7fa03c 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1452,7 +1452,7 @@ void bch2_journal_write(struct closure *cl) if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change) + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) validate_before_checksum = true; if (validate_before_checksum && diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index bbf8e5a..4a5b50e 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, - c->btree_key_cache.nr_dirty, - c->btree_key_cache.nr_keys); + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index f863fd7..3d1bf87 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id l_btree_id, { return (cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level) ?: - bkey_cmp(l_pos, r->k->k.p)); + bpos_cmp(l_pos, r->k->k.p)); } static int journal_key_cmp(struct journal_key *l, struct journal_key *r) { return (cmp_int(l->btree_id, r->btree_id) ?: cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p)); + bpos_cmp(l->k->k.p, r->k->k.p)); } static size_t journal_key_search(struct journal_keys *journal_keys, @@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign if (iter->idx > idx || (iter->idx == idx && biter->last && - bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) + bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) iter->idx++; } @@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); if (btree_k.k && journal_k.k) { - int cmp = 
bkey_cmp(btree_k.k->p, journal_k.k->p); + int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p); if (!cmp) bch2_journal_iter_advance_btree(iter); @@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * ret = iter->last == journal ? journal_k : btree_k; if (iter->b && - bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { + bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) { iter->journal.idx = iter->journal.keys->nr; iter->last = none; return bkey_s_c_null; @@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) return cmp_int(l->btree_id, r->btree_id) ?: cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p) ?: + bpos_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) while (src + 1 < keys.d + keys.nr && src[0].btree_id == src[1].btree_id && src[0].level == src[1].level && - !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) src++; *dst++ = *src++; @@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(r->level, l->level) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->k->k.p, r->k->k.p); + bpos_cmp(l->k->k.p, r->k->k.p); } static int bch2_journal_replay(struct bch_fs *c, @@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } + if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { + bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); + ret = -EINVAL; + goto err; + + } + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; @@ -1338,6 +1345,7 @@ int bch2_fs_initialize(struct bch_fs *c) S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; bch2_inode_pack(c, &packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_inodes, diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index dfb12fd..7507b6b 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr) goto err; } err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) goto err; } err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -113,7 +115,7 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_s_c k; u64 i; int ret = 0; @@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) BUG_ON(i); err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -166,7 +169,7 @@ err: static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_s_c k; u64 i; int ret = 0; @@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) BUG_ON(i); err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != i); i += 2; } - 
bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); BUG_ON(i != nr * 2); @@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } + bch2_trans_iter_put(&trans, iter); err: bch2_trans_exit(&trans); return ret; @@ -318,7 +323,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k->size != 8); i += 16; } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); BUG_ON(i != nr); @@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } + bch2_trans_iter_put(&trans, iter); err: bch2_trans_exit(&trans); return 0; @@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(iter); BUG_ON(k.k); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return 0; } @@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(iter); BUG_ON(k.k); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return 0; } @@ -473,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; ret = __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); @@ -508,7 +519,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -549,7 +560,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -630,6 +641,8 @@ static int seq_insert(struct bch_fs *c, u64 nr) if (++i == nr) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } @@ -645,6 +658,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr) for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ; + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } @@ -671,6 +686,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) break; } } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } diff --git a/linux/rhashtable.c b/linux/rhashtable.c index 351eac7..ba2196f 100644 --- a/linux/rhashtable.c +++ b/linux/rhashtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * @@ -8,27 +9,29 @@ * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include -#include #include #include #include +#include #include #include #include +#include #include #include #include +#include #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U -#define BUCKET_LOCKS_PER_CPU 32UL + +union nested_table { + union nested_table __rcu *table; + struct rhash_lock_head __rcu *bucket; +}; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, @@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht, return rht_head_hashfn(ht, tbl, he, ht->p); } -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, - gfp_t gfp) -{ - unsigned int i, size; - unsigned int nr_pcpus = num_possible_cpus(); +#ifdef CONFIG_PROVE_LOCKING +#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) - nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); - size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); +int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); - /* Never allocate more than 0.5 locks per bucket */ - size = min_t(unsigned int, size, tbl->size >> 1); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + if (!debug_locks) + return 1; + if (unlikely(tbl->nest)) + return 1; + return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); +#else +#define ASSERT_RHT_MUTEX(HT) +#endif - if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; +static inline union nested_table *nested_table_top( + const struct bucket_table *tbl) +{ + /* The top-level bucket entry does not need RCU protection + * because it's set at the same time as tbl->nest. 
+ */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} - if (!tbl->locks) - tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - gfp); - if (!tbl->locks) - return -ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&tbl->locks[i]); +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_protected(ntbl->table, 1); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); } - tbl->locks_mask = size - 1; - return 0; + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = nested_table_top(tbl); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); } static void bucket_table_free(struct bucket_table *tbl) { - if (tbl) - kvfree(tbl->locks); + if (tbl->nest) + nested_bucket_table_free(tbl); kvfree(tbl); } @@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + bool leaf) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + } + + if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) + return ntbl; + /* Raced with another thread. 
*/ + kfree(ntbl); + return rcu_dereference(*prev); +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + false)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t size; int i; - size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || - gfp != GFP_KERNEL) - tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL && gfp == GFP_KERNEL) - tbl = vzalloc(size); - if (tbl == NULL) - return NULL; + tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); - tbl->size = nbuckets; + size = nbuckets; - if (alloc_bucket_locks(ht, tbl, gfp) < 0) { - bucket_table_free(tbl); - return NULL; + if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; } + if (tbl == NULL) + return NULL; + + tbl->size = size; + + rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) - INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } @@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, return new_tbl; } -static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) +static int rhashtable_rehash_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct bucket_table *new_tbl = rhashtable_last_table(ht, - rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; - int err = -ENOENT; + struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); + int err = -EAGAIN; struct rhash_head *head, *next, *entry; - spinlock_t *new_bucket_lock; + struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; - rht_for_each(entry, old_tbl, old_hash) { + if (new_tbl->nest) + goto out; + + err = -ENOENT; + + rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), + old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) new_hash = head_hashfn(ht, new_tbl, entry); - new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + rht_lock(new_tbl, &new_tbl->buckets[new_hash]); - spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); - head = rht_dereference_bucket(new_tbl->buckets[new_hash], - new_tbl, new_hash); + head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); - rcu_assign_pointer(new_tbl->buckets[new_hash], entry); - spin_unlock(new_bucket_lock); + rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry); - rcu_assign_pointer(*pprev, next); + if (pprev) + 
rcu_assign_pointer(*pprev, next); + else + /* Need to preserved the bit lock. */ + rht_assign_locked(bkt, next); out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, +static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - spinlock_t *old_bucket_lock; + struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); + int err; - old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); + if (!bkt) + return 0; + rht_lock(old_tbl, bkt); - spin_lock_bh(old_bucket_lock); - while (!rhashtable_rehash_one(ht, old_hash)) + while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; - old_tbl->rehash++; - spin_unlock_bh(old_bucket_lock); + + if (err == -ENOENT) + err = 0; + rht_unlock(old_tbl, bkt); + + return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { - /* Protect future_tbl using the first bucket lock. */ - spin_lock_bh(old_tbl->locks); - - /* Did somebody beat us to it? */ - if (rcu_access_pointer(old_tbl->future_tbl)) { - spin_unlock_bh(old_tbl->locks); - return -EEXIST; - } - /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. + * As cmpxchg() provides strong barriers, we do not need + * rcu_assign_pointer(). */ - rcu_assign_pointer(old_tbl->future_tbl, new_tbl); - spin_unlock_bh(old_tbl->locks); + if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, + new_tbl) != NULL) + return -EEXIST; return 0; } @@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; + int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) - rhashtable_rehash_chain(ht, old_hash); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + cond_resched(); + } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht) spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; - spin_unlock(&ht->lock); /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. + * We do this inside the locked region so that + * rhashtable_walk_stop() can use rcu_head_after_call_rcu() + * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? 
-EAGAIN : 0; } -static int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; int err; - old_tbl = rhashtable_last_table(ht, old_tbl); + ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht) return err; } +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; - int err; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) - return -ENOMEM; - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) - bucket_table_free(new_tbl); - - return err; + return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) @@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - rhashtable_expand(ht); + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - rhashtable_shrink(ht); + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; - err = rhashtable_rehash_table(ht); + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } mutex_unlock(&ht->mutex); @@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht, err = -ENOMEM; - new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; @@ -363,7 +470,7 @@ int 
rhashtable_insert_rehash(struct rhashtable *ht, fail: /* Do not fail the insert if someone else did a rehash. */ - if (likely(rcu_dereference_raw(tbl->future_tbl))) + if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ @@ -373,57 +480,485 @@ fail: return err; } -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; + + elasticity = RHT_ELASTICITY; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + if (!ht->rhlist) + return rht_obj(ht, head); - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) + rcu_assign_pointer(*pprev, obj); + else + /* Need to preserve the bit lock */ + rht_assign_locked(bkt, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); - err = 0; + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; - rcu_assign_pointer(tbl->buckets[hash], obj); + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + /* bkt is always the head of the list, so it holds + * the lock, which we need to preserve + */ + rht_assign_locked(bkt, obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; 
+ struct bucket_table *tbl; + struct rhash_lock_head __rcu **bkt; + unsigned int hash; + void *data; + + new_tbl = rcu_dereference(ht->tbl); + + do { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + if (rcu_access_pointer(tbl->future_tbl)) + /* Failure is OK */ + bkt = rht_bucket_var(tbl, hash); + else + bkt = rht_bucket_insert(ht, tbl, hash); + if (bkt == NULL) { + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + data = ERR_PTR(-EAGAIN); + } else { + rht_lock(tbl, bkt); + data = rhashtable_lookup_one(ht, bkt, tbl, + hash, key, obj); + new_tbl = rhashtable_insert_one(ht, bkt, tbl, + hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + rht_unlock(tbl, bkt); + } + } while (!IS_ERR_OR_NULL(new_tbl)); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); - if (err == 0) +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_enter. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + * + * rhashtable_walk_start is defined as an inline variant that returns + * void. This is preferred in cases where the caller would ignore + * resize events and always continue. 
+ */ +int rhashtable_walk_start_check(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct rhash_head *p; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct rhash_head *p; + struct rhlist_head *list; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + iter->skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); + +/** + * __rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. + */ +static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) return NULL; - else if (err == -EAGAIN) - return tbl; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. 
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_peek(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + + if (p) + return rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_peek); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) + /* This bucket table is being freed, don't re-link it. */ + iter->walker.tbl = NULL; else - return ERR_PTR(err); + list_add(&iter->walker.list, &tbl->walkers); + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { - return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - (unsigned long)params->min_size); + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) @@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) return jhash2(key, length, seed); } +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. 
A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; - size = HASH_DEFAULT_SIZE; - if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; - if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) - return -EINVAL; - memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); @@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht, if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); - if (params->max_size) - ht->p.max_size = rounddown_pow_of_two(params->max_size); + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; - if (params->insecure_max_entries) - ht->p.insecure_max_entries = - rounddown_pow_of_two(params->insecure_max_entries); - else - ht->p.insecure_max_entries = ht->p.max_size * 2; - - ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } - if (params->nelem_hint) - size = rounded_hashtable_size(&ht->p); - - /* The maximum (not average) chain length grows with the - * size of the hash table, at a rate of (log N)/(log log N). - * The value of 16 is selected so that even if the hash - * table grew to 2^32 you would not expect the maximum - * chain length to exceed it unless we are under attack - * (or extremely unlucky). - * - * As this limit is only to detect attacks, we don't need - * to set it to a lower value as you'd need the chain - * length to vastly exceed 16 to have any real effect - * on the system. - */ - if (!params->insecure_elasticity) - ht->elasticity = 16; + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); - if (params->locks_mul) - ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); - else - ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { @@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht, } } + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. 
+ */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (tbl == NULL) - return -ENOMEM; + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } atomic_set(&ht->nelems, 0); @@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht, return 0; } +EXPORT_SYMBOL_GPL(rhashtable_init); -void rhashtable_destroy(struct rhashtable *ht) +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) { - struct bucket_table *tbl; + int err; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** + * rhashtable_free_and_destroy - free elements and destroy hash table + * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. + */ +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct bucket_table *tbl, *next_tbl; + unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); +restart: + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + cond_resched(); + for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? 
+ rht_dereference(pos->next, ht) : NULL) + rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); + if (next_tbl) { + tbl = next_tbl; + goto restart; + } mutex_unlock(&ht->mutex); } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return NULL; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(__rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + static struct rhash_lock_head __rcu *rhnull; + + if (!rhnull) + INIT_RHT_NULLS_HEAD(rhnull); + return __rht_bucket_nested(tbl, hash) ?: &rhnull; +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + hash >>= tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/linux/six.c b/linux/six.c index 5328004..fe72189 100644 --- a/linux/six.c +++ b/linux/six.c @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef DEBUG #define EBUG_ON(cond) BUG_ON(cond) @@ -309,6 +310,9 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, wake_up_process(p); } + if (ret) + six_acquire(&lock->dep_map, 1); + return ret; } @@ -560,6 +564,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) lock->readers) { smp_mb(); /* unlock barrier */ this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ state.v = READ_ONCE(lock->state.v); } else { EBUG_ON(!(lock->state.v & l[type].held_mask)); @@ -705,6 +710,34 @@ void six_lock_wakeup_all(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); +struct free_pcpu_rcu { + struct rcu_head rcu; + void __percpu *p; +}; + +static void free_pcpu_rcu_fn(struct rcu_head *_rcu) +{ + struct free_pcpu_rcu *rcu = + container_of(_rcu, struct free_pcpu_rcu, rcu); + + free_percpu(rcu->p); + kfree(rcu); +} + +void six_lock_pcpu_free_rcu(struct six_lock *lock) +{ + struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); + + if (!rcu) + return; + + rcu->p = lock->readers; + lock->readers = 
NULL; + + call_rcu(&rcu->rcu, free_pcpu_rcu_fn); +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); + void six_lock_pcpu_free(struct six_lock *lock) { BUG_ON(lock->readers && pcpu_read_count(lock)); @@ -717,8 +750,6 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_free); void six_lock_pcpu_alloc(struct six_lock *lock) { - BUG_ON(lock->readers && pcpu_read_count(lock)); - BUG_ON(lock->state.read_lock); #ifdef __KERNEL__ if (!lock->readers) lock->readers = alloc_percpu(unsigned); -- 2.39.2
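
A minimal usage sketch of the rhashtable walk API whose implementation is imported above (rhashtable_walk_enter/start/next/stop/exit), combined with the fixed-length-key setup described in the rhashtable_init() kernel-doc. This is illustration only, not part of the commit or of the upstream sources: the names walk_demo_obj, demo_params, demo_insert and walk_demo are hypothetical. The important point the sketch shows is that rhashtable_walk_next() can return ERR_PTR(-EAGAIN) when a resize rewinds the iterator, so walkers must check for error pointers rather than assuming a plain object or NULL.

#include <linux/err.h>
#include <linux/rhashtable.h>

/* Hypothetical object keyed by a fixed-length int, mirroring
 * "Configuration Example 1" in the rhashtable_init() kernel-doc above. */
struct walk_demo_obj {
	int			key;
	struct rhash_head	node;
};

static const struct rhashtable_params demo_params = {
	.head_offset	= offsetof(struct walk_demo_obj, node),
	.key_offset	= offsetof(struct walk_demo_obj, key),
	.key_len	= sizeof(int),
};

static int demo_insert(struct rhashtable *ht, struct walk_demo_obj *obj)
{
	/* params is passed by value; returns 0 or a negative error
	 * such as -EEXIST. */
	return rhashtable_insert_fast(ht, &obj->node, demo_params);
}

static void walk_demo(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct walk_demo_obj *obj;

	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj)) {
			/* A resize rewound the iterator; entries already
			 * seen may come around again. */
			if (PTR_ERR(obj) == -EAGAIN)
				continue;
			break;
		}
		/* use obj->key here; do not block while the walk is
		 * started, since RCU is held until rhashtable_walk_stop. */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
}

Because the iterator may rewind on resize and may revisit objects across rhashtable_walk_stop()/rhashtable_walk_start(), callers that need each object exactly once have to track what they have seen outside the table, as the kernel-doc imported above already notes.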