git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 18686af684 bcachefs: Inode backpointers
author     Kent Overstreet <kent.overstreet@gmail.com>
           Sun, 28 Mar 2021 21:38:28 +0000 (17:38 -0400)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 29 Mar 2021 04:22:38 +0000 (00:22 -0400)
51 files changed:
.bcachefs_revision
Makefile
cmd_debug.c
include/linux/list_nulls.h [new file with mode: 0644]
include/linux/overflow.h [new file with mode: 0644]
include/linux/poison.h [new file with mode: 0644]
include/linux/random.h
include/linux/rcupdate.h
include/linux/rhashtable-types.h [new file with mode: 0644]
include/linux/rhashtable.h
include/linux/six.h
include/linux/slab.h
include/linux/types.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_sort.c
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/ec.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-common.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/recovery.c
libbcachefs/tests.c
linux/rhashtable.c
linux/six.c

index 976139a36bdd3f422a9af7f0039c7d38b2600771..385c19f6011e37294a39d6e408fbaff01de1ad6a 100644 (file)
@@ -1 +1 @@
-ad68801b939cdda0530f54cd07b3212e98fe1d75
+18686af68412ebfad9c2adc6ee976ffdb9e1b886
index 6999b93ac373f7d4b631829a3cc3ebfd4ed989d0..3fe9604896513eed03b429965cca1bf4202f1c23 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -156,6 +156,10 @@ update-bcachefs-sources:
        git add linux/six.c
        cp $(LINUX_DIR)/include/linux/six.h include/linux/
        git add include/linux/six.h
+       cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
+       git add include/linux/list_nulls.h
+       cp $(LINUX_DIR)/include/linux/poison.h include/linux/
+       git add include/linux/poison.h
        $(RM) libbcachefs/*.mod.c
        git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
        git add .bcachefs_revision
index 3baa69788e2dab0bb233d3f85cb09eb70bf96da5..4938ec07e3f49f409bd372af88834f91720ea7ab 100644 (file)
@@ -323,9 +323,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
                        le64_to_cpu(i->journal_seq));
                offset += sectors;
 
-               for (k = i->start;
-                    k != vstruct_last(i);
-                    k = bkey_next_skip_noops(k, vstruct_last(i))) {
+               for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
                        struct bkey u;
                        char buf[4096];
 
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644 (file)
index 0000000..fa6e847
--- /dev/null
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/poison.h>
+#include <linux/const.h>
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr' :
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+       struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+       struct hlist_nulls_node *next, **pprev;
+};
+#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+       ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+       ({ typeof(ptr) ____ptr = (ptr); \
+          !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+       })
+/**
+ * is_a_nulls - Test if a ptr is a nulls marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+       return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+       return ((unsigned long)ptr) >> 1;
+}
+
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+       return !h->pprev;
+}
+
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.  Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+       return !READ_ONCE(h->pprev);
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+       return is_a_nulls(READ_ONCE(h->first));
+}
+
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+                                       struct hlist_nulls_head *h)
+{
+       struct hlist_nulls_node *first = h->first;
+
+       n->next = first;
+       WRITE_ONCE(n->pprev, &h->first);
+       h->first = n;
+       if (!is_a_nulls(first))
+               WRITE_ONCE(first->pprev, &n->next);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+       struct hlist_nulls_node *next = n->next;
+       struct hlist_nulls_node **pprev = n->pprev;
+
+       WRITE_ONCE(*pprev, next);
+       if (!is_a_nulls(next))
+               WRITE_ONCE(next->pprev, pprev);
+}
+
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+       __hlist_nulls_del(n);
+       WRITE_ONCE(n->pprev, LIST_POISON2);
+}
+
+/**
+ * hlist_nulls_for_each_entry  - iterate over list of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member)                   \
+       for (pos = (head)->first;                                              \
+            (!is_a_nulls(pos)) &&                                             \
+               ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @member:    the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member)     \
+       for (; (!is_a_nulls(pos)) &&                            \
+               ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+#endif
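
Not part of the commit: a minimal sketch of how the nulls-list API above is typically used. struct item, item_demo() and the nulls value 1 are made up for illustration.

#include <linux/list_nulls.h>

struct item {
	int				key;
	struct hlist_nulls_node		node;
};

static struct hlist_nulls_head bucket;

static void item_demo(struct item *it)
{
	struct hlist_nulls_node *pos;
	struct item *cur;

	INIT_HLIST_NULLS_HEAD(&bucket, 1);	/* end-of-list marker carries the value 1 */
	hlist_nulls_add_head(&it->node, &bucket);

	hlist_nulls_for_each_entry(cur, pos, &bucket, node)
		if (cur->key == it->key)
			break;			/* found before hitting the nulls marker */

	if (is_a_nulls(pos))
		/* not found: pos is the marker, and get_nulls_value(pos) == 1 here */;
}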
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
new file mode 100644 (file)
index 0000000..ef74051
--- /dev/null
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define is_signed_type(type)       (((type)(-1)) < (type)1)
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
+
+/*
+ * Avoids triggering -Wtype-limits compilation warning,
+ * while using unsigned data types to check a < 0.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+       return unlikely(overflow);
+}
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({   \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       __builtin_add_overflow(__a, __b, __d);  \
+}))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow(({   \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       __builtin_sub_overflow(__a, __b, __d);  \
+}))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow(({   \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       __builtin_mul_overflow(__a, __b, __d);  \
+}))
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({    \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = __a + __b;                       \
+       *__d < __a;                             \
+})
+#define __unsigned_sub_overflow(a, b, d) ({    \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = __a - __b;                       \
+       __a < __b;                              \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({            \
+       typeof(a) __a = (a);                            \
+       typeof(b) __b = (b);                            \
+       typeof(d) __d = (d);                            \
+       (void) (&__a == &__b);                          \
+       (void) (&__a == __d);                           \
+       *__d = __a * __b;                               \
+       __builtin_constant_p(__b) ?                     \
+         __b > 0 && __a > type_max(typeof(__a)) / __b : \
+         __a > 0 && __b > type_max(typeof(__b)) / __a;  \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({      \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = (u64)__a + (u64)__b;             \
+       (((~(__a ^ __b)) & (*__d ^ __a))        \
+               & type_min(typeof(__a))) != 0;  \
+})
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({      \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = (u64)__a - (u64)__b;             \
+       ((((__a ^ __b)) & (*__d ^ __a))         \
+               & type_min(typeof(__a))) != 0;  \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a)) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({                              \
+       typeof(a) __a = (a);                                            \
+       typeof(b) __b = (b);                                            \
+       typeof(d) __d = (d);                                            \
+       typeof(a) __tmax = type_max(typeof(a));                         \
+       typeof(a) __tmin = type_min(typeof(a));                         \
+       (void) (&__a == &__b);                                          \
+       (void) (&__a == __d);                                           \
+       *__d = (u64)__a * (u64)__b;                                     \
+       (__b > 0   && (__a > __tmax/__b || __a < __tmin/__b)) ||        \
+       (__b < (typeof(__b))-1  && (__a > __tmin/__b || __a < __tmax/__b)) || \
+       (__b == (typeof(__b))-1 && __a == __tmin);                      \
+})
+
+
+#define check_add_overflow(a, b, d)    __must_check_overflow(          \
+       __builtin_choose_expr(is_signed_type(typeof(a)),                \
+                       __signed_add_overflow(a, b, d),                 \
+                       __unsigned_add_overflow(a, b, d)))
+
+#define check_sub_overflow(a, b, d)    __must_check_overflow(          \
+       __builtin_choose_expr(is_signed_type(typeof(a)),                \
+                       __signed_sub_overflow(a, b, d),                 \
+                       __unsigned_sub_overflow(a, b, d)))
+
+#define check_mul_overflow(a, b, d)    __must_check_overflow(          \
+       __builtin_choose_expr(is_signed_type(typeof(a)),                \
+                       __signed_mul_overflow(a, b, d),                 \
+                       __unsigned_mul_overflow(a, b, d)))
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
+
+/** check_shl_overflow() - Calculate a left-shifted value and check overflow
+ *
+ * @a: Value to be shifted
+ * @s: How many bits left to shift
+ * @d: Pointer to where to store the result
+ *
+ * Computes *@d = (@a << @s)
+ *
+ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * make sense. Example conditions:
+ * - 'a << s' causes bits to be lost when stored in *d.
+ * - 's' is garbage (e.g. negative) or so large that the result of
+ *   'a << s' is guaranteed to be 0.
+ * - 'a' is negative.
+ * - 'a << s' sets the sign bit, if any, in '*d'.
+ *
+ * '*d' will hold the results of the attempted shift, but is not
+ * considered "safe for use" if false is returned.
+ */
+#define check_shl_overflow(a, s, d) __must_check_overflow(({           \
+       typeof(a) _a = a;                                               \
+       typeof(s) _s = s;                                               \
+       typeof(d) _d = d;                                               \
+       u64 _a_full = _a;                                               \
+       unsigned int _to_shift =                                        \
+               is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0;    \
+       *_d = (_a_full << _to_shift);                                   \
+       (_to_shift != _s || is_negative(*_d) || is_negative(_a) ||      \
+       (*_d >> _to_shift) != _a);                                      \
+}))
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+       size_t bytes;
+
+       if (check_mul_overflow(a, b, &bytes))
+               return SIZE_MAX;
+
+       return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+       size_t bytes;
+
+       if (check_mul_overflow(a, b, &bytes))
+               return SIZE_MAX;
+       if (check_mul_overflow(bytes, c, &bytes))
+               return SIZE_MAX;
+
+       return bytes;
+}
+
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
+{
+       size_t bytes;
+
+       if (check_mul_overflow(a, b, &bytes))
+               return SIZE_MAX;
+       if (check_add_overflow(bytes, c, &bytes))
+               return SIZE_MAX;
+
+       return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @count number of @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, count)                                  \
+       __ab_c_size(count,                                              \
+                   sizeof(*(p)->member) + __must_be_array((p)->member),\
+                   sizeof(*(p)))
+
+/**
+ * flex_array_size() - Calculate size of a flexible array member
+ *                     within an enclosing structure.
+ *
+ * @p: Pointer to the structure.
+ * @member: Name of the flexible array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of a flexible array of @count number of @member
+ * elements, at the end of structure @p.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define flex_array_size(p, member, count)                              \
+       array_size(count,                                               \
+                   sizeof(*(p)->member) + __must_be_array((p)->member))
+
+#endif /* __LINUX_OVERFLOW_H */
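
Not part of the commit: a rough sketch of how the helpers above are meant to be used when sizing a trailing-array allocation. struct pkt and pkt_alloc() are invented, and kmalloc()/GFP_KERNEL stand in for whatever allocator the caller actually uses.

#include <linux/overflow.h>
#include <linux/slab.h>

struct pkt {
	u16	len;
	u8	data[];			/* flexible array member */
};

static struct pkt *pkt_alloc(size_t nelems)
{
	struct pkt *p;
	size_t bytes;

	/* check_*_overflow() return true on overflow and are __must_check. */
	if (check_mul_overflow(nelems, sizeof(p->data[0]), &bytes) ||
	    check_add_overflow(bytes, sizeof(*p), &bytes))
		return NULL;

	/* Equivalently: kmalloc(struct_size(p, data, nelems), GFP_KERNEL),
	 * which collapses to SIZE_MAX on overflow so the allocation fails. */
	p = kmalloc(bytes, GFP_KERNEL);
	return p;
}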
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644 (file)
index 0000000..dc8ae5d
--- /dev/null
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
+#else
+# define POISON_POINTER_DELTA 0
+#endif
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x122 + POISON_POINTER_DELTA)
+
+/********** include/linux/timer.h **********/
+#define TIMER_ENTRY_STATIC     ((void *) 0x300 + POISON_POINTER_DELTA)
+
+/********** mm/page_poison.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
+#define PAGE_POISON 0xaa
+#endif
+
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING   ((void *) 0x400 + POISON_POINTER_DELTA)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define        RED_INACTIVE    0x09F911029D74E35BULL   /* when obj is inactive */
+#define        RED_ACTIVE      0xD84156C5635688C0ULL   /* when obj is active */
+
+#define SLUB_RED_INACTIVE      0xbb
+#define SLUB_RED_ACTIVE                0xcc
+
+/* ...and for poisoning */
+#define        POISON_INUSE    0x5a    /* for use-uninitialised poisoning */
+#define POISON_FREE    0x6b    /* for use-after-free poisoning */
+#define        POISON_END      0xa5    /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM    0xcc
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE                0x5b
+#define JBD2_POISON_FREE       0x5c
+
+/********** drivers/base/dmapool.c **********/
+#define        POOL_POISON_FREED       0xa7    /* !inuse */
+#define        POOL_POISON_ALLOCATED   0xa9    /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE                0x12
+#define ATM_POISON             0xdeadbeef
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT       0x11
+#define MUTEX_DEBUG_FREE       0x22
+#define MUTEX_POISON_WW_CTX    ((void *) 0x500 + POISON_POINTER_DELTA)
+
+/********** security/ **********/
+#define KEY_DESTROY            0xbd
+
+#endif
index c38ae46d021f3a90b95460bb4b1b4b8954fa051a..28c595a0c0cfe3429ff69c19fa83edb424d7d38c 100644 (file)
@@ -45,6 +45,7 @@ static inline type get_random_##type(void)            \
 
 get_random_type(int);
 get_random_type(long);
+get_random_type(u32);
 get_random_type(u64);
 
 #endif /* _LINUX_RANDOM_H */
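
Each get_random_type(T) invocation stamps out a get_random_T() helper (per the macro shown in the hunk header), so the new line makes get_random_u32() available alongside the existing ones. A rough illustration, not part of the commit:

static u32 pick_seed(void)
{
	return get_random_u32();	/* generated by get_random_type(u32) above */
}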
index c99d78a897ac8a03a29b831dbf2964484ee62177..ae292241c82c5f8ce361443d1f53a3e91e4737a0 100644 (file)
 
 #define RCU_INIT_POINTER(p, v)         WRITE_ONCE(p, v)
 
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/**
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure.  Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+       rhp->func = (void *)~0L;
+}
+
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp,
+                       void (*f)(struct rcu_head *head))
+{
+       void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func);
+
+       if (func == f)
+               return true;
+       return false;
+}
+
 #endif /* __TOOLS_LINUX_RCUPDATE_H */
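
Not part of the commit: a sketch of the rcu_head_init()/rcu_head_after_call_rcu() pairing described above, assuming call_rcu() records its callback in rhp->func as the kernel implementation does. struct obj and its helpers are invented, and the caller is assumed to serialize against other call_rcu()/teardown paths as the comment requires.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head	rcu;
};

static void obj_free_rcu(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct obj, rcu));
}

static struct obj *obj_alloc(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		rcu_head_init(&o->rcu);		/* must precede any call_rcu() */
	return o;
}

static void obj_release(struct obj *o)
{
	/* Only queue the callback once; assumes external serialization. */
	if (!rcu_head_after_call_rcu(&o->rcu, obj_free_rcu))
		call_rcu(&o->rcu, obj_free_rcu);
}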
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644 (file)
index 0000000..57467cb
--- /dev/null
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+       struct rhash_head __rcu         *next;
+};
+
+struct rhlist_head {
+       struct rhash_head               rhead;
+       struct rhlist_head __rcu        *next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+       struct rhashtable *ht;
+       const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+                              const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+       u16                     nelem_hint;
+       u16                     key_len;
+       u16                     key_offset;
+       u16                     head_offset;
+       unsigned int            max_size;
+       u16                     min_size;
+       bool                    automatic_shrinking;
+       rht_hashfn_t            hashfn;
+       rht_obj_hashfn_t        obj_hashfn;
+       rht_obj_cmpfn_t         obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+       struct bucket_table __rcu       *tbl;
+       unsigned int                    key_len;
+       unsigned int                    max_elems;
+       struct rhashtable_params        p;
+       bool                            rhlist;
+       struct work_struct              run_work;
+       struct mutex                    mutex;
+       spinlock_t                      lock;
+       atomic_t                        nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+       struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+       struct list_head list;
+       struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+       struct rhashtable *ht;
+       struct rhash_head *p;
+       struct rhlist_head *list;
+       struct rhashtable_walker walker;
+       unsigned int slot;
+       unsigned int skip;
+       bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+                   const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+                 const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
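
Not part of the commit: a sketch of how a caller might fill in struct rhashtable_params for an object keyed by an embedded 64-bit id. struct foo, foo_params and foo_table_init() are invented.

#include <linux/rhashtable.h>

struct foo {
	u64			id;
	struct rhash_head	hash;
};

static const struct rhashtable_params foo_params = {
	.head_offset		= offsetof(struct foo, hash),
	.key_offset		= offsetof(struct foo, id),
	.key_len		= sizeof(u64),
	.automatic_shrinking	= true,
};

static int foo_table_init(struct rhashtable *ht)
{
	return rhashtable_init(ht, &foo_params);	/* 0 on success */
}

Lookups would then go through rhashtable_lookup() (defined further down in rhashtable.h) under rcu_read_lock(), passing the same params structure.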
index 8dbe1533d54cb9cfa1523a93c09217c6d3df5de3..6cf8c2571160cc8019aead4af8b8dff5f2f11914 100644 (file)
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
  * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
 #ifndef _LINUX_RHASHTABLE_H
 #define _LINUX_RHASHTABLE_H
 
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/jhash.h>
-#include <linux/workqueue.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/list_nulls.h>
 #include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
 
-#define RHT_BASE_BITS          4
-#define RHT_HASH_BITS          27
-#define RHT_BASE_SHIFT         RHT_HASH_BITS
-#define RHT_HASH_RESERVED_SPACE        (RHT_BASE_BITS + 1)
+#define BIT(nr)                        (1UL << (nr))
 
-struct rhash_head {
-       struct rhash_head __rcu         *next;
-};
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into a hash chain from the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marker which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket.  This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain.  To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket.  This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful.  This ensures it isn't
+ * used by mistake without clearing the lock bit first.
+ */
+struct rhash_lock_head {};
 
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY 16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
 struct bucket_table {
        unsigned int            size;
-       unsigned int            rehash;
+       unsigned int            nest;
        u32                     hash_rnd;
-       unsigned int            locks_mask;
-       spinlock_t              *locks;
        struct list_head        walkers;
        struct rcu_head         rcu;
 
        struct bucket_table __rcu *future_tbl;
 
-       struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
-};
-
-struct rhashtable_compare_arg {
-       struct rhashtable *ht;
-       const void *key;
+       struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
-                              const void *obj);
-
-struct rhashtable_params {
-       size_t                  nelem_hint;
-       size_t                  key_len;
-       size_t                  key_offset;
-       size_t                  head_offset;
-       unsigned int            insecure_max_entries;
-       unsigned int            max_size;
-       unsigned int            min_size;
-       u32                     nulls_base;
-       bool                    insecure_elasticity;
-       bool                    automatic_shrinking;
-       size_t                  locks_mul;
-       rht_hashfn_t            hashfn;
-       rht_obj_hashfn_t        obj_hashfn;
-       rht_obj_cmpfn_t         obj_cmpfn;
-};
-
-struct rhashtable {
-       struct bucket_table __rcu       *tbl;
-       atomic_t                        nelems;
-       unsigned int                    key_len;
-       unsigned int                    elasticity;
-       struct rhashtable_params        p;
-       struct work_struct              run_work;
-       struct mutex                    mutex;
-       spinlock_t                      lock;
-};
-
-struct rhashtable_walker {
-       struct list_head list;
-       struct bucket_table *tbl;
-};
-
-#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
-
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
-       return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-       ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits most likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL.
+ */
+#define        RHT_NULLS_MARKER(ptr)   \
+       ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr)       \
+       ((ptr) = NULL)
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
@@ -118,37 +120,45 @@ static inline void *rht_obj(const struct rhashtable *ht,
 static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
                                            unsigned int hash)
 {
-       return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+       return hash & (tbl->size - 1);
 }
 
-static inline unsigned int rht_key_hashfn(
-       struct rhashtable *ht, const struct bucket_table *tbl,
-       const void *key, const struct rhashtable_params params)
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+       const void *key, const struct rhashtable_params params,
+       unsigned int hash_rnd)
 {
        unsigned int hash;
 
        /* params must be equal to ht->p if it isn't constant. */
        if (!__builtin_constant_p(params.key_len))
-               hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+               hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
        else if (params.key_len) {
                unsigned int key_len = params.key_len;
 
                if (params.hashfn)
-                       hash = params.hashfn(key, key_len, tbl->hash_rnd);
+                       hash = params.hashfn(key, key_len, hash_rnd);
                else if (key_len & (sizeof(u32) - 1))
-                       hash = jhash(key, key_len, tbl->hash_rnd);
+                       hash = jhash(key, key_len, hash_rnd);
                else
-                       hash = jhash2(key, key_len / sizeof(u32),
-                                     tbl->hash_rnd);
+                       hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
        } else {
                unsigned int key_len = ht->p.key_len;
 
                if (params.hashfn)
-                       hash = params.hashfn(key, key_len, tbl->hash_rnd);
+                       hash = params.hashfn(key, key_len, hash_rnd);
                else
-                       hash = jhash(key, key_len, tbl->hash_rnd);
+                       hash = jhash(key, key_len, hash_rnd);
        }
 
+       return hash;
+}
+
+static inline unsigned int rht_key_hashfn(
+       struct rhashtable *ht, const struct bucket_table *tbl,
+       const void *key, const struct rhashtable_params params)
+{
+       unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
+
        return rht_bucket_index(tbl, hash);
 }
 
@@ -165,6 +175,11 @@ static inline unsigned int rht_head_hashfn(
               rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
 }
 
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_grow_above_75(const struct rhashtable *ht,
                                     const struct bucket_table *tbl)
 {
@@ -173,6 +188,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht,
               (!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_shrink_below_30(const struct rhashtable *ht,
                                       const struct bucket_table *tbl)
 {
@@ -181,6 +201,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
               tbl->size > ht->p.min_size;
 }
 
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_grow_above_100(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
 {
@@ -188,62 +213,353 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
                (!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_grow_above_max(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
 {
-       return ht->p.insecure_max_entries &&
-              atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+       return atomic_read(&ht->nelems) >= ht->max_elems;
 }
 
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
-                                         unsigned int hash)
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
 {
-       return &tbl->locks[hash & tbl->locks_mask];
+       return 1;
 }
 
-int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *,
-                                           const void *,
-                                           struct rhash_head *,
-                                           struct bucket_table *);
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+                                            u32 hash)
+{
+       return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+                            struct rhash_head *obj);
 
-int rhashtable_init(struct rhashtable *, const struct rhashtable_params *);
-void rhashtable_destroy(struct rhashtable *);
+void rhashtable_walk_enter(struct rhashtable *ht,
+                          struct rhashtable_iter *iter);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
 
-#define rht_dereference(p, ht)                 rcu_dereference(p)
-#define rht_dereference_rcu(p, ht)             rcu_dereference(p)
-#define rht_dereference_bucket(p, tbl, hash)   rcu_dereference(p)
-#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p)
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+       (void)rhashtable_walk_start_check(iter);
+}
+
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
+
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+                                void (*free_fn)(void *ptr, void *arg),
+                                void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+       struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
+
+#define rht_dereference(p, ht) \
+       rcu_dereference(p)
+
+#define rht_dereference_rcu(p, ht) \
+       rcu_dereference(p)
+
+#define rht_dereference_bucket(p, tbl, hash) \
+       rcu_dereference(p)
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+       rcu_dereference(p)
 
 #define rht_entry(tpos, pos, member) \
        ({ tpos = container_of(pos, typeof(*tpos), member); 1; })
 
-#define rht_for_each_continue(pos, head, tbl, hash) \
-       for (pos = rht_dereference_bucket(head, tbl, hash); \
-            !rht_is_a_nulls(pos); \
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
+       const struct bucket_table *tbl, unsigned int hash)
+{
+       return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+                                    &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
+       struct bucket_table *tbl, unsigned int hash)
+{
+       return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
+                                    &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
+       struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+       return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+                                    &tbl->buckets[hash];
+}
+
+/*
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers.  The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer.  In that case
+ * we cannot get a lock.  For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there.  In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+                           struct rhash_lock_head __rcu **bkt)
+{
+       bit_spin_lock(0, (unsigned long *)bkt);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+                                  struct rhash_lock_head __rcu **bucket,
+                                  unsigned int subclass)
+{
+       bit_spin_lock(0, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+                             struct rhash_lock_head __rcu **bkt)
+{
+       bit_spin_unlock(0, (unsigned long *)bkt);
+}
+
+static inline struct rhash_head *__rht_ptr(
+       struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+{
+       return (struct rhash_head *)
+               ((unsigned long)p & ~BIT(0) ?:
+                (unsigned long)RHT_NULLS_MARKER(bkt));
+}
+
+/*
+ * Where 'bkt' is a bucket and might be locked:
+ *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ *   rht_ptr() dereferences in a context where the bucket is locked.
+ *   rht_ptr_exclusive() dereferences in a context where exclusive
+ *            access is guaranteed, such as when destroying the table.
+ */
+static inline struct rhash_head *rht_ptr_rcu(
+       struct rhash_lock_head __rcu *const *bkt)
+{
+       return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline struct rhash_head *rht_ptr(
+       struct rhash_lock_head __rcu *const *bkt,
+       struct bucket_table *tbl,
+       unsigned int hash)
+{
+       return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+       struct rhash_lock_head __rcu *const *bkt)
+{
+       return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+                                    struct rhash_head *obj)
+{
+       if (rht_is_a_nulls(obj))
+               obj = NULL;
+       rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+                                    struct rhash_lock_head __rcu **bkt,
+                                    struct rhash_head *obj)
+{
+       if (rht_is_a_nulls(obj))
+               obj = NULL;
+       rcu_assign_pointer(*bkt, (void *)obj);
+       preempt_enable();
+       __release(bitlock);
+}
+
+/**
+ * rht_for_each_from - iterate over hash chain from given head
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ */
+#define rht_for_each_from(pos, head, tbl, hash) \
+       for (pos = head;                        \
+            !rht_is_a_nulls(pos);              \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))
 
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ */
 #define rht_for_each(pos, tbl, hash) \
-       rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+       rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
+                         tbl, hash)
+
+/**
+ * rht_for_each_entry_from - iterate over hash chain from given head
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)    \
+       for (pos = head;                                                \
+            (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);    \
+            pos = rht_dereference_bucket((pos)->next, tbl, hash))
 
-#define rht_for_each_rcu_continue(pos, head, tbl, hash)                        \
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member)               \
+       rht_for_each_entry_from(tpos, pos,                              \
+                               rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+                               tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @next:      the &struct rhash_head to use as next in loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows for the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)          \
+       for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),                 \
+            next = !rht_is_a_nulls(pos) ?                                    \
+                      rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
+            (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);          \
+            pos = next,                                                      \
+            next = !rht_is_a_nulls(pos) ?                                    \
+                      rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_from(pos, head, tbl, hash)                    \
        for (({barrier(); }),                                           \
-            pos = rht_dereference_bucket_rcu(head, tbl, hash);         \
+            pos = head;                                                \
             !rht_is_a_nulls(pos);                                      \
             pos = rcu_dereference_raw(pos->next))
 
-#define rht_for_each_rcu(pos, tbl, hash)                               \
-       rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash)                       \
+       for (({barrier(); }),                                   \
+            pos = rht_ptr_rcu(rht_bucket(tbl, hash));          \
+            !rht_is_a_nulls(pos);                              \
+            pos = rcu_dereference_raw(pos->next))
 
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+/**
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
        for (({barrier(); }),                                               \
-            pos = rht_dereference_bucket_rcu(head, tbl, hash);             \
+            pos = head;                                                    \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);        \
             pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
 
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)           \
-       rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
-                                       tbl, hash, member)
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)              \
+       rht_for_each_entry_rcu_from(tpos, pos,                             \
+                                   rht_ptr_rcu(rht_bucket(tbl, hash)),    \
+                                   tbl, hash, member)
+
+/**
+ * rhl_for_each_rcu - iterate over rcu hash table list
+ * @pos:       the &struct rhlist_head to use as a loop cursor.
+ * @list:      the head of the list
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_rcu(pos, list)                                    \
+       for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhlist_head to use as a loop cursor.
+ * @list:      the head of the list
+ * @member:    name of the &struct rhlist_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_entry_rcu(tpos, pos, list, member)                        \
+       for (pos = list; pos && rht_entry(tpos, pos, member);           \
+            pos = rcu_dereference_raw(pos->next))
 
 static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
                                     const void *obj)
@@ -254,7 +570,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
        return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
 }
 
-static inline void *rhashtable_lookup_fast(
+/* Internal function, do not use. */
+static inline struct rhash_head *__rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
 {
@@ -262,23 +579,27 @@ static inline void *rhashtable_lookup_fast(
                .ht = ht,
                .key = key,
        };
-       const struct bucket_table *tbl;
+       struct rhash_lock_head __rcu *const *bkt;
+       struct bucket_table *tbl;
        struct rhash_head *he;
        unsigned int hash;
 
-       rcu_read_lock();
-
        tbl = rht_dereference_rcu(ht->tbl, ht);
 restart:
        hash = rht_key_hashfn(ht, tbl, key, params);
-       rht_for_each_rcu(he, tbl, hash) {
-               if (params.obj_cmpfn ?
-                   params.obj_cmpfn(&arg, rht_obj(ht, he)) :
-                   rhashtable_compare(&arg, rht_obj(ht, he)))
-                       continue;
-               rcu_read_unlock();
-               return rht_obj(ht, he);
-       }
+       bkt = rht_bucket(tbl, hash);
+       do {
+               rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+                       if (params.obj_cmpfn ?
+                           params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+                           rhashtable_compare(&arg, rht_obj(ht, he)))
+                               continue;
+                       return he;
+               }
+               /* An object might have been moved to a different hash chain,
+                * while we walk along it - better check and retry.
+                */
+       } while (he != RHT_NULLS_MARKER(bkt));
 
        /* Ensure we see any new tables. */
        smp_rmb();
@@ -286,149 +607,593 @@ restart:
        tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(tbl))
                goto restart;
-       rcu_read_unlock();
 
        return NULL;
 }
 
-static inline int __rhashtable_insert_fast(
-       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+/**
+ * rhashtable_lookup - search hash table
+ * @ht:                hash table
+ * @key:       the pointer to the key
+ * @params:    hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup(
+       struct rhashtable *ht, const void *key,
+       const struct rhashtable_params params)
+{
+       struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+
+       return he ? rht_obj(ht, he) : NULL;
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, without RCU read lock
+ * @ht:                hash table
+ * @key:       the pointer to the key
+ * @params:    hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Only use this function when you have other mechanisms guaranteeing
+ * that the object won't go away after the RCU read lock is released.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+       struct rhashtable *ht, const void *key,
+       const struct rhashtable_params params)
+{
+       void *obj;
+
+       rcu_read_lock();
+       obj = rhashtable_lookup(ht, key, params);
+       rcu_read_unlock();
+
+       return obj;
+}
+
+/**
+ * rhltable_lookup - search hash list table
+ * @hlt:       hash table
+ * @key:       the pointer to the key
+ * @params:    hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key.  All matching entries are returned
+ * in a list.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the list of entries that match the given key.
+ */
+static inline struct rhlist_head *rhltable_lookup(
+       struct rhltable *hlt, const void *key,
        const struct rhashtable_params params)
+{
+       struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+
+       return he ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
+
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in the hash table if there is a clash,
+ * otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params, bool rhlist)
 {
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
-       struct bucket_table *tbl, *new_tbl;
+       struct rhash_lock_head __rcu **bkt;
+       struct rhash_head __rcu **pprev;
+       struct bucket_table *tbl;
        struct rhash_head *head;
-       spinlock_t *lock;
-       unsigned int elasticity;
        unsigned int hash;
-       int err;
+       int elasticity;
+       void *data;
 
-restart:
        rcu_read_lock();
 
        tbl = rht_dereference_rcu(ht->tbl, ht);
+       hash = rht_head_hashfn(ht, tbl, obj, params);
+       elasticity = RHT_ELASTICITY;
+       bkt = rht_bucket_insert(ht, tbl, hash);
+       data = ERR_PTR(-ENOMEM);
+       if (!bkt)
+               goto out;
+       pprev = NULL;
+       rht_lock(tbl, bkt);
 
-       /* All insertions must grab the oldest table containing
-        * the hashed bucket that is yet to be rehashed.
-        */
-       for (;;) {
-               hash = rht_head_hashfn(ht, tbl, obj, params);
-               lock = rht_bucket_lock(tbl, hash);
-               spin_lock_bh(lock);
+       if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
+slow_path:
+               rht_unlock(tbl, bkt);
+               rcu_read_unlock();
+               return rhashtable_insert_slow(ht, key, obj);
+       }
 
-               if (tbl->rehash <= hash)
-                       break;
+       rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               struct rhlist_head *plist;
+               struct rhlist_head *list;
 
-               spin_unlock_bh(lock);
-               tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-       }
+               elasticity--;
+               if (!key ||
+                   (params.obj_cmpfn ?
+                    params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+                    rhashtable_compare(&arg, rht_obj(ht, head)))) {
+                       pprev = &head->next;
+                       continue;
+               }
 
-       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-       if (unlikely(new_tbl)) {
-               tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
-               if (!IS_ERR_OR_NULL(tbl))
-                       goto slow_path;
+               data = rht_obj(ht, head);
 
-               err = PTR_ERR(tbl);
-               goto out;
-       }
+               if (!rhlist)
+                       goto out_unlock;
 
-       err = -E2BIG;
-       if (unlikely(rht_grow_above_max(ht, tbl)))
-               goto out;
 
-       if (unlikely(rht_grow_above_100(ht, tbl))) {
-slow_path:
-               spin_unlock_bh(lock);
-               err = rhashtable_insert_rehash(ht, tbl);
-               rcu_read_unlock();
-               if (err)
-                       return err;
+               list = container_of(obj, struct rhlist_head, rhead);
+               plist = container_of(head, struct rhlist_head, rhead);
 
-               goto restart;
+               RCU_INIT_POINTER(list->next, plist);
+               head = rht_dereference_bucket(head->next, tbl, hash);
+               RCU_INIT_POINTER(list->rhead.next, head);
+               if (pprev) {
+                       rcu_assign_pointer(*pprev, obj);
+                       rht_unlock(tbl, bkt);
+               } else
+                       rht_assign_unlock(tbl, bkt, obj);
+               data = NULL;
+               goto out;
        }
 
-       err = -EEXIST;
-       elasticity = ht->elasticity;
-       rht_for_each(head, tbl, hash) {
-               if (key &&
-                   unlikely(!(params.obj_cmpfn ?
-                              params.obj_cmpfn(&arg, rht_obj(ht, head)) :
-                              rhashtable_compare(&arg, rht_obj(ht, head)))))
-                       goto out;
-               if (!--elasticity)
-                       goto slow_path;
-       }
+       if (elasticity <= 0)
+               goto slow_path;
+
+       data = ERR_PTR(-E2BIG);
+       if (unlikely(rht_grow_above_max(ht, tbl)))
+               goto out_unlock;
 
-       err = 0;
+       if (unlikely(rht_grow_above_100(ht, tbl)))
+               goto slow_path;
 
-       head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+       /* Inserting at head of list makes unlocking free. */
+       head = rht_ptr(bkt, tbl, hash);
 
        RCU_INIT_POINTER(obj->next, head);
+       if (rhlist) {
+               struct rhlist_head *list;
 
-       rcu_assign_pointer(tbl->buckets[hash], obj);
+               list = container_of(obj, struct rhlist_head, rhead);
+               RCU_INIT_POINTER(list->next, NULL);
+       }
 
        atomic_inc(&ht->nelems);
+       rht_assign_unlock(tbl, bkt, obj);
+
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);
 
+       data = NULL;
 out:
-       spin_unlock_bh(lock);
        rcu_read_unlock();
 
-       return err;
+       return data;
+
+out_unlock:
+       rht_unlock(tbl, bkt);
+       goto out;
 }
 
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhashtable_insert_fast(
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       void *ret;
+
+       ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhltable_insert_key - insert object into hash list table
+ * @hlt:       hash list table
+ * @key:       the pointer to the key
+ * @list:      pointer to hash list head inside object
+ * @params:    hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert_key(
+       struct rhltable *hlt, const void *key, struct rhlist_head *list,
+       const struct rhashtable_params params)
+{
+       return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
+                                               params, true));
+}
+
+/**
+ * rhltable_insert - insert object into hash list table
+ * @hlt:       hash list table
+ * @list:      pointer to hash list head inside object
+ * @params:    hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert(
+       struct rhltable *hlt, struct rhlist_head *list,
+       const struct rhashtable_params params)
+{
+       const char *key = rht_obj(&hlt->ht, &list->rhead);
+
+       key += params.key_offset;
+
+       return rhltable_insert_key(hlt, key, list, params);
+}
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * This lookup function may only be used for a fixed key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
 static inline int rhashtable_lookup_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
 {
        const char *key = rht_obj(ht, obj);
+       void *ret;
 
        BUG_ON(ht->p.obj_hashfn);
 
-       return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
-                                       params);
+       ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+                                      false);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
 }
 
-static inline int __rhashtable_remove_fast(
+/**
+ * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_fast(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_fast(
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       const char *key = rht_obj(ht, obj);
+
+       BUG_ON(ht->p.obj_hashfn);
+
+       return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+                                       false);
+}
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ *                               with explicit key
+ * @ht:                hash table
+ * @key:       key
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       void *ret;
+
+       BUG_ON(!ht->p.obj_hashfn || !key);
+
+       ret = __rhashtable_insert_fast(ht, key, obj, params, false);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht:                hash table
+ * @key:       key
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       BUG_ON(!ht->p.obj_hashfn || !key);
+
+       return __rhashtable_insert_fast(ht, key, obj, params, false);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast_one(
        struct rhashtable *ht, struct bucket_table *tbl,
-       struct rhash_head *obj, const struct rhashtable_params params)
+       struct rhash_head *obj, const struct rhashtable_params params,
+       bool rhlist)
 {
+       struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
-       spinlock_t * lock;
        unsigned int hash;
        int err = -ENOENT;
 
        hash = rht_head_hashfn(ht, tbl, obj, params);
-       lock = rht_bucket_lock(tbl, hash);
+       bkt = rht_bucket_var(tbl, hash);
+       if (!bkt)
+               return -ENOENT;
+       pprev = NULL;
+       rht_lock(tbl, bkt);
 
-       spin_lock_bh(lock);
+       rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               struct rhlist_head *list;
+
+               list = container_of(he, struct rhlist_head, rhead);
 
-       pprev = &tbl->buckets[hash];
-       rht_for_each(he, tbl, hash) {
                if (he != obj) {
+                       struct rhlist_head __rcu **lpprev;
+
                        pprev = &he->next;
-                       continue;
+
+                       if (!rhlist)
+                               continue;
+
+                       do {
+                               lpprev = &list->next;
+                               list = rht_dereference_bucket(list->next,
+                                                             tbl, hash);
+                       } while (list && obj != &list->rhead);
+
+                       if (!list)
+                               continue;
+
+                       list = rht_dereference_bucket(list->next, tbl, hash);
+                       RCU_INIT_POINTER(*lpprev, list);
+                       err = 0;
+                       break;
                }
 
-               rcu_assign_pointer(*pprev, obj->next);
+               obj = rht_dereference_bucket(obj->next, tbl, hash);
+               err = 1;
+
+               if (rhlist) {
+                       list = rht_dereference_bucket(list->next, tbl, hash);
+                       if (list) {
+                               RCU_INIT_POINTER(list->rhead.next, obj);
+                               obj = &list->rhead;
+                               err = 0;
+                       }
+               }
+
+               if (pprev) {
+                       rcu_assign_pointer(*pprev, obj);
+                       rht_unlock(tbl, bkt);
+               } else {
+                       rht_assign_unlock(tbl, bkt, obj);
+               }
+               goto unlocked;
+       }
+
+       rht_unlock(tbl, bkt);
+unlocked:
+       if (err > 0) {
+               atomic_dec(&ht->nelems);
+               if (unlikely(ht->p.automatic_shrinking &&
+                            rht_shrink_below_30(ht, tbl)))
+                       schedule_work(&ht->run_work);
                err = 0;
-               break;
        }
 
-       spin_unlock_bh(lock);
+       return err;
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params, bool rhlist)
+{
+       struct bucket_table *tbl;
+       int err;
+
+       rcu_read_lock();
+
+       tbl = rht_dereference_rcu(ht->tbl, ht);
+
+       /* Because we have already taken (and released) the bucket
+        * lock in old_tbl, if we find that future_tbl is not yet
+        * visible then that guarantees the entry to still be in
+        * the old tbl if it exists.
+        */
+       while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
+                                                  rhlist)) &&
+              (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+               ;
+
+       rcu_read_unlock();
 
        return err;
 }
 
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
 static inline int rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
+{
+       return __rhashtable_remove_fast(ht, obj, params, false);
+}
+
+/**
+ * rhltable_remove - remove object from hash list table
+ * @hlt:       hash list table
+ * @list:      pointer to hash list head inside object
+ * @params:    hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhltable_remove(
+       struct rhltable *hlt, struct rhlist_head *list,
+       const struct rhashtable_params params)
+{
+       return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
+}
+
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+       struct rhashtable *ht, struct bucket_table *tbl,
+       struct rhash_head *obj_old, struct rhash_head *obj_new,
+       const struct rhashtable_params params)
+{
+       struct rhash_lock_head __rcu **bkt;
+       struct rhash_head __rcu **pprev;
+       struct rhash_head *he;
+       unsigned int hash;
+       int err = -ENOENT;
+
+       /* Minimally, the old and new objects must have same hash
+        * (which should mean identifiers are the same).
+        */
+       hash = rht_head_hashfn(ht, tbl, obj_old, params);
+       if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+               return -EINVAL;
+
+       bkt = rht_bucket_var(tbl, hash);
+       if (!bkt)
+               return -ENOENT;
+
+       pprev = NULL;
+       rht_lock(tbl, bkt);
+
+       rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               if (he != obj_old) {
+                       pprev = &he->next;
+                       continue;
+               }
+
+               rcu_assign_pointer(obj_new->next, obj_old->next);
+               if (pprev) {
+                       rcu_assign_pointer(*pprev, obj_new);
+                       rht_unlock(tbl, bkt);
+               } else {
+                       rht_assign_unlock(tbl, bkt, obj_new);
+               }
+               err = 0;
+               goto unlocked;
+       }
+
+       rht_unlock(tbl, bkt);
+
+unlocked:
+       return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht:                hash table
+ * @obj_old:   pointer to hash head inside object being replaced
+ * @obj_new:   pointer to hash head inside object which is new
+ * @params:    hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+       struct rhashtable *ht, struct rhash_head *obj_old,
+       struct rhash_head *obj_new,
+       const struct rhashtable_params params)
 {
        struct bucket_table *tbl;
        int err;
@@ -442,22 +1207,62 @@ static inline int rhashtable_remove_fast(
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
-       while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+       while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+                                               obj_new, params)) &&
               (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
                ;
 
-       if (err)
-               goto out;
-
-       atomic_dec(&ht->nelems);
-       if (unlikely(ht->p.automatic_shrinking &&
-                    rht_shrink_below_30(ht, tbl)))
-               schedule_work(&ht->run_work);
-
-out:
        rcu_read_unlock();
 
        return err;
 }
 
+/**
+ * rhltable_walk_enter - Initialise an iterator
+ * @hlt:       Table to walk over
+ * @iter:      Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice.  Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+static inline void rhltable_walk_enter(struct rhltable *hlt,
+                                      struct rhashtable_iter *iter)
+{
+       return rhashtable_walk_enter(&hlt->ht, iter);
+}
+
+/**
+ * rhltable_free_and_destroy - free elements and destroy hash list table
+ * @hlt:       the hash list table to destroy
+ * @free_fn:   callback to release resources of element
+ * @arg:       pointer passed to free_fn
+ *
+ * See documentation for rhashtable_free_and_destroy.
+ */
+static inline void rhltable_free_and_destroy(struct rhltable *hlt,
+                                            void (*free_fn)(void *ptr,
+                                                            void *arg),
+                                            void *arg)
+{
+       return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
+}
+
+static inline void rhltable_destroy(struct rhltable *hlt)
+{
+       return rhltable_free_and_destroy(hlt, NULL, NULL);
+}
+
 #endif /* _LINUX_RHASHTABLE_H */
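
The kernel-doc comments added above describe the new lookup/insert/remove entry points of this header. As a rough usage sketch (not part of this patch; struct obj, its key field and obj_params are illustrative assumptions), a fixed-key table built on this API is typically driven like so:

/* Hedged usage sketch for the rhashtable API above; struct obj, its key
 * field and obj_params are illustrative, not taken from this patch. */
struct obj {
	u64			key;
	struct rhash_head	node;
};

static const struct rhashtable_params obj_params = {
	.key_len		= sizeof(u64),
	.key_offset		= offsetof(struct obj, key),
	.head_offset		= offsetof(struct obj, node),
	.automatic_shrinking	= true,
};

static int obj_table_init(struct rhashtable *ht)
{
	return rhashtable_init(ht, &obj_params);
}

static int obj_add(struct rhashtable *ht, struct obj *o)
{
	/* returns -EEXIST if an object with the same key already exists */
	return rhashtable_lookup_insert_fast(ht, &o->node, obj_params);
}

static struct obj *obj_find(struct rhashtable *ht, u64 key)
{
	/* rhashtable_lookup_fast() takes the RCU read lock internally */
	return rhashtable_lookup_fast(ht, &key, obj_params);
}

static int obj_del(struct rhashtable *ht, struct obj *o)
{
	return rhashtable_remove_fast(ht, &o->node, obj_params);
}

With rhltable instead of rhashtable the same pattern applies, except that duplicate keys are allowed and rhltable_lookup() hands back the per-key list, which is then walked with rhl_for_each_entry_rcu().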
index 0e6df059341ff0c6c50680b8b9116905df70efd5..477c33eb00d7dca36ad07cde09cd5681455cf6b2 100644 (file)
@@ -196,6 +196,7 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
 
 void six_lock_wakeup_all(struct six_lock *);
 
+void six_lock_pcpu_free_rcu(struct six_lock *);
 void six_lock_pcpu_free(struct six_lock *);
 void six_lock_pcpu_alloc(struct six_lock *);
 
index b8a1235b105a064a0fa71968e01120620ef10492..775b7e3aa7829a0608455626f97c82ee41e4781a 100644 (file)
@@ -66,6 +66,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
 #define kzfree(p)                      free(p)
 
 #define kvmalloc(size, flags)          kmalloc(size, flags)
+#define kvzalloc(size, flags)          kzalloc(size, flags)
 #define kvfree(p)                      kfree(p)
 
 static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
index 1e1255508cd96a680b7a2f642b271662481aecf5..c9886cbaadcc891f6f1dcdd3369ca415b1cbeaf7 100644 (file)
@@ -11,6 +11,8 @@
 #define __SANE_USERSPACE_TYPES__       /* For PPC64, to get LL64 types */
 #include <asm/types.h>
 
+#include <linux/cache.h>
+
 #define BITS_PER_LONG  __BITS_PER_LONG
 
 struct page;
index 532f23b9c5e51bd1b9b46308ecf08b0fbcdee4f8..cb22595161a591a65b0418f0fa204ff9ce203bcb 100644 (file)
@@ -138,19 +138,18 @@ struct bpos {
 #define KEY_SNAPSHOT_MAX               ((__u32)~0U)
 #define KEY_SIZE_MAX                   ((__u32)~0U)
 
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
 {
-       struct bpos ret;
-
-       ret.inode       = inode;
-       ret.offset      = offset;
-       ret.snapshot    = 0;
-
-       return ret;
+       return (struct bpos) {
+               .inode          = inode,
+               .offset         = offset,
+               .snapshot       = snapshot,
+       };
 }
 
-#define POS_MIN                                POS(0, 0)
-#define POS_MAX                                POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN                                SPOS(0, 0, 0)
+#define POS_MAX                                SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset)           SPOS(_inode, _offset, 0)
 
 /* Empty placeholder struct, for container_of() */
 struct bch_val {
@@ -707,7 +706,9 @@ struct bch_inode_generation {
        x(bi_foreground_target,         16)     \
        x(bi_background_target,         16)     \
        x(bi_erasure_code,              16)     \
-       x(bi_fields_set,                16)
+       x(bi_fields_set,                16)     \
+       x(bi_dir,                       64)     \
+       x(bi_dir_offset,                64)
 
 /* subset of BCH_INODE_FIELDS */
 #define BCH_INODE_OPTS()                       \
@@ -743,6 +744,7 @@ enum {
        __BCH_INODE_I_SIZE_DIRTY= 5,
        __BCH_INODE_I_SECTORS_DIRTY= 6,
        __BCH_INODE_UNLINKED    = 7,
+       __BCH_INODE_BACKPTR_UNTRUSTED = 8,
 
        /* bits 20+ reserved for packed fields below: */
 };
@@ -755,6 +757,7 @@ enum {
 #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
 #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
 #define BCH_INODE_UNLINKED     (1 << __BCH_INODE_UNLINKED)
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
 
 LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
 LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
@@ -1204,7 +1207,9 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_new_versioning        = 10,
        bcachefs_metadata_version_bkey_renumber         = 10,
        bcachefs_metadata_version_inode_btree_change    = 11,
-       bcachefs_metadata_version_max                   = 12,
+       bcachefs_metadata_version_snapshot              = 12,
+       bcachefs_metadata_version_inode_backpointers    = 13,
+       bcachefs_metadata_version_max                   = 14,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
@@ -1736,7 +1741,7 @@ struct btree_node {
        /* Closed interval: */
        struct bpos             min_key;
        struct bpos             max_key;
-       struct bch_extent_ptr   ptr;
+       struct bch_extent_ptr   _ptr; /* not used anymore */
        struct bkey_format      format;
 
        union {
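
For the bcachefs_format.h hunk above: POS() is now a thin wrapper around the new SPOS() with a zero snapshot field, and POS_MAX gained KEY_SNAPSHOT_MAX. A minimal sketch of how the two relate (illustrative only, not part of the patch):

/* Illustrative only: shows the POS()/SPOS() relationship added above. */
static void bpos_example(void)
{
	struct bpos a   = POS(1, 4096);      /* same as SPOS(1, 4096, 0) */
	struct bpos b   = SPOS(1, 4096, 7);  /* same inode/offset, snapshot 7 */
	struct bpos end = POS_MAX;           /* inode, offset and snapshot all at max */

	(void) a; (void) b; (void) end;
}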
index e1906f257ef2044e10eeec49caa94582ccf42d35..3af56062601f62d17a3ba597f0737225ad715956 100644 (file)
@@ -614,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
                return "incorrect number of fields";
 
        for (i = 0; i < f->nr_fields; i++) {
+               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+               u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
                u64 field_offset = le64_to_cpu(f->field_offset[i]);
 
-               if (f->bits_per_field[i] > 64)
+               if (f->bits_per_field[i] > unpacked_bits)
                        return "field too large";
 
-               if (field_offset &&
-                   (f->bits_per_field[i] == 64 ||
-                   (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
-                    field_offset)))
+               if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+                       return "offset + bits overflow";
+
+               if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+                    unpacked_mask) <
+                   field_offset)
                        return "offset + bits overflow";
 
                bits += f->bits_per_field[i];
@@ -1045,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
                              high_word(f, r),
                              b->nr_key_bits);
 
-       EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+       EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
                                bkey_unpack_pos(b, r)));
        return ret;
 }
@@ -1055,7 +1059,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
                                               const struct bkey_packed *l,
                                               const struct bpos *r)
 {
-       return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+       return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
 }
 
 __pure __flatten
@@ -1076,7 +1080,7 @@ int bch2_bkey_cmp_packed(const struct btree *b,
                r = (void*) &unpacked;
        }
 
-       return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+       return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
 }
 
 __pure __flatten
@@ -1087,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b,
        const struct bkey *l_unpacked;
 
        return unlikely(l_unpacked = packed_to_bkey_c(l))
-               ? bkey_cmp(l_unpacked->p, *r)
+               ? bpos_cmp(l_unpacked->p, *r)
                : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
 }
 
@@ -1123,11 +1127,12 @@ void bch2_bkey_pack_test(void)
        struct bkey_packed p;
 
        struct bkey_format test_format = {
-               .key_u64s       = 2,
+               .key_u64s       = 3,
                .nr_fields      = BKEY_NR_FIELDS,
                .bits_per_field = {
                        13,
                        64,
+                       32,
                },
        };
 
index 629288a60926a4b3b9dc0d25e9e8ce7e5c0fdfb1..2e45d88fab0382cdc9e99e9d5449702adc8f30d0 100644 (file)
@@ -33,16 +33,6 @@ struct bkey_s {
 
 #define bkey_next(_k)          vstruct_next(_k)
 
-static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
-                                                      struct bkey_packed *end)
-{
-       k = bkey_next(k);
-
-       while (k != end && !k->u64s)
-               k = (void *) ((u64 *) k + 1);
-       return k;
-}
-
 #define bkey_val_u64s(_k)      ((_k)->u64s - BKEY_U64s)
 
 static inline size_t bkey_val_bytes(const struct bkey *k)
@@ -150,29 +140,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
        return bkey_cmp_left_packed(b, l, &r);
 }
 
-#if 1
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+       return  cmp_int(l.inode,    r.inode) ?:
+               cmp_int(l.offset,   r.offset) ?:
+               cmp_int(l.snapshot, r.snapshot);
+}
+
 static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
 {
-       if (l.inode != r.inode)
-               return l.inode < r.inode ? -1 : 1;
-       if (l.offset != r.offset)
-               return l.offset < r.offset ? -1 : 1;
-       if (l.snapshot != r.snapshot)
-               return l.snapshot < r.snapshot ? -1 : 1;
-       return 0;
+       return  cmp_int(l.inode,    r.inode) ?:
+               cmp_int(l.offset,   r.offset);
 }
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
 
 static inline struct bpos bpos_min(struct bpos l, struct bpos r)
 {
-       return bkey_cmp(l, r) < 0 ? l : r;
+       return bpos_cmp(l, r) < 0 ? l : r;
 }
 
 static inline struct bpos bpos_max(struct bpos l, struct bpos r)
 {
-       return bkey_cmp(l, r) > 0 ? l : r;
+       return bpos_cmp(l, r) > 0 ? l : r;
 }
 
 #define sbb(a, b, borrow)                              \
@@ -200,7 +188,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
 
 static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
 {
-       if (bkey_cmp(l, r) > 0)
+       if (bpos_cmp(l, r) > 0)
                swap(l, r);
 
        return bpos_sub(r, l);
@@ -262,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
                format->bits_per_field[BKEY_FIELD_SNAPSHOT];
 }
 
-static inline struct bpos bkey_successor(struct bpos p)
+static inline struct bpos bpos_successor(struct bpos p)
 {
-       struct bpos ret = p;
+       if (!++p.snapshot &&
+           !++p.offset &&
+           !++p.inode)
+               BUG();
 
-       if (!++ret.offset)
-               BUG_ON(!++ret.inode);
+       return p;
+}
 
-       return ret;
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+       if (!p.snapshot-- &&
+           !p.offset-- &&
+           !p.inode--)
+               BUG();
+
+       return p;
 }
 
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
 {
-       struct bpos ret = p;
+       p.snapshot = 0;
 
-       if (!ret.offset--)
-               BUG_ON(!ret.inode--);
+       if (!++p.offset &&
+           !++p.inode)
+               BUG();
 
-       return ret;
+       return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+       p.snapshot = 0;
+
+       if (!p.offset-- &&
+           !p.inode--)
+               BUG();
+
+       return p;
 }
 
 static inline u64 bkey_start_offset(const struct bkey *k)
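
The bkey.h changes above split position comparison into bpos_cmp() (snapshot-aware) and bkey_cmp() (ignores the snapshot field), and rename the successor/predecessor helpers to match. A minimal sketch of the resulting semantics (illustrative, not part of the patch):

/* Illustrative sketch of the snapshot-aware vs snapshot-ignoring helpers. */
static void bpos_cmp_example(void)
{
	struct bpos a = SPOS(1, 100, 0);
	struct bpos b = SPOS(1, 100, 3);

	BUG_ON(bkey_cmp(a, b) != 0);	/* compares inode and offset only */
	BUG_ON(bpos_cmp(a, b) >= 0);	/* also orders by snapshot */

	/* bpos_successor() increments snapshot, carrying into offset/inode */
	BUG_ON(bpos_cmp(bpos_successor(a), SPOS(1, 100, 1)));

	/* the nosnap variants step offset/inode with snapshot forced to 0 */
	BUG_ON(bpos_cmp(bpos_nosnap_successor(b), SPOS(1, 101, 0)));
}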
index 641169ef91b5c1b795bb5722ee97ffb1ed81db07..6fe95b802e130060b48caa6ad54b2f2c38620c9b 100644 (file)
@@ -119,9 +119,16 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                        return "nonzero size field";
        }
 
-       if (k.k->p.snapshot)
+       if (type != BKEY_TYPE_btree &&
+           !btree_type_has_snapshots(type) &&
+           k.k->p.snapshot)
                return "nonzero snapshot";
 
+       if (type != BKEY_TYPE_btree &&
+           btree_type_has_snapshots(type) &&
+           k.k->p.snapshot != U32_MAX)
+               return "invalid snapshot field";
+
        if (type != BKEY_TYPE_btree &&
            !bkey_cmp(k.k->p, POS_MAX))
                return "POS_MAX key";
@@ -138,10 +145,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
-       if (bkey_cmp(k.k->p, b->data->min_key) < 0)
+       if (bpos_cmp(k.k->p, b->data->min_key) < 0)
                return "key before start of btree node";
 
-       if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+       if (bpos_cmp(k.k->p, b->data->max_key) > 0)
                return "key past end of btree node";
 
        return NULL;
@@ -165,9 +172,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 
 void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 {
-       if (!bkey_cmp(pos, POS_MIN))
+       if (!bpos_cmp(pos, POS_MIN))
                pr_buf(out, "POS_MIN");
-       else if (!bkey_cmp(pos, POS_MAX))
+       else if (!bpos_cmp(pos, POS_MAX))
                pr_buf(out, "POS_MAX");
        else {
                if (pos.inode == U64_MAX)
@@ -256,7 +263,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c,
            !ops->key_merge ||
            l.k->type != r.k->type ||
            bversion_cmp(l.k->version, r.k->version) ||
-           bkey_cmp(l.k->p, bkey_start_pos(r.k)))
+           bpos_cmp(l.k->p, bkey_start_pos(r.k)))
                return BCH_MERGE_NOMERGE;
 
        ret = ops->key_merge(c, l, r);
@@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
        const struct bkey_ops *ops;
        struct bkey uk;
        struct bkey_s u;
+       unsigned nr_compat = 5;
        int i;
 
        /*
         * Do these operations in reverse order in the write path:
         */
 
-       for (i = 0; i < 4; i++)
-       switch (!write ? i : 3 - i) {
+       for (i = 0; i < nr_compat; i++)
+       switch (!write ? i : nr_compat - 1 - i) {
        case 0:
                if (big_endian != CPU_BIG_ENDIAN)
                        bch2_bkey_swab_key(f, k);
@@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                }
                break;
        case 3:
+               if (version < bcachefs_metadata_version_snapshot &&
+                   (level || btree_type_has_snapshots(btree_id))) {
+                       struct bkey_i *u = packed_to_bkey(k);
+
+                       if (u) {
+                               u->k.p.snapshot = write
+                                       ? 0 : U32_MAX;
+                       } else {
+                               u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT];
+                               u64 max_packed = min_packed +
+                                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+                               uk = __bch2_bkey_unpack_key(f, k);
+                               uk.p.snapshot = write
+                                       ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+                               BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+                       }
+               }
+
+               break;
+       case 4:
                if (!bkey_packed(k)) {
                        u = bkey_i_to_s(packed_to_bkey(k));
                } else {
index f2507079ed117318061a611ed354378abe0bfc7b..537ab7919e886eec958e49e12e6b18962172e725 100644 (file)
@@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
 
        BUG_ON(!iter->used);
 
-       i->k = bkey_next_skip_noops(i->k, i->end);
+       i->k = bkey_next(i->k);
 
        BUG_ON(i->k > i->end);
 
index 87f951e14061c6c77a757ba0c5a408106e9f7b70..3fb9a9ed130e9760c3540def745a3abbc98961d5 100644 (file)
@@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
        for (_k = i->start;
             _k < vstruct_last(i);
             _k = _n) {
-               _n = bkey_next_skip_noops(_k, vstruct_last(i));
+               _n = bkey_next(_k);
 
                k = bkey_disassemble(b, _k, &uk);
                if (c)
@@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
 
                n = bkey_unpack_key(b, _n);
 
-               if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) {
+               if (bpos_cmp(n.p, k.k->p) < 0) {
                        printk(KERN_ERR "Key skipped backwards\n");
                        continue;
                }
 
                if (!bkey_deleted(k.k) &&
-                   !bkey_cmp(n.p, k.k->p))
+                   !bpos_cmp(n.p, k.k->p))
                        printk(KERN_ERR "Duplicate keys\n");
        }
 }
@@ -534,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
        goto start;
        while (1) {
                if (rw_aux_to_bkey(b, t, j) == k) {
-                       BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+                       BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
                                        bkey_unpack_pos(b, k)));
 start:
                        if (++j == t->size)
@@ -544,7 +544,7 @@ start:
                               rw_aux_tree(b, t)[j - 1].offset);
                }
 
-               k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+               k = bkey_next(k);
                BUG_ON(k >= btree_bkey_last(b, t));
        }
 }
@@ -686,16 +686,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 
        if (is_power_of_2(j) &&
            !min_key->u64s) {
-               k = (void *) min_key;
-               bkey_init(&k->k);
-               k->k.p = b->data->min_key;
+               if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
+                       k = (void *) min_key;
+                       bkey_init(&k->k);
+                       k->k.p = b->data->min_key;
+               }
        }
 
        if (is_power_of_2(j + 1) &&
            !max_key->u64s) {
-               k = (void *) max_key;
-               bkey_init(&k->k);
-               k->k.p = t->max_key;
+               if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
+                       k = (void *) max_key;
+                       bkey_init(&k->k);
+                       k->k.p = t->max_key;
+               }
        }
 
        __make_bfloat(b, t, j, min_key, max_key);
@@ -759,7 +763,7 @@ retry:
        /* First we figure out where the first key in each cacheline is */
        eytzinger1_for_each(j, t->size) {
                while (bkey_to_cacheline(b, t, k) < cacheline)
-                       prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+                       prev = k, k = bkey_next(k);
 
                if (k >= btree_bkey_last(b, t)) {
                        /* XXX: this path sucks */
@@ -776,14 +780,19 @@ retry:
        }
 
        while (k != btree_bkey_last(b, t))
-               prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+               prev = k, k = bkey_next(k);
 
        t->max_key = bkey_unpack_pos(b, prev);
 
-       bkey_init(&min_key.k);
-       min_key.k.p = b->data->min_key;
-       bkey_init(&max_key.k);
-       max_key.k.p = t->max_key;
+       if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+               bkey_init(&min_key.k);
+               min_key.k.p = b->data->min_key;
+       }
+
+       if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+               bkey_init(&max_key.k);
+               max_key.k.p = t->max_key;
+       }
 
        /* Then we build the tree */
        eytzinger1_for_each(j, t->size)
@@ -911,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
        struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
 
        while ((p = __bkey_prev(b, t, k)) && !ret) {
-               for (i = p; i != k; i = bkey_next_skip_noops(i, k))
+               for (i = p; i != k; i = bkey_next(i))
                        if (i->type >= min_key_type)
                                ret = i;
 
@@ -922,10 +931,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
                BUG_ON(ret >= orig_k);
 
                for (i = ret
-                       ? bkey_next_skip_noops(ret, orig_k)
+                       ? bkey_next(ret)
                        : btree_bkey_first(b, t);
                     i != orig_k;
-                    i = bkey_next_skip_noops(i, orig_k))
+                    i = bkey_next(i))
                        BUG_ON(i->type >= min_key_type);
        }
 
@@ -960,7 +969,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
        /* signal to make_bfloat() that they're uninitialized: */
        min_key.u64s = max_key.u64s = 0;
 
-       if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
+       if (bkey_next(k) == btree_bkey_last(b, t)) {
                t->max_key = bkey_unpack_pos(b, k);
 
                for (j = 1; j < t->size; j = j * 2 + 1)
@@ -1084,7 +1093,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
                struct bkey_packed *k = start;
 
                while (1) {
-                       k = bkey_next_skip_noops(k, end);
+                       k = bkey_next(k);
                        if (k == end)
                                break;
 
@@ -1170,15 +1179,14 @@ void bch2_bset_delete(struct btree *b,
 __flatten
 static struct bkey_packed *bset_search_write_set(const struct btree *b,
                                struct bset_tree *t,
-                               struct bpos *search,
-                               const struct bkey_packed *packed_search)
+                               struct bpos *search)
 {
        unsigned l = 0, r = t->size;
 
        while (l + 1 != r) {
                unsigned m = (l + r) >> 1;
 
-               if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
+               if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
                        l = m;
                else
                        r = m;
@@ -1238,9 +1246,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
                        prefetch(&base->f[n << 4]);
 
                f = &base->f[n];
-
-               if (!unlikely(packed_search))
-                       goto slowpath;
                if (unlikely(f->exponent >= BFLOAT_FAILED))
                        goto slowpath;
 
@@ -1304,7 +1309,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
        case BSET_NO_AUX_TREE:
                return btree_bkey_first(b, t);
        case BSET_RW_AUX_TREE:
-               return bset_search_write_set(b, t, search, lossy_packed_search);
+               return bset_search_write_set(b, t, search);
        case BSET_RO_AUX_TREE:
                /*
                 * Each node in the auxiliary search tree covers a certain range
@@ -1313,7 +1318,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
                 * start and end - handle that here:
                 */
 
-               if (bkey_cmp(*search, t->max_key) > 0)
+               if (bpos_cmp(*search, t->max_key) > 0)
                        return btree_bkey_last(b, t);
 
                return bset_search_tree(b, t, search, lossy_packed_search);
@@ -1334,12 +1339,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
                while (m != btree_bkey_last(b, t) &&
                       bkey_iter_cmp_p_or_unp(b, m,
                                        lossy_packed_search, search) < 0)
-                       m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+                       m = bkey_next(m);
 
        if (!packed_search)
                while (m != btree_bkey_last(b, t) &&
                       bkey_iter_pos_cmp(b, m, search) < 0)
-                       m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+                       m = bkey_next(m);
 
        if (bch2_expensive_debug_checks) {
                struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
@@ -1403,16 +1408,15 @@ noinline __flatten __attribute__((cold))
 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
                              struct btree *b, struct bpos *search)
 {
-       struct bset_tree *t;
+       struct bkey_packed *k;
 
        trace_bkey_pack_pos_fail(search);
 
-       for_each_bset(b, t)
-               __bch2_btree_node_iter_push(iter, b,
-                       bch2_bset_search(b, t, search, NULL, NULL),
-                       btree_bkey_last(b, t));
+       bch2_btree_node_iter_init_from_start(iter, b);
 
-       bch2_btree_node_iter_sort(iter, b);
+       while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+              bkey_iter_pos_cmp(b, k, search) < 0)
+               bch2_btree_node_iter_advance(iter, b);
 }
 
 /**
@@ -1446,7 +1450,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
  *    to the search key is going to have 0 sectors after the search key.
  *
  *    But this does mean that we can't just search for
- *    bkey_successor(start_of_range) to get the first extent that overlaps with
+ *    bpos_successor(start_of_range) to get the first extent that overlaps with
  *    the range we want - if we're unlucky and there's an extent that ends
  *    exactly where we searched, then there could be a deleted key at the same
  *    position and we'd get that when we search instead of the preceding extent
@@ -1464,7 +1468,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
        struct bkey_packed *k[MAX_BSETS];
        unsigned i;
 
-       EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
+       EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
        bset_aux_tree_verify(b);
 
        memset(iter, 0, sizeof(*iter));
index 54b364c8f28c98f79c2e9375113cb3b4204b8664..506da4e0c91127295554f3432cfdd8782de24e2a 100644 (file)
@@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
 #define bset_tree_for_each_key(_b, _t, _k)                             \
        for (_k = btree_bkey_first(_b, _t);                             \
             _k != btree_bkey_last(_b, _t);                             \
-            _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+            _k = bkey_next(_k))
 
 static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
 {
@@ -378,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
        EBUG_ON(r_packed && !bkey_packed(r_packed));
 
        if (unlikely(!bkey_packed(l)))
-               return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+               return bpos_cmp(packed_to_bkey_c(l)->p, *r);
 
        if (likely(r_packed))
                return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
@@ -403,24 +403,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
        return bch2_bkey_prev_filter(b, t, k, 1);
 }
 
-enum bch_extent_overlap {
-       BCH_EXTENT_OVERLAP_ALL          = 0,
-       BCH_EXTENT_OVERLAP_BACK         = 1,
-       BCH_EXTENT_OVERLAP_FRONT        = 2,
-       BCH_EXTENT_OVERLAP_MIDDLE       = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
-                                                         const struct bkey *m)
-{
-       int cmp1 = bkey_cmp(k->p, m->p) < 0;
-       int cmp2 = bkey_cmp(bkey_start_pos(k),
-                           bkey_start_pos(m)) > 0;
-
-       return (cmp1 << 1) + cmp2;
-}
-
 /* Btree key iteration */
 
 void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
index fc76e78806f93f43cd21cdcbfde380305ae4a356..8a4667ba6b189e41d39aea23edec58ebf3ad33a1 100644 (file)
@@ -149,7 +149,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
        if (level)
                six_lock_pcpu_alloc(&b->c.lock);
        else
-               six_lock_pcpu_free(&b->c.lock);
+               six_lock_pcpu_free_rcu(&b->c.lock);
 
        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
@@ -814,9 +814,9 @@ lock_node:
 
        EBUG_ON(b->c.btree_id != iter->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
        EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-               bkey_cmp(b->data->min_key,
+               bpos_cmp(b->data->min_key,
                         bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
        return b;
@@ -897,9 +897,9 @@ lock_node:
 
        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
        EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-               bkey_cmp(b->data->min_key,
+               bpos_cmp(b->data->min_key,
                         bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 out:
        bch2_btree_cache_cannibalize_unlock(c);
@@ -1011,7 +1011,7 @@ out:
                if (sib != btree_prev_sib)
                        swap(n1, n2);
 
-               if (bkey_cmp(bkey_successor(n1->key.k.p),
+               if (bpos_cmp(bpos_successor(n1->key.k.p),
                             n2->data->min_key)) {
                        char buf1[200], buf2[200];
 
index 6d5ed774d226a2ed95f0ecee7c6a425438c2ee1d..88c549c4eeef297aec25b96b26b760f866e67632 100644 (file)
@@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
        struct bpos node_end    = b->data->max_key;
        struct bpos expected_start = bkey_deleted(&prev->k->k)
                ? node_start
-               : bkey_successor(prev->k->k.p);
+               : bpos_successor(prev->k->k.p);
        char buf1[200], buf2[200];
        bool update_min = false;
        bool update_max = false;
@@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                        bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
                }
 
-               if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
+               if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
                                "btree node with incorrect min_key at btree %s level %u:\n"
                                "  prev %s\n"
                                "  cur %s",
@@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
        }
 
        if (fsck_err_on(is_last &&
-                       bkey_cmp(cur.k->k.p, node_end), c,
+                       bpos_cmp(cur.k->k.p, node_end), c,
                        "btree node with incorrect max_key at btree %s level %u:\n"
                        "  %s\n"
                        "  expected %s",
@@ -470,8 +470,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
        bkey_init(&prev.k->k);
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
-               BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+               BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+               BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
 
                ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
                                       k, &max_stale, true);
@@ -560,13 +560,13 @@ static int bch2_gc_btree_init(struct bch_fs *c,
                return 0;
 
        six_lock_read(&b->c.lock, NULL, NULL);
-       if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
+       if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
                        "btree root with incorrect min_key: %s",
                        (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
                BUG();
        }
 
-       if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
+       if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
                        "btree root with incorrect max_key: %s",
                        (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
                BUG();
@@ -1148,7 +1148,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH);
+                                  BTREE_ITER_PREFETCH|
+                                  BTREE_ITER_NOT_EXTENTS|
+                                  BTREE_ITER_ALL_SNAPSHOTS);
 
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k))) {
@@ -1171,6 +1173,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
 
                bch2_btree_iter_advance(iter);
        }
+       bch2_trans_iter_put(&trans, iter);
 
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
@@ -1271,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
        /* Find a format that all keys in @old_nodes can pack into */
        bch2_bkey_format_init(&format_state);
 
+       /*
+        * XXX: this won't correctly take into account the new min/max keys:
+        */
        for (i = 0; i < nr_old_nodes; i++)
                __bch2_btree_calc_format(&format_state, old_nodes[i]);
 
@@ -1333,7 +1339,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                     k < vstruct_last(s2) &&
                     vstruct_blocks_plus(n1->data, c->block_bits,
                                         u64s + k->u64s) <= blocks;
-                    k = bkey_next_skip_noops(k, vstruct_last(s2))) {
+                    k = bkey_next(k)) {
                        last = k;
                        u64s += k->u64s;
                }
@@ -1362,7 +1368,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                        n1->key.k.p = n1->data->max_key =
                                bkey_unpack_pos(n1, last);
 
-                       n2->data->min_key = bkey_successor(n1->data->max_key);
+                       n2->data->min_key = bpos_successor(n1->data->max_key);
 
                        memcpy_u64s(vstruct_last(s1),
                                    s2->start, u64s);
@@ -1405,7 +1411,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                unsigned j;
 
                for (j = 0; j < nr_new_nodes; j++)
-                       if (!bkey_cmp(old_nodes[i]->key.k.p,
+                       if (!bpos_cmp(old_nodes[i]->key.k.p,
                                      new_nodes[j]->key.k.p))
                                goto next;
 
index c3d02f5804ba32ea64d5c018ffb9bc8b9ab0760b..b1362a9f94186e75c0e206293b782aef6ae03d06 100644 (file)
@@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
 
 static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
 {
-       if (l.phase != r.phase)
-               return l.phase < r.phase ? -1 : 1;
-       if (bkey_cmp(l.pos, r.pos))
-               return bkey_cmp(l.pos, r.pos);
-       if (l.level != r.level)
-               return l.level < r.level ? -1 : 1;
-       return 0;
+       return  cmp_int(l.phase, r.phase) ?:
+               bpos_cmp(l.pos, r.pos) ?:
+               cmp_int(l.level, r.level);
 }
 
 static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
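Editor's note: the gc_pos_cmp() rewrite above uses the kernel's lexicographic-compare idiom: each cmp_int() yields -1/0/1, and the GNU `?:` operator with an omitted middle operand returns the first non-zero result, so later fields only break ties. A small self-contained check that the chained form matches the old branchy one (the struct here is an invented stand-in, not the real gc_pos):

    #include <assert.h>

    #define cmp_int(l, r) ((l) > (r) ? 1 : (l) < (r) ? -1 : 0)

    struct triple { int a, b, c; };

    static int triple_cmp_chained(struct triple l, struct triple r)
    {
            /* GNU ?: with omitted middle operand: the first non-zero
             * comparison wins, later fields only break ties. */
            return  cmp_int(l.a, r.a) ?:
                    cmp_int(l.b, r.b) ?:
                    cmp_int(l.c, r.c);
    }

    static int triple_cmp_branchy(struct triple l, struct triple r)
    {
            if (l.a != r.a)
                    return l.a < r.a ? -1 : 1;
            if (l.b != r.b)
                    return l.b < r.b ? -1 : 1;
            if (l.c != r.c)
                    return l.c < r.c ? -1 : 1;
            return 0;
    }

    int main(void)
    {
            struct triple x = { 1, 2, 3 }, y = { 1, 2, 4 };

            assert(triple_cmp_chained(x, y) == triple_cmp_branchy(x, y));
            assert(triple_cmp_chained(y, x) == 1);
            assert(triple_cmp_chained(x, x) == 0);
            return 0;
    }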
index 9b74e7997f7bfbddff6b3bcbb4fad64ca9735e1a..b43d4468ad8ec1b9a27c0d9c53df3d3bacff6859 100644 (file)
@@ -32,13 +32,13 @@ static void verify_no_dups(struct btree *b,
        if (start == end)
                return;
 
-       for (p = start, k = bkey_next_skip_noops(start, end);
+       for (p = start, k = bkey_next(start);
             k != end;
-            p = k, k = bkey_next_skip_noops(k, end)) {
+            p = k, k = bkey_next(k)) {
                struct bkey l = bkey_unpack_key(b, p);
                struct bkey r = bkey_unpack_key(b, k);
 
-               BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+               BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
        }
 #endif
 }
@@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v)
 {
        struct bkey_packed *k;
 
-       for (k = i->start;
-            k != vstruct_last(i);
-            k = bkey_next_skip_noops(k, vstruct_last(i)))
+       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
                k->needs_whiteout = v;
 }
 
@@ -213,7 +211,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
                out = i->start;
 
                for (k = start; k != end; k = n) {
-                       n = bkey_next_skip_noops(k, end);
+                       n = bkey_next(k);
 
                        if (!bkey_deleted(k)) {
                                bkey_copy(out, k);
@@ -614,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                             BTREE_ERR_MUST_RETRY, c, ca, b, i,
                             "incorrect level");
 
-               if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
-                       u64 *p = (u64 *) &bn->ptr;
-
-                       *p = swab64(*p);
-               }
-
                if (!write)
                        compat_btree_node(b->c.level, b->c.btree_id, version,
                                          BSET_BIG_ENDIAN(i), write, bn);
@@ -633,14 +625,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                                b->data->max_key = b->key.k.p;
                        }
 
-                       btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
+                       btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
                                     BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                                     "incorrect min_key: got %s should be %s",
                                     (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
                                     (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
                }
 
-               btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
+               btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
                             BTREE_ERR_MUST_RETRY, c, ca, b, i,
                             "incorrect max key %s",
                             (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
@@ -754,7 +746,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                }
 
                prev = k;
-               k = bkey_next_skip_noops(k, vstruct_last(i));
+               k = bkey_next(k);
        }
 fsck_err:
        return ret;
@@ -947,7 +939,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        bp.v->mem_ptr = 0;
                }
 
-               k = bkey_next_skip_noops(k, vstruct_last(i));
+               k = bkey_next(k);
        }
 
        bch2_bset_build_aux_tree(b, b->set, false);
@@ -1327,8 +1319,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
                return -1;
 
-       ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?:
-               validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
+       ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+               validate_bset(c, NULL, b, i, sectors, WRITE, false);
        if (ret) {
                bch2_inconsistent_error(c);
                dump_stack();
@@ -1481,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                validate_before_checksum = true;
 
        /* validate_bset will be modifying: */
-       if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change)
+       if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;
 
        /* if we're going to be encrypting, check metadata validity first: */
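Editor's note: several loops in this file swap bkey_next_skip_noops() for plain bkey_next(); presumably zero-length placeholder keys no longer appear inside a bset, so walking the packed keys is just advancing by each key's own u64 count. A toy model of that walk, with a record layout invented for the sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Each record starts with its own size in u64 units, like a packed bkey. */
    struct rec {
            uint8_t u64s;
            uint8_t type;
    };

    static struct rec *rec_next(struct rec *k)
    {
            return (struct rec *) ((uint64_t *) k + k->u64s);
    }

    int main(void)
    {
            uint64_t buf[4] = { 0 };
            struct rec *a   = (struct rec *) &buf[0];
            struct rec *b   = (struct rec *) &buf[1];
            struct rec *end = (struct rec *) &buf[4];

            a->u64s = 1; a->type = 1;
            b->u64s = 3; b->type = 2;

            /* equivalent of: for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) */
            for (struct rec *k = a; k != end; k = rec_next(k))
                    printf("key type %u, %u u64s\n", k->type, k->u64s);
            return 0;
    }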
index 16ce6dff6af7eb00c62d4b670a01676e53e07f8c..9c14cd30a09e18d112c94e680cdbe80279b0852b 100644 (file)
@@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *);
 void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
-                                unsigned version, unsigned big_endian,
-                                int write, struct bkey_format *f)
+                                 unsigned version, unsigned big_endian,
+                                 int write, struct bkey_format *f)
 {
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_id == BTREE_ID_inodes) {
@@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id,
                swap(f->field_offset[BKEY_FIELD_INODE],
                     f->field_offset[BKEY_FIELD_OFFSET]);
        }
+
+       if (version < bcachefs_metadata_version_snapshot &&
+           (level || btree_type_has_snapshots(btree_id))) {
+               u64 max_packed =
+                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+               f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+                       ? 0
+                       : U32_MAX - max_packed;
+       }
 }
 
 static inline void compat_bpos(unsigned level, enum btree_id btree_id,
@@ -220,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
 {
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_node_type_is_extents(btree_id) &&
-           bkey_cmp(bn->min_key, POS_MIN) &&
+           bpos_cmp(bn->min_key, POS_MIN) &&
            write)
-               bn->min_key = bkey_predecessor(bn->min_key);
+               bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+       if (version < bcachefs_metadata_version_snapshot &&
+           write)
+               bn->max_key.snapshot = 0;
 
        compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
        compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
 
+       if (version < bcachefs_metadata_version_snapshot &&
+           !write)
+               bn->max_key.snapshot = U32_MAX;
+
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_node_type_is_extents(btree_id) &&
-           bkey_cmp(bn->min_key, POS_MIN) &&
+           bpos_cmp(bn->min_key, POS_MIN) &&
            !write)
-               bn->min_key = bkey_successor(bn->min_key);
+               bn->min_key = bpos_nosnap_successor(bn->min_key);
 }
 
 #endif /* _BCACHEFS_BTREE_IO_H */
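Editor's note: the compat_bformat()/compat_btree_node() additions deal with metadata written before bcachefs_metadata_version_snapshot: old keys carry no snapshot bits, so on read the format's field_offset is set so every key unpacks with snapshot U32_MAX (and the node max_key is patched the same way), while writes targeting the old version force it back to zero. A toy model of the offset-based field packing this relies on, with simplified names rather than the real bkey_format:

    #include <assert.h>
    #include <stdint.h>

    struct field_fmt {
            unsigned bits;          /* bits_per_field */
            uint64_t offset;        /* field_offset   */
    };

    /* Unpacking adds field_offset back, so a zero-bit field always
     * decodes to field_offset. */
    static uint64_t unpack(const struct field_fmt *f, uint64_t packed)
    {
            return packed + f->offset;
    }

    int main(void)
    {
            /* Old on-disk format: the snapshot field occupies 0 bits. */
            struct field_fmt snap = { .bits = 0, .offset = 0 };
            uint64_t max_packed = ~(~0ULL << snap.bits);    /* == 0 */

            /* Reading old metadata: every key decodes as snapshot U32_MAX. */
            snap.offset = UINT32_MAX - max_packed;
            assert(unpack(&snap, 0) == UINT32_MAX);

            /* Writing in the old format: snapshot decodes back to 0. */
            snap.offset = 0;
            assert(unpack(&snap, 0) == 0);
            return 0;
    }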
index 459d27ca38bd75917123c5189fecca6434b7904b..8190e73d24212f28a6ebb1dd7fb04116db4477c6 100644 (file)
 
 static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
 
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+       /* Are we iterating over keys in all snapshots? */
+       if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+               p = bpos_successor(p);
+       } else {
+               p = bpos_nosnap_successor(p);
+               p.snapshot = iter->snapshot;
+       }
+
+       return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+       /* Are we iterating over keys in all snapshots? */
+       if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+               p = bpos_predecessor(p);
+       } else {
+               p = bpos_nosnap_predecessor(p);
+               p.snapshot = iter->snapshot;
+       }
+
+       return p;
+}
+
 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
 {
        return l < BTREE_MAX_DEPTH &&
@@ -30,20 +60,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
 
        if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
            bkey_cmp(pos, POS_MAX))
-               pos = bkey_successor(pos);
+               pos = bkey_successor(iter, pos);
        return pos;
 }
 
 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
                                              struct btree *b)
 {
-       return bkey_cmp(iter->real_pos, b->data->min_key) < 0;
+       return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
 }
 
 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
                                             struct btree *b)
 {
-       return bkey_cmp(b->key.k.p, iter->real_pos) < 0;
+       return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
 }
 
 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -285,7 +315,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 
                /* Must lock btree nodes in key order: */
                if (btree_node_locked(linked, level) &&
-                   bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
+                   bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
                                                 btree_iter_type(linked))) <= 0) {
                        deadlock_iter = linked;
                        reason = 7;
@@ -583,10 +613,24 @@ err:
 
 static void bch2_btree_iter_verify(struct btree_iter *iter)
 {
+       enum btree_iter_type type = btree_iter_type(iter);
        unsigned i;
 
        EBUG_ON(iter->btree_id >= BTREE_ID_NR);
 
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
+
+       BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(type == BTREE_ITER_NODES &&
+              !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(type != BTREE_ITER_NODES &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              !btree_type_has_snapshots(iter->btree_id));
+
        bch2_btree_iter_verify_locks(iter);
 
        for (i = 0; i < BTREE_MAX_DEPTH; i++)
@@ -597,6 +641,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 {
        enum btree_iter_type type = btree_iter_type(iter);
 
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
+
        BUG_ON((type == BTREE_ITER_KEYS ||
                type == BTREE_ITER_CACHED) &&
               (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
@@ -1384,7 +1431,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        if (!b)
                return NULL;
 
-       BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+       BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
 
        iter->pos = iter->real_pos = b->key.k.p;
 
@@ -1421,12 +1468,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        if (!b)
                return NULL;
 
-       if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+       if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
                /*
                 * Haven't gotten to the end of the parent node: go back down to
                 * the next child node
                 */
-               btree_iter_set_search_pos(iter, bkey_successor(iter->pos));
+               btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
 
                /* Unlock to avoid screwing up our lock invariants: */
                btree_node_unlock(iter, iter->level);
@@ -1453,7 +1500,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
 static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
 {
-       int cmp = bkey_cmp(new_pos, iter->real_pos);
+       int cmp = bpos_cmp(new_pos, iter->real_pos);
        unsigned l = iter->level;
 
        if (!cmp)
@@ -1497,10 +1544,10 @@ out:
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
        struct bpos pos = iter->k.p;
-       bool ret = bkey_cmp(pos, POS_MAX) != 0;
+       bool ret = bpos_cmp(pos, POS_MAX) != 0;
 
        if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_successor(pos);
+               pos = bkey_successor(iter, pos);
        bch2_btree_iter_set_pos(iter, pos);
        return ret;
 }
@@ -1508,10 +1555,10 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
 {
        struct bpos pos = bkey_start_pos(&iter->k);
-       bool ret = bkey_cmp(pos, POS_MIN) != 0;
+       bool ret = bpos_cmp(pos, POS_MIN) != 0;
 
        if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_predecessor(pos);
+               pos = bkey_predecessor(iter, pos);
        bch2_btree_iter_set_pos(iter, pos);
        return ret;
 }
@@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
 static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
 {
        struct bpos next_pos = iter->l[0].b->key.k.p;
-       bool ret = bkey_cmp(next_pos, POS_MAX) != 0;
+       bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
 
        /*
         * Typically, we don't want to modify iter->pos here, since that
@@ -1527,7 +1574,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
         * btree, in that case we want iter->pos to reflect that:
         */
        if (ret)
-               btree_iter_set_search_pos(iter, bkey_successor(next_pos));
+               btree_iter_set_search_pos(iter, bpos_successor(next_pos));
        else
                bch2_btree_iter_set_pos(iter, POS_MAX);
 
@@ -1537,10 +1584,10 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
 static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
 {
        struct bpos next_pos = iter->l[0].b->data->min_key;
-       bool ret = bkey_cmp(next_pos, POS_MIN) != 0;
+       bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
 
        if (ret)
-               btree_iter_set_search_pos(iter, bkey_predecessor(next_pos));
+               btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
        else
                bch2_btree_iter_set_pos(iter, POS_MIN);
 
@@ -1586,13 +1633,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi
                k = btree_iter_level_peek(iter, &iter->l[0]);
 
                if (next_update &&
-                   bkey_cmp(next_update->k.p, iter->real_pos) <= 0)
+                   bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
                        k = bkey_i_to_s_c(next_update);
 
                if (likely(k.k)) {
                        if (bkey_deleted(k.k)) {
                                btree_iter_set_search_pos(iter,
-                                               bkey_successor(k.k->p));
+                                               bkey_successor(iter, k.k->p));
                                continue;
                        }
 
@@ -1731,7 +1778,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
                if (iter->pos.inode == KEY_INODE_MAX)
                        return bkey_s_c_null;
 
-               bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
+               bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
        }
 
        pos = iter->pos;
@@ -1965,6 +2012,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
 {
        struct btree_iter *iter, *best = NULL;
 
+       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+           !btree_type_has_snapshots(btree_id))
+               flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
+               pos.snapshot = btree_type_has_snapshots(btree_id)
+                       ? U32_MAX : 0;
+
        /* We always want a fresh iterator for node iterators: */
        if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES)
                goto alloc_iter;
@@ -1999,11 +2054,14 @@ alloc_iter:
 
        if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
            btree_node_type_is_extents(btree_id) &&
-           !(flags & BTREE_ITER_NOT_EXTENTS))
+           !(flags & BTREE_ITER_NOT_EXTENTS) &&
+           !(flags & BTREE_ITER_ALL_SNAPSHOTS))
                flags |= BTREE_ITER_IS_EXTENTS;
 
        iter->flags = flags;
 
+       iter->snapshot = pos.snapshot;
+
        if (!(iter->flags & BTREE_ITER_INTENT))
                bch2_btree_iter_downgrade(iter);
        else if (!iter->locks_want)
@@ -2026,6 +2084,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
                __bch2_trans_get_iter(trans, btree_id, pos,
                                       BTREE_ITER_NODES|
                                       BTREE_ITER_NOT_EXTENTS|
+                                      BTREE_ITER_ALL_SNAPSHOTS|
                                       flags);
        unsigned i;
 
@@ -2127,6 +2186,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
        trans->nr_updates2              = 0;
        trans->mem_top                  = 0;
 
+       trans->hooks                    = NULL;
        trans->extra_journal_entries    = NULL;
        trans->extra_journal_entry_u64s = 0;
 
@@ -2137,7 +2197,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
                       (void *) &trans->fs_usage_deltas->memset_start);
        }
 
-       bch2_trans_cond_resched(trans);
+       if (!(flags & TRANS_RESET_NOUNLOCK))
+               bch2_trans_cond_resched(trans);
 
        if (!(flags & TRANS_RESET_NOTRAVERSE))
                bch2_btree_iter_traverse_all(trans);
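Editor's note: the new bkey_successor()/bkey_predecessor() wrappers at the top of this file capture the iteration rule that comes with BTREE_ITER_ALL_SNAPSHOTS: with the flag set, snapshot is treated as the least significant part of the position; without it, the iterator skips to the next inode:offset and pins its own snapshot ID back onto the result (and __bch2_trans_get_iter() defaults pos.snapshot to U32_MAX, or 0 on btrees without snapshots). A standalone sketch of that rule under simplified, assumed types:

    #include <assert.h>
    #include <stdint.h>

    struct pos { uint64_t inode, offset; uint32_t snapshot; };

    static struct pos pos_successor(struct pos p)
    {
            /* snapshot is the least significant field */
            if (++p.snapshot)
                    return p;
            if (++p.offset)
                    return p;
            ++p.inode;              /* POS_MAX overflow not handled in this toy */
            return p;
    }

    static struct pos pos_nosnap_successor(struct pos p)
    {
            p.snapshot = 0;
            if (!++p.offset)
                    ++p.inode;
            return p;
    }

    static struct pos iter_successor(struct pos p, int all_snapshots,
                                     uint32_t iter_snapshot)
    {
            if (all_snapshots)
                    return pos_successor(p);

            p = pos_nosnap_successor(p);
            p.snapshot = iter_snapshot;     /* stay in the iterator's snapshot */
            return p;
    }

    int main(void)
    {
            struct pos p = { .inode = 1, .offset = 7, .snapshot = 3 };

            assert(iter_successor(p, 1, 0).snapshot == 4);           /* next snapshot */
            assert(iter_successor(p, 0, UINT32_MAX).offset == 8);    /* next offset   */
            assert(iter_successor(p, 0, UINT32_MAX).snapshot == UINT32_MAX);
            return 0;
    }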
index 8768f4cb96fae46045ab17f7ad9e493bce963eb1..7585f989ad505ba151185b1159df3d19952ad097 100644 (file)
@@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *);
 
 static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
 {
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               new_pos.snapshot = iter->snapshot;
+
        bkey_init(&iter->k);
        iter->k.p = iter->pos = new_pos;
 }
@@ -303,6 +306,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr
 }
 
 #define TRANS_RESET_NOTRAVERSE         (1 << 0)
+#define TRANS_RESET_NOUNLOCK           (1 << 1)
 
 void bch2_trans_reset(struct btree_trans *, unsigned);
 
index 0b3545637bb39819690c20ee4e13480f09bbe31c..04354f5682ce2a6499961f6946c63e112b8bf607 100644 (file)
@@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
        const struct bkey_cached_key *key = arg->key;
 
        return cmp_int(ck->key.btree_id, key->btree_id) ?:
-               bkey_cmp(ck->key.pos, key->pos);
+               bpos_cmp(ck->key.pos, key->pos);
 }
 
 static const struct rhashtable_params bch2_btree_key_cache_params = {
@@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c,
                                      bch2_btree_key_cache_params));
        memset(&ck->key, ~0, sizeof(ck->key));
 
-       c->nr_keys--;
+       atomic_long_dec(&c->nr_keys);
 }
 
 static void bkey_cached_free(struct btree_key_cache *bc,
@@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c)
 {
        struct bkey_cached *ck;
 
-       list_for_each_entry_reverse(ck, &c->freed, list)
-               if (bkey_cached_lock_for_evict(ck)) {
-                       c->nr_freed--;
-                       return ck;
-               }
-
        ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
        if (likely(ck)) {
                INIT_LIST_HEAD(&ck->list);
@@ -114,11 +108,39 @@ bkey_cached_alloc(struct btree_key_cache *c)
                return ck;
        }
 
-       list_for_each_entry(ck, &c->clean, list)
+       return NULL;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+       struct bucket_table *tbl;
+       struct rhash_head *pos;
+       struct bkey_cached *ck;
+       unsigned i;
+
+       mutex_lock(&c->lock);
+       list_for_each_entry_reverse(ck, &c->freed, list)
                if (bkey_cached_lock_for_evict(ck)) {
-                       bkey_cached_evict(c, ck);
+                       c->nr_freed--;
+                       list_del(&ck->list);
+                       mutex_unlock(&c->lock);
                        return ck;
                }
+       mutex_unlock(&c->lock);
+
+       rcu_read_lock();
+       tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+       for (i = 0; i < tbl->size; i++)
+               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+                       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+                           bkey_cached_lock_for_evict(ck)) {
+                               bkey_cached_evict(c, ck);
+                               rcu_read_unlock();
+                               return ck;
+                       }
+               }
+       rcu_read_unlock();
 
        return NULL;
 }
@@ -129,10 +151,17 @@ btree_key_cache_create(struct btree_key_cache *c,
                       struct bpos pos)
 {
        struct bkey_cached *ck;
+       bool was_new = true;
 
        ck = bkey_cached_alloc(c);
-       if (!ck)
-               return ERR_PTR(-ENOMEM);
+
+       if (unlikely(!ck)) {
+               ck = bkey_cached_reuse(c);
+               if (unlikely(!ck))
+                       return ERR_PTR(-ENOMEM);
+
+               was_new = false;
+       }
 
        ck->c.level             = 0;
        ck->c.btree_id          = btree_id;
@@ -141,17 +170,26 @@ btree_key_cache_create(struct btree_key_cache *c,
        ck->valid               = false;
        ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 
-       if (rhashtable_lookup_insert_fast(&c->table,
+       if (unlikely(rhashtable_lookup_insert_fast(&c->table,
                                          &ck->hash,
-                                         bch2_btree_key_cache_params)) {
+                                         bch2_btree_key_cache_params))) {
                /* We raced with another fill: */
-               bkey_cached_free(c, ck);
+
+               if (likely(was_new)) {
+                       six_unlock_write(&ck->c.lock);
+                       six_unlock_intent(&ck->c.lock);
+                       kfree(ck);
+               } else {
+                       mutex_lock(&c->lock);
+                       bkey_cached_free(c, ck);
+                       mutex_unlock(&c->lock);
+               }
+
                return NULL;
        }
 
-       c->nr_keys++;
+       atomic_long_inc(&c->nr_keys);
 
-       list_move(&ck->list, &c->clean);
        six_unlock_write(&ck->c.lock);
 
        return ck;
@@ -213,7 +251,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p)
        const struct btree_iter *iter = p;
 
        return ck->key.btree_id == iter->btree_id &&
-               !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+               !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
 }
 
 __flatten
@@ -238,11 +276,8 @@ retry:
                        return 0;
                }
 
-               mutex_lock(&c->btree_key_cache.lock);
                ck = btree_key_cache_create(&c->btree_key_cache,
                                            iter->btree_id, iter->pos);
-               mutex_unlock(&c->btree_key_cache.lock);
-
                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
@@ -257,7 +292,7 @@ retry:
                if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
                                     bkey_cached_check_fn, iter, _THIS_IP_)) {
                        if (ck->key.btree_id != iter->btree_id ||
-                           bkey_cmp(ck->key.pos, iter->pos)) {
+                           bpos_cmp(ck->key.pos, iter->pos)) {
                                goto retry;
                        }
 
@@ -267,7 +302,7 @@ retry:
                }
 
                if (ck->key.btree_id != iter->btree_id ||
-                   bkey_cmp(ck->key.pos, iter->pos)) {
+                   bpos_cmp(ck->key.pos, iter->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }
@@ -370,15 +405,13 @@ err:
        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);
 
+       BUG_ON(!btree_node_locked(c_iter, 0));
+
        if (!evict) {
-               mutex_lock(&c->btree_key_cache.lock);
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-                       c->btree_key_cache.nr_dirty--;
+                       atomic_long_dec(&c->btree_key_cache.nr_dirty);
                }
-
-               list_move_tail(&ck->list, &c->btree_key_cache.clean);
-               mutex_unlock(&c->btree_key_cache.lock);
        } else {
 evict:
                BUG_ON(!btree_node_intent_locked(c_iter, 0));
@@ -388,13 +421,14 @@ evict:
 
                six_lock_write(&ck->c.lock, NULL, NULL);
 
-               mutex_lock(&c->btree_key_cache.lock);
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-                       c->btree_key_cache.nr_dirty--;
+                       atomic_long_dec(&c->btree_key_cache.nr_dirty);
                }
 
                bkey_cached_evict(&c->btree_key_cache, ck);
+
+               mutex_lock(&c->btree_key_cache.lock);
                bkey_cached_free(&c->btree_key_cache, ck);
                mutex_unlock(&c->btree_key_cache.lock);
        }
@@ -475,16 +509,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
        ck->valid = true;
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               mutex_lock(&c->btree_key_cache.lock);
-               list_move(&ck->list, &c->btree_key_cache.dirty);
-
                set_bit(BKEY_CACHED_DIRTY, &ck->flags);
-               c->btree_key_cache.nr_dirty++;
+               atomic_long_inc(&c->btree_key_cache.nr_dirty);
 
                if (bch2_nr_btree_keys_need_flush(c))
                        kick_reclaim = true;
-
-               mutex_unlock(&c->btree_key_cache.lock);
        }
 
        bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
@@ -509,9 +538,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_key_cache.shrink);
        struct btree_key_cache *bc = &c->btree_key_cache;
+       struct bucket_table *tbl;
        struct bkey_cached *ck, *t;
        size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
-       unsigned flags;
+       unsigned start, flags;
+       int srcu_idx;
 
        /* Return -1 if we can't do anything right now */
        if (sc->gfp_mask & __GFP_FS)
@@ -519,6 +550,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
        else if (!mutex_trylock(&bc->lock))
                return -1;
 
+       srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        flags = memalloc_nofs_save();
 
        /*
@@ -540,23 +572,40 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
        if (scanned >= nr)
                goto out;
 
-       list_for_each_entry_safe(ck, t, &bc->clean, list) {
-               if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
-                       clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-               else if (bkey_cached_lock_for_evict(ck)) {
-                       bkey_cached_evict(bc, ck);
-                       bkey_cached_free(bc, ck);
-               }
+       rcu_read_lock();
+       tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+       if (bc->shrink_iter >= tbl->size)
+               bc->shrink_iter = 0;
+       start = bc->shrink_iter;
 
-               scanned++;
-               if (scanned >= nr) {
-                       if (&t->list != &bc->clean)
-                               list_move_tail(&bc->clean, &t->list);
-                       goto out;
+       do {
+               struct rhash_head *pos, *next;
+
+               rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) {
+                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+                               continue;
+
+                       if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+                               clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+                       else if (bkey_cached_lock_for_evict(ck)) {
+                               bkey_cached_evict(bc, ck);
+                               bkey_cached_free(bc, ck);
+                       }
+
+                       scanned++;
+                       if (scanned >= nr)
+                               break;
                }
-       }
+
+               bc->shrink_iter++;
+               if (bc->shrink_iter >= tbl->size)
+                       bc->shrink_iter = 0;
+       } while (scanned < nr && bc->shrink_iter != start);
+
+       rcu_read_unlock();
 out:
        memalloc_nofs_restore(flags);
+       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
        mutex_unlock(&bc->lock);
 
        return freed;
@@ -569,41 +618,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
                                        btree_key_cache.shrink);
        struct btree_key_cache *bc = &c->btree_key_cache;
 
-       return bc->nr_keys;
+       return atomic_long_read(&bc->nr_keys);
 }
 
 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 {
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+       struct bucket_table *tbl;
        struct bkey_cached *ck, *n;
+       struct rhash_head *pos;
+       unsigned i;
 
        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);
 
        mutex_lock(&bc->lock);
-       list_splice(&bc->dirty, &bc->clean);
 
-       list_for_each_entry_safe(ck, n, &bc->clean, list) {
+       rcu_read_lock();
+       tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+       for (i = 0; i < tbl->size; i++)
+               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+                       bkey_cached_evict(bc, ck);
+                       list_add(&ck->list, &bc->freed);
+               }
+       rcu_read_unlock();
+
+       list_for_each_entry_safe(ck, n, &bc->freed, list) {
                cond_resched();
 
                bch2_journal_pin_drop(&c->journal, &ck->journal);
                bch2_journal_preres_put(&c->journal, &ck->res);
 
-               kfree(ck->k);
                list_del(&ck->list);
+               kfree(ck->k);
                kmem_cache_free(bch2_key_cache, ck);
-               bc->nr_keys--;
        }
 
-       BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
-       BUG_ON(bc->nr_keys);
-
-       list_for_each_entry_safe(ck, n, &bc->freed, list) {
-               cond_resched();
+       BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal));
+       BUG_ON(atomic_long_read(&bc->nr_keys));
 
-               list_del(&ck->list);
-               kmem_cache_free(bch2_key_cache, ck);
-       }
        mutex_unlock(&bc->lock);
 
        if (bc->table_init_done)
@@ -614,8 +667,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
        mutex_init(&c->lock);
        INIT_LIST_HEAD(&c->freed);
-       INIT_LIST_HEAD(&c->clean);
-       INIT_LIST_HEAD(&c->dirty);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
@@ -641,8 +692,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
        pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
-       pr_buf(out, "nr_keys:\t%zu\n",  c->nr_keys);
-       pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
+       pr_buf(out, "nr_keys:\t%zu\n",  atomic_long_read(&c->nr_keys));
+       pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
 }
 
 void bch2_btree_key_cache_exit(void)
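Editor's note: with the clean/dirty lists gone, the shrinker instead resumes a circular walk over the rhashtable buckets from bc->shrink_iter, skips dirty entries, and stops once it has scanned enough candidates or wrapped back to its starting bucket. A simplified model of that resumable circular scan, with a plain array standing in for the hash table and invented names:

    #include <stdio.h>

    #define TABLE_SIZE 8

    struct cache {
            int table[TABLE_SIZE];  /* 0 = empty, 1 = clean, 2 = dirty */
            unsigned shrink_iter;   /* where the next scan resumes     */
    };

    /* Scan up to nr slots, evicting clean entries, wrapping around and
     * remembering the position for the next call: */
    static unsigned shrink_scan(struct cache *c, unsigned nr)
    {
            unsigned scanned = 0, freed = 0, start;

            if (c->shrink_iter >= TABLE_SIZE)
                    c->shrink_iter = 0;
            start = c->shrink_iter;

            do {
                    int *slot = &c->table[c->shrink_iter];

                    /* dirty entries (2) are skipped; clean ones (1) are evicted */
                    if (*slot == 1) {
                            *slot = 0;
                            freed++;
                    }

                    scanned++;
                    c->shrink_iter = (c->shrink_iter + 1) % TABLE_SIZE;
            } while (scanned < nr && c->shrink_iter != start);

            return freed;
    }

    int main(void)
    {
            struct cache c = { .table = { 1, 2, 1, 0, 1, 2, 1, 1 } };

            printf("freed %u\n", shrink_scan(&c, 3));  /* frees 2 (slots 0 and 2)       */
            printf("freed %u\n", shrink_scan(&c, 3));  /* resumes at slot 3, frees 1    */
            return 0;
    }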
index 2f8b5521718aa44613b764d7cb83cc3701b8e81f..02715cd258ab0e235557adb246d1639ab236b578 100644 (file)
@@ -3,8 +3,8 @@
 
 static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
 {
-       size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
-       size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
        size_t max_dirty = 1024 + nr_keys  / 2;
 
        return max_t(ssize_t, 0, nr_dirty - max_dirty);
@@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
 
 static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
 {
-       size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
-       size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
        size_t max_dirty = 4096 + (nr_keys * 3) / 4;
 
        return nr_dirty > max_dirty &&
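Editor's note: with nr_keys/nr_dirty now atomic_long_t, these thresholds can be read without taking the cache lock; the limits themselves are unchanged. Flushing becomes necessary once dirty keys exceed 1024 + nr_keys/2, and transactions must wait once they exceed 4096 + 3*nr_keys/4 (subject to the additional condition cut off by the hunk above). Worked numbers, for concreteness:

    #include <stdio.h>

    int main(void)
    {
            long nr_keys = 100000, nr_dirty = 60000;
            long need_flush_over = 1024 + nr_keys / 2;        /* 51024 */
            long must_wait_over  = 4096 + (nr_keys * 3) / 4;  /* 79096 */

            /* 60000 dirty keys: past the flush threshold by 8976,
             * but not yet at the hard must-wait limit. */
            printf("keys needing flush: %ld\n", nr_dirty > need_flush_over ?
                   nr_dirty - need_flush_over : 0);
            printf("must wait: %s\n", nr_dirty > must_wait_over ? "yes" : "no");
            return 0;
    }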
index 5999044ab023357b1fbcb8e451655c6b907cc906..1941616f0ecf19d5d80f0eb814742770e4a26bef 100644 (file)
@@ -216,6 +216,7 @@ enum btree_iter_type {
 #define BTREE_ITER_CACHED_NOFILL       (1 << 9)
 #define BTREE_ITER_CACHED_NOCREATE     (1 << 10)
 #define BTREE_ITER_NOT_EXTENTS         (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
 
 enum btree_iter_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -245,6 +246,8 @@ struct btree_iter {
        /* what we're searching for/what the iterator actually points to: */
        struct bpos             real_pos;
        struct bpos             pos_after_commit;
+       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+       unsigned                snapshot;
 
        u16                     flags;
        u8                      idx;
@@ -292,13 +295,12 @@ struct btree_key_cache {
        struct rhashtable       table;
        bool                    table_init_done;
        struct list_head        freed;
-       struct list_head        clean;
-       struct list_head        dirty;
        struct shrinker         shrink;
+       unsigned                shrink_iter;
 
        size_t                  nr_freed;
-       size_t                  nr_keys;
-       size_t                  nr_dirty;
+       atomic_long_t           nr_keys;
+       atomic_long_t           nr_dirty;
 };
 
 struct bkey_cached_key {
@@ -330,7 +332,7 @@ struct bkey_cached {
 struct btree_insert_entry {
        unsigned                trigger_flags;
        u8                      bkey_type;
-       u8                      btree_id;
+       enum btree_id           btree_id:8;
        u8                      level;
        unsigned                trans_triggers_run:1;
        unsigned                is_extent:1;
@@ -344,6 +346,14 @@ struct btree_insert_entry {
 #define BTREE_ITER_MAX         32
 #endif
 
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+       btree_trans_commit_hook_fn      *fn;
+       struct btree_trans_commit_hook  *next;
+};
+
 struct btree_trans {
        struct bch_fs           *c;
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -378,6 +388,7 @@ struct btree_trans {
        struct btree_insert_entry *updates2;
 
        /* update path: */
+       struct btree_trans_commit_hook *hooks;
        struct jset_entry       *extra_journal_entries;
        unsigned                extra_journal_entry_u64s;
        struct journal_entry_pin *journal_pin;
@@ -600,6 +611,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
         BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 
+#define BTREE_ID_HAS_SNAPSHOTS                         \
+       ((1U << BTREE_ID_extents)|                      \
+        (1U << BTREE_ID_inodes)|                       \
+        (1U << BTREE_ID_dirents)|                      \
+        (1U << BTREE_ID_xattrs))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+       return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
+
 enum btree_trigger_flags {
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
 
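Editor's note: btree_type_has_snapshots() is a standard membership-by-bitmask test: each btree ID indexes a bit in a constant mask, so the check compiles down to a shift and an AND. A minimal standalone version with invented enum values (the real IDs live in bcachefs_format.h, which is not shown here):

    #include <assert.h>

    enum btree_id { ID_extents, ID_inodes, ID_dirents, ID_xattrs, ID_alloc, ID_NR };

    #define ID_HAS_SNAPSHOTS                \
            ((1U << ID_extents)|            \
             (1U << ID_inodes)|             \
             (1U << ID_dirents)|            \
             (1U << ID_xattrs))

    static inline int type_has_snapshots(enum btree_id id)
    {
            return (1U << id) & ID_HAS_SNAPSHOTS;
    }

    int main(void)
    {
            assert(type_has_snapshots(ID_dirents));
            assert(!type_has_snapshots(ID_alloc));
            return 0;
    }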
index a251380801692fc232ff2944e7d9d127e6b415e1..4ce12ae29a556ff3edcfe881c40787c071996dd8 100644 (file)
@@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
 
 int bch2_trans_update(struct btree_trans *, struct btree_iter *,
                      struct bkey_i *, enum btree_trigger_flags);
+void bch2_trans_commit_hook(struct btree_trans *,
+                           struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *);
 
 /**
index a661bc0cf98aad8ee227ae6e42efeb19769c2960..19dfc32e8c685c6b6b0725a7be9afbd6ab3f9953 100644 (file)
@@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                        break;
                bp = bkey_s_c_to_btree_ptr_v2(k);
 
-               if (bkey_cmp(next_node, bp.v->min_key)) {
+               if (bpos_cmp(next_node, bp.v->min_key)) {
                        bch2_dump_btree_node(c, b);
                        panic("expected next min_key %s got %s\n",
                              (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
@@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                bch2_btree_node_iter_advance(&iter, b);
 
                if (bch2_btree_node_iter_end(&iter)) {
-                       if (bkey_cmp(k.k->p, b->key.k.p)) {
+                       if (bpos_cmp(k.k->p, b->key.k.p)) {
                                bch2_dump_btree_node(c, b);
                                panic("expected end %s got %s\n",
                                      (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
@@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                        break;
                }
 
-               next_node = bkey_successor(k.k->p);
+               next_node = bpos_successor(k.k->p);
        }
 #endif
 }
@@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
        struct bset_tree *t;
        struct bkey uk;
 
-       bch2_bkey_format_add_pos(s, b->data->min_key);
-
        for_each_bset(b, t)
                bset_tree_for_each_key(b, t, k)
                        if (!bkey_deleted(k)) {
@@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b)
        struct bkey_format_state s;
 
        bch2_bkey_format_init(&s);
+       bch2_bkey_format_add_pos(&s, b->data->min_key);
+       bch2_bkey_format_add_pos(&s, b->data->max_key);
        __bch2_btree_calc_format(&s, b);
 
        return bch2_bkey_format_done(&s);
@@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        b->data->flags = 0;
        SET_BTREE_NODE_ID(b->data, as->btree_id);
        SET_BTREE_NODE_LEVEL(b->data, level);
-       b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
 
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
                struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
@@ -1095,10 +1094,12 @@ static struct btree *__btree_split_node(struct btree_update *as,
                                        struct btree *n1,
                                        struct btree_iter *iter)
 {
+       struct bkey_format_state s;
        size_t nr_packed = 0, nr_unpacked = 0;
        struct btree *n2;
        struct bset *set1, *set2;
-       struct bkey_packed *k, *prev = NULL;
+       struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
+       struct bpos n1_pos;
 
        n2 = bch2_btree_node_alloc(as, n1->c.level);
        bch2_btree_update_add_new_node(as, n2);
@@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as,
        SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
        n2->key.k.p = n1->key.k.p;
 
-       btree_node_set_format(n2, n2->data->format);
-
        set1 = btree_bset_first(n1);
        set2 = btree_bset_first(n2);
 
@@ -1119,7 +1118,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
         */
        k = set1->start;
        while (1) {
-               struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+               struct bkey_packed *n = bkey_next(k);
 
                if (n == vstruct_last(set1))
                        break;
@@ -1136,33 +1135,53 @@ static struct btree *__btree_split_node(struct btree_update *as,
        }
 
        BUG_ON(!prev);
+       set2_start      = k;
+       set2_end        = vstruct_last(set1);
 
-       btree_set_max(n1, bkey_unpack_pos(n1, prev));
-       btree_set_min(n2, bkey_successor(n1->key.k.p));
-
-       set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
-       set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
-
+       set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
        set_btree_bset_end(n1, n1->set);
-       set_btree_bset_end(n2, n2->set);
-
-       n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
-       n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
-       n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
-       n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
 
        n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
        n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
        n1->nr.packed_keys      = nr_packed;
        n1->nr.unpacked_keys    = nr_unpacked;
 
+       n1_pos = bkey_unpack_pos(n1, prev);
+       if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+               n1_pos.snapshot = U32_MAX;
+
+       btree_set_max(n1, n1_pos);
+       btree_set_min(n2, bpos_successor(n1->key.k.p));
+
+       bch2_bkey_format_init(&s);
+       bch2_bkey_format_add_pos(&s, n2->data->min_key);
+       bch2_bkey_format_add_pos(&s, n2->data->max_key);
+
+       for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+               struct bkey uk = bkey_unpack_key(n1, k);
+               bch2_bkey_format_add_key(&s, &uk);
+       }
+
+       n2->data->format = bch2_bkey_format_done(&s);
+       btree_node_set_format(n2, n2->data->format);
+
+       out = set2->start;
+       memset(&n2->nr, 0, sizeof(n2->nr));
+
+       for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+               BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
+                                      ? &n1->format : &bch2_bkey_format_current, k));
+               out->format = KEY_FORMAT_LOCAL_BTREE;
+               btree_keys_account_key_add(&n2->nr, 0, out);
+               out = bkey_next(out);
+       }
+
+       set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
+       set_btree_bset_end(n2, n2->set);
+
        BUG_ON(!set1->u64s);
        BUG_ON(!set2->u64s);
 
-       memcpy_u64s(set2->start,
-                   vstruct_end(set1),
-                   le16_to_cpu(set2->u64s));
-
        btree_node_reset_sib_u64s(n1);
        btree_node_reset_sib_u64s(n2);
 
@@ -1216,7 +1235,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        i = btree_bset_first(b);
        src = dst = i->start;
        while (src != vstruct_last(i)) {
-               n = bkey_next_skip_noops(src, vstruct_last(i));
+               n = bkey_next(src);
                if (!bkey_deleted(src)) {
                        memmove_u64s_down(dst, src, src->u64s);
                        dst = bkey_next(dst);
@@ -1563,8 +1582,10 @@ retry:
        }
 
        bch2_bkey_format_init(&new_s);
-       __bch2_btree_calc_format(&new_s, b);
-       __bch2_btree_calc_format(&new_s, m);
+       bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+       __bch2_btree_calc_format(&new_s, prev);
+       __bch2_btree_calc_format(&new_s, next);
+       bch2_bkey_format_add_pos(&new_s, next->data->max_key);
        new_f = bch2_bkey_format_done(&new_s);
 
        sib_u64s = btree_node_u64s_with_format(b, &new_f) +
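Editor's note: two related changes in this file: bch2_btree_calc_format() and the merge path now feed the node's min and max positions into the format alongside the keys, and __btree_split_node() builds a fresh format for n2 and repacks its half of the keys into it. The point, as I read it, is that a packed field stores value minus field_offset in a fixed number of bits, so a format derived only from the keys currently present may be unable to represent the node's own bounds. A toy single-field illustration (simplified model, not the real bkey_format):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct fmt { uint64_t offset; unsigned bits; };

    static bool can_pack(const struct fmt *f, uint64_t v)
    {
            return v >= f->offset &&
                   (v - f->offset) <= (f->bits ? ~(~0ULL << f->bits) : 0);
    }

    int main(void)
    {
            /* Format computed only from keys whose offsets were 100..200: */
            struct fmt f = { .offset = 100, .bits = 7 };    /* packs 100..227 */

            printf("key 150:    %d\n", can_pack(&f, 150));  /* 1 */
            printf("min_key 64: %d\n", can_pack(&f, 64));   /* 0: needs a wider format */
            return 0;
    }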
index d9308bd49fc937d16e57b3f22dea511193f2d7e8..67a2c65bc8fb0ae3bf3b0cb666773982c05652db 100644 (file)
@@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
 {
        return   cmp_int(l->btree_id,   r->btree_id) ?:
                 -cmp_int(l->level,     r->level) ?:
-                bkey_cmp(l->k->k.p,    r->k->k.p);
+                bpos_cmp(l->k->k.p,    r->k->k.p);
 }
 
 static inline bool same_leaf_as_prev(struct btree_trans *trans,
@@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
        EBUG_ON(btree_node_just_written(b));
        EBUG_ON(bset_written(b, btree_bset_last(b)));
        EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-       EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
-       EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+       EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
+       EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
        EBUG_ON(insert->k.u64s >
                bch_btree_keys_u64s_remaining(iter->trans->c, b));
        EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
@@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
 
-       BUG_ON(bch2_debug_check_bkeys &&
-              bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type));
-       BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos));
+       if (bch2_debug_check_bkeys) {
+               const char *invalid = bch2_bkey_invalid(c,
+                               bkey_i_to_s_c(i->k), i->bkey_type);
+               if (invalid) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+                       panic("invalid bkey %s on insert: %s\n", buf, invalid);
+               }
+       }
+       BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
        BUG_ON(i->level         != i->iter->level);
        BUG_ON(i->btree_id      != i->iter->btree_id);
 }
@@ -369,6 +377,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bch_fs_usage *fs_usage = NULL;
        struct btree_insert_entry *i;
+       struct btree_trans_commit_hook *h;
        unsigned u64s = 0;
        bool marking = false;
        int ret;
@@ -386,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 
        prefetch(&trans->c->journal.flags);
 
+       h = trans->hooks;
+       while (h) {
+               ret = h->fn(trans, h);
+               if (ret)
+                       return ret;
+               h = h->next;
+       }
+
        trans_for_each_update2(trans, i) {
                /* Multiple inserts might go to same leaf: */
                if (!same_leaf_as_prev(trans, i))
@@ -556,6 +573,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        if (trans->flags & BTREE_INSERT_NOUNLOCK)
                trans->nounlock = true;
 
+       if (!(trans->flags & BTREE_INSERT_NOUNLOCK))
        trans_for_each_update2(trans, i)
                if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
                    !same_leaf_as_prev(trans, i))
@@ -826,7 +844,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
        struct btree_insert_entry *i = NULL;
        struct btree_iter *iter;
        bool trans_trigger_run;
-       unsigned u64s;
+       unsigned u64s, reset_flags = 0;
        int ret = 0;
 
        if (!trans->nr_updates)
@@ -940,7 +958,11 @@ out:
        if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
                percpu_ref_put(&trans->c->writes);
 out_reset:
-       bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
+       if (!ret)
+               reset_flags |= TRANS_RESET_NOTRAVERSE;
+       if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
+               reset_flags |= TRANS_RESET_NOUNLOCK;
+       bch2_trans_reset(trans, reset_flags);
 
        return ret;
 err:
@@ -1053,6 +1075,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
        return 0;
 }
 
+void bch2_trans_commit_hook(struct btree_trans *trans,
+                           struct btree_trans_commit_hook *h)
+{
+       h->next = trans->hooks;
+       trans->hooks = h;
+}
+
 int __bch2_btree_insert(struct btree_trans *trans,
                        enum btree_id id, struct bkey_i *k)
 {
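Editor's note: the new btree_trans_commit_hook is an intrusive singly-linked list: bch2_trans_commit_hook() pushes a node onto trans->hooks, bch2_trans_commit_write_locked() runs each hook's fn just before the updates go in and aborts on the first non-zero return, and bch2_trans_reset() clears the list. A hedged sketch of the mechanics and of how a caller might embed and register one; the surrounding transaction machinery is assumed, and the container_of-style usage is my guess, not something shown in this diff:

    #include <stddef.h>
    #include <stdio.h>

    struct trans;
    struct commit_hook;
    typedef int (commit_hook_fn)(struct trans *, struct commit_hook *);

    struct commit_hook {
            commit_hook_fn          *fn;
            struct commit_hook      *next;
    };

    struct trans { struct commit_hook *hooks; };

    static void trans_commit_hook(struct trans *trans, struct commit_hook *h)
    {
            h->next = trans->hooks;
            trans->hooks = h;
    }

    /* What the commit path does with the list: */
    static int run_hooks(struct trans *trans)
    {
            for (struct commit_hook *h = trans->hooks; h; h = h->next) {
                    int ret = h->fn(trans, h);
                    if (ret)
                            return ret;     /* first failure aborts the commit */
            }
            return 0;
    }

    /* A caller embeds the hook in its own state and recovers it from the
     * hook pointer (plain offsetof arithmetic here): */
    struct my_update {
            struct commit_hook      hook;
            int                     value;
    };

    static int my_hook_fn(struct trans *trans, struct commit_hook *h)
    {
            struct my_update *u = (struct my_update *)
                    ((char *) h - offsetof(struct my_update, hook));

            printf("hook ran with value %d\n", u->value);
            return 0;
    }

    int main(void)
    {
            struct trans trans = { NULL };
            struct my_update u = { .hook = { .fn = my_hook_fn }, .value = 42 };

            trans_commit_hook(&trans, &u.hook);
            return run_hooks(&trans);
    }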
index c6d49f44aa01a28d2a0bea3ed2269670df4e4741..acf600387c9fe3378723289c9eb6c7bb8aa6e9af 100644 (file)
@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+       iter = bch2_trans_get_iter(&trans, i->id, i->from,
+                                  BTREE_ITER_PREFETCH|
+                                  BTREE_ITER_ALL_SNAPSHOTS);
        k = bch2_btree_iter_peek(iter);
 
        while (k.k && !(err = bkey_err(k))) {
@@ -273,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
        if (err)
                return err;
 
-       if (!i->size || !bkey_cmp(POS_MAX, i->from))
+       if (!i->size || !bpos_cmp(POS_MAX, i->from))
                return i->ret;
 
        bch2_trans_init(&trans, i->c, 0, 0);
@@ -289,8 +291,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                 * can't easily correctly restart a btree node traversal across
                 * all nodes, meh
                 */
-               i->from = bkey_cmp(POS_MAX, b->key.k.p)
-                       ? bkey_successor(b->key.k.p)
+               i->from = bpos_cmp(POS_MAX, b->key.k.p)
+                       ? bpos_successor(b->key.k.p)
                        : b->key.k.p;
 
                if (!i->size)
index 592dd80cf963959f4b23f1ae4f463227bfa655e6..cf4ce2e7f29c1d51cad6c8c14f43822ea26a6079 100644 (file)
@@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 int bch2_dirent_create(struct btree_trans *trans,
                       u64 dir_inum, const struct bch_hash_info *hash_info,
                       u8 type, const struct qstr *name, u64 dst_inum,
-                      int flags)
+                      u64 *dir_offset, int flags)
 {
        struct bkey_i_dirent *dirent;
        int ret;
@@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                            dir_inum, &dirent->k_i, flags);
+       ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+                           dir_inum, &dirent->k_i, flags);
+       *dir_offset = dirent->k.p.offset;
+
+       return ret;
 }
 
 static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
 int bch2_dirent_rename(struct btree_trans *trans,
                       u64 src_dir, struct bch_hash_info *src_hash,
                       u64 dst_dir, struct bch_hash_info *dst_hash,
-                      const struct qstr *src_name, u64 *src_inum,
-                      const struct qstr *dst_name, u64 *dst_inum,
+                      const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
+                      const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
                       enum bch_rename_mode mode)
 {
        struct btree_iter *src_iter = NULL, *dst_iter = NULL;
@@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                new_dst->k.p = src_iter->pos;
                                bch2_trans_update(trans, src_iter,
                                                  &new_dst->k_i, 0);
-                               goto out;
+                               goto out_set_offset;
                        } else {
                                /* If we're overwriting, we can't insert new_dst
                                 * at a different slot because it has to
@@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
 
        bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
        bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+out_set_offset:
+       *src_offset = new_src->k.p.offset;
+       *dst_offset = new_dst->k.p.offset;
 out:
        bch2_trans_iter_put(trans, src_iter);
        bch2_trans_iter_put(trans, dst_iter);
index 34769371dd13d1725dc96f2c6d01a5fb10d9ecd6..e1d8ce377d43755cd5584edf1afa05d0ba65495e 100644 (file)
@@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
 
 int bch2_dirent_create(struct btree_trans *, u64,
                       const struct bch_hash_info *, u8,
-                      const struct qstr *, u64, int);
+                      const struct qstr *, u64, u64 *, int);
 
 int bch2_dirent_delete_at(struct btree_trans *,
                          const struct bch_hash_info *,
@@ -46,8 +46,8 @@ enum bch_rename_mode {
 int bch2_dirent_rename(struct btree_trans *,
                       u64, struct bch_hash_info *,
                       u64, struct bch_hash_info *,
-                      const struct qstr *, u64 *,
-                      const struct qstr *, u64 *,
+                      const struct qstr *, u64 *, u64 *,
+                      const struct qstr *, u64 *, u64 *,
                       enum bch_rename_mode);
 
 struct btree_iter *
index 1dba7e9994a7e656ca84a2729dff9576bbf8f4c6..f712f685dd0e09122dd4b91a4a63ac2e9fd83cd2 100644 (file)
@@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                if (ret)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
index a7e0408213a9506d8b6d16fc4059d0de68ac0d3e..b07d39555eb6e1fe92c1d5233ef3013aae359e14 100644 (file)
@@ -180,7 +180,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
-       if (bp.v->min_key.snapshot)
+       if (c->sb.version < bcachefs_metadata_version_snapshot &&
+           bp.v->min_key.snapshot)
                return "invalid min_key.snapshot";
 
        return bch2_bkey_ptrs_invalid(c, k);
@@ -212,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
            btree_node_type_is_extents(btree_id) &&
            bkey_cmp(bp.v->min_key, POS_MIN))
                bp.v->min_key = write
-                       ? bkey_predecessor(bp.v->min_key)
-                       : bkey_successor(bp.v->min_key);
+                       ? bpos_nosnap_predecessor(bp.v->min_key)
+                       : bpos_nosnap_successor(bp.v->min_key);
 }
 
 /* KEY_TYPE_extent: */
index c8069dfb90ffceae47f4b019f9b3d9a29ada12ee..ccee43a2019da10ca572faf6c6cf4463bc7930e5 100644 (file)
@@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s);
 
 /* Generic extent code: */
 
+enum bch_extent_overlap {
+       BCH_EXTENT_OVERLAP_ALL          = 0,
+       BCH_EXTENT_OVERLAP_BACK         = 1,
+       BCH_EXTENT_OVERLAP_FRONT        = 2,
+       BCH_EXTENT_OVERLAP_MIDDLE       = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+                                                         const struct bkey *m)
+{
+       int cmp1 = bkey_cmp(k->p, m->p) < 0;
+       int cmp2 = bkey_cmp(bkey_start_pos(k),
+                           bkey_start_pos(m)) > 0;
+
+       return (cmp1 << 1) + cmp2;
+}
+
 int bch2_cut_front_s(struct bpos, struct bkey_s);
 int bch2_cut_back_s(struct bpos, struct bkey_s);
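
bch2_extent_overlap() above packs two comparisons into a two-bit value: bit 1 is set when k ends before m ends, bit 0 when k starts after m starts. A standalone illustration of that encoding, using plain integers in place of struct bpos (the positions below are made up):

#include <assert.h>

enum overlap { ALL = 0, BACK = 1, FRONT = 2, MIDDLE = 3 };

/* Same encoding as bch2_extent_overlap(), on half-open integer ranges */
static enum overlap overlap(int k_start, int k_end, int m_start, int m_end)
{
	int cmp1 = k_end < m_end;	/* k stops before m does */
	int cmp2 = k_start > m_start;	/* k begins inside m     */

	return (cmp1 << 1) + cmp2;
}

int main(void)
{
	assert(overlap(0, 10, 2, 8)  == ALL);	 /* k covers m entirely */
	assert(overlap(5, 12, 0, 10) == BACK);	 /* k overlaps m's tail */
	assert(overlap(0, 5,  2, 10) == FRONT);	 /* k overlaps m's head */
	assert(overlap(3, 6,  0, 10) == MIDDLE); /* k splits m in two   */
	return 0;
}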
 
index 503ce1920f395f935a7f3ba57f2cad2afee70bc2..83c2168ce480a92169b68a22c82ea9a64a8a02dd 100644 (file)
@@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *dir_iter = NULL;
+       struct btree_iter *inode_iter = NULL;
        struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
-       u64 now = bch2_current_time(trans->c);
+       u64 now = bch2_current_time(c);
+       u64 dir_offset = 0;
        int ret;
 
        dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
@@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        if (!name)
                new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-       ret = bch2_inode_create(trans, new_inode);
+       inode_iter = bch2_inode_create(trans, new_inode);
+       ret = PTR_ERR_OR_ZERO(inode_iter);
        if (ret)
                goto err;
 
@@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
                ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
                                         mode_to_type(new_inode->bi_mode),
                                         name, new_inode->bi_inum,
+                                        &dir_offset,
                                         BCH_HASH_SET_MUST_CREATE);
                if (ret)
                        goto err;
        }
+
+       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+               new_inode->bi_dir               = dir_u->bi_inum;
+               new_inode->bi_dir_offset        = dir_offset;
+       }
+
+       ret = bch2_inode_write(trans, inode_iter, new_inode);
 err:
+       bch2_trans_iter_put(trans, inode_iter);
        bch2_trans_iter_put(trans, dir_iter);
        return ret;
 }
@@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
                    u64 inum, struct bch_inode_unpacked *dir_u,
                    struct bch_inode_unpacked *inode_u, const struct qstr *name)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
        struct bch_hash_info dir_hash;
-       u64 now = bch2_current_time(trans->c);
+       u64 now = bch2_current_time(c);
+       u64 dir_offset = 0;
        int ret;
 
        inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
@@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
        inode_u->bi_ctime = now;
        bch2_inode_nlink_inc(inode_u);
 
+       inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+
        dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
        ret = PTR_ERR_OR_ZERO(dir_iter);
        if (ret)
@@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 
        dir_u->bi_mtime = dir_u->bi_ctime = now;
 
-       dir_hash = bch2_hash_info_init(trans->c, dir_u);
+       dir_hash = bch2_hash_info_init(c, dir_u);
 
-       ret =   bch2_dirent_create(trans, dir_inum, &dir_hash,
-                                 mode_to_type(inode_u->bi_mode),
-                                 name, inum, BCH_HASH_SET_MUST_CREATE) ?:
-               bch2_inode_write(trans, dir_iter, dir_u) ?:
+       ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+                                mode_to_type(inode_u->bi_mode),
+                                name, inum, &dir_offset,
+                                BCH_HASH_SET_MUST_CREATE);
+       if (ret)
+               goto err;
+
+       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+               inode_u->bi_dir         = dir_inum;
+               inode_u->bi_dir_offset  = dir_offset;
+       }
+
+       ret =   bch2_inode_write(trans, dir_iter, dir_u) ?:
                bch2_inode_write(trans, inode_iter, inode_u);
 err:
        bch2_trans_iter_put(trans, dir_iter);
@@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans,
                      struct bch_inode_unpacked *inode_u,
                      const struct qstr *name)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
                          *inode_iter = NULL;
        struct bch_hash_info dir_hash;
-       u64 inum, now = bch2_current_time(trans->c);
+       u64 inum, now = bch2_current_time(c);
        struct bkey_s_c k;
        int ret;
 
@@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       dir_hash = bch2_hash_info_init(trans->c, dir_u);
+       dir_hash = bch2_hash_info_init(c, dir_u);
 
        dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
                                                 name, BTREE_ITER_INTENT);
@@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans,
                      const struct qstr *dst_name,
                      enum bch_rename_mode mode)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
        struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
        struct bch_hash_info src_hash, dst_hash;
-       u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+       u64 src_inode, src_offset, dst_inode, dst_offset;
+       u64 now = bch2_current_time(c);
        int ret;
 
        src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
@@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+       src_hash = bch2_hash_info_init(c, src_dir_u);
 
        if (dst_dir != src_dir) {
                dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
@@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans,
                if (ret)
                        goto err;
 
-               dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+               dst_hash = bch2_hash_info_init(c, dst_dir_u);
        } else {
                dst_dir_u = src_dir_u;
                dst_hash = src_hash;
@@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans,
        ret = bch2_dirent_rename(trans,
                                 src_dir, &src_hash,
                                 dst_dir, &dst_hash,
-                                src_name, &src_inode,
-                                dst_name, &dst_inode,
+                                src_name, &src_inode, &src_offset,
+                                dst_name, &dst_inode, &dst_offset,
                                 mode);
        if (ret)
                goto err;
@@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans,
                        goto err;
        }
 
+       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+               src_inode_u->bi_dir             = dst_dir_u->bi_inum;
+               src_inode_u->bi_dir_offset      = dst_offset;
+
+               if (mode == BCH_RENAME_EXCHANGE) {
+                       dst_inode_u->bi_dir             = src_dir_u->bi_inum;
+                       dst_inode_u->bi_dir_offset      = src_offset;
+               }
+       }
+
        if (mode == BCH_RENAME_OVERWRITE) {
                if (S_ISDIR(src_inode_u->bi_mode) !=
                    S_ISDIR(dst_inode_u->bi_mode)) {
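
The create, link and rename paths above now share the same final step on filesystems new enough to carry inode backpointers: once the dirent's position is known, mirror it into the target inode. A minimal sketch of that shared step as a hypothetical helper (the helper itself is not part of this commit; the field and constant names are taken from the hunks above):

static void inode_set_backpointer(struct bch_fs *c,
				  struct bch_inode_unpacked *inode_u,
				  u64 dir_inum, u64 dir_offset)
{
	/* Only filesystems at or above the new metadata version carry
	 * bi_dir/bi_dir_offset; older filesystems are left untouched. */
	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
		inode_u->bi_dir		= dir_inum;
		inode_u->bi_dir_offset	= dir_offset;
	}
}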
index 9dc162f2e1759d0c9569c43213f259d4befae287..62788ae15eff3f9cf5078afaa93f9bd24f89f2db 100644 (file)
@@ -675,6 +675,39 @@ retry:
                        continue;
                }
 
+               if (!target.bi_nlink &&
+                   !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+                   (target.bi_dir != k.k->p.inode ||
+                    target.bi_dir_offset != k.k->p.offset) &&
+                   (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+                                "inode %llu has wrong backpointer:\n"
+                                "got       %llu:%llu\n"
+                                "should be %llu:%llu",
+                                d_inum,
+                                target.bi_dir,
+                                target.bi_dir_offset,
+                                k.k->p.inode,
+                                k.k->p.offset) ||
+                    c->opts.version_upgrade)) {
+                       struct bkey_inode_buf p;
+
+                       target.bi_dir           = k.k->p.inode;
+                       target.bi_dir_offset    = k.k->p.offset;
+                       bch2_trans_unlock(&trans);
+
+                       bch2_inode_pack(c, &p, &target);
+
+                       ret = bch2_btree_insert(c, BTREE_ID_inodes,
+                                               &p.inode.k_i, NULL, NULL,
+                                               BTREE_INSERT_NOFAIL|
+                                               BTREE_INSERT_LAZY_RW);
+                       if (ret) {
+                               bch_err(c, "error in fsck: error %i updating inode", ret);
+                               goto err;
+                       }
+                       continue;
+               }
+
                if (fsck_err_on(have_target &&
                                d.v->d_type !=
                                mode_to_type(target.bi_mode), c,
@@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans,
                do_update = true;
        }
 
+       if (!S_ISDIR(u.bi_mode) &&
+           u.bi_nlink &&
+           !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+           (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+                        "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
+            c->opts.version_upgrade)) {
+               u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+               do_update = true;
+       }
+
        if (do_update) {
                struct bkey_inode_buf p;
 
index 4559e77f91f0215ee2bda66e2f2b801596e0809d..f1665ca85da694962c756b6b30b8490cf7b2aade 100644 (file)
@@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans,
                return PTR_ERR(inode_p);
 
        bch2_inode_pack(trans->c, inode_p, inode);
+       inode_p->inode.k.p.snapshot = iter->snapshot;
        bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
        return 0;
 }
@@ -469,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k)
        }
 }
 
-int bch2_inode_create(struct btree_trans *trans,
-                     struct bch_inode_unpacked *inode_u)
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
+                                    struct bch_inode_unpacked *inode_u)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_inode_buf *inode_p;
        struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 min, max, start, *hint;
@@ -493,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans,
 
        if (start >= max || start < min)
                start = min;
-
-       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-       if (IS_ERR(inode_p))
-               return PTR_ERR(inode_p);
 again:
        for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
@@ -520,7 +516,7 @@ again:
        bch2_trans_iter_put(trans, iter);
 
        if (ret)
-               return ret;
+               return ERR_PTR(ret);
 
        if (start != min) {
                /* Retry from start */
@@ -528,15 +524,12 @@ again:
                goto again;
        }
 
-       return -ENOSPC;
+       return ERR_PTR(-ENOSPC);
 found_slot:
        *hint                   = k.k->p.offset;
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
-
-       ret = bch2_inode_write(trans, iter, inode_u);
-       bch2_trans_iter_put(trans, iter);
-       return ret;
+       return iter;
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
index 1caf036ae928fe822a8426fe952ac50774c891b6..6bad6dfb79891ccf941ded70f125bb4d33b61bfb 100644 (file)
@@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     uid_t, gid_t, umode_t, dev_t,
                     struct bch_inode_unpacked *);
 
-int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
+struct btree_iter *bch2_inode_create(struct btree_trans *,
+                                    struct bch_inode_unpacked *);
 
 int bch2_inode_rm(struct bch_fs *, u64, bool);
 
index 284d398bf5859895ce6962c92bba05256c8b7b40..36b10cb7ae6285ae3b461cdf1f89024a4303138d 100644 (file)
@@ -322,6 +322,9 @@ int bch2_extent_update(struct btree_trans *trans,
 
                if (i_sectors_delta || new_i_size) {
                        bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+                       inode_p.inode.k.p.snapshot = iter->snapshot;
+
                        bch2_trans_update(trans, inode_iter,
                                          &inode_p.inode.k_i, 0);
                }
@@ -437,6 +440,8 @@ int bch2_write_index_default(struct bch_write_op *op)
 
                k = bch2_keylist_front(keys);
 
+               k->k.p.snapshot = iter->snapshot;
+
                bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
                bkey_copy(sk.k, k);
                bch2_cut_front(iter->pos, sk.k);
index 1f26139d9a1b8d463115027e7f5972e8a09f174a..69c553a6206f5ca9704868beac245f015779ffd3 100644 (file)
@@ -914,14 +914,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
        if (dynamic_fault("bcachefs:add:journal_alloc"))
                return -ENOMEM;
 
+       /* 1/128th of the device by default: */
+       nr = ca->mi.nbuckets >> 7;
+
        /*
-        * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+        * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
         * is smaller:
         */
-       nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+       nr = clamp_t(unsigned, nr,
                     BCH_JOURNAL_BUCKETS_MIN,
-                    min(1 << 10,
-                        (1 << 20) / ca->mi.bucket_size));
+                    min(1 << 13,
+                        (1 << 24) / ca->mi.bucket_size));
 
        return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
 }
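
For scale, the new journal sizing works out as follows on a hypothetical device of 2,000,000 buckets of 2048 sectors (1 MiB) each:

/*
 *   nr  = 2,000,000 >> 7                  = 15,625 buckets (1/128th)
 *   cap = min(1 << 13, (1 << 24) / 2048)  = min(8192, 8192) = 8192
 *
 * so nr is clamped to 8192 buckets = 8 GiB of journal.  Under the old
 * limits the same device was capped at min(1 << 10, (1 << 20) / 2048)
 * = 512 buckets = 512 MiB.
 */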
index 54f2e2053bc0bb43acfd81c7af4e62dfa3f5b618..c7fa03cfbde6c316095c66a0d896afdd8319b6c2 100644 (file)
@@ -1452,7 +1452,7 @@ void bch2_journal_write(struct closure *cl)
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;
 
-       if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change)
+       if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
index bbf8e5ad8aa09d4e290faa8a3ae0a9309a7f330c..4a5b50ed71b0dd46519915f57706eb2d1d89be59 100644 (file)
@@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                                j->prereserved.remaining,
                                atomic_read(&c->btree_cache.dirty),
                                c->btree_cache.used,
-                               c->btree_key_cache.nr_dirty,
-                               c->btree_key_cache.nr_keys);
+                               atomic_long_read(&c->btree_key_cache.nr_dirty),
+                               atomic_long_read(&c->btree_key_cache.nr_keys));
 
                nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
 
index f863fd74bd7884772e299d5963fb58dbfa0e8d52..3d1bf87ec54468e5eab2a962cc270b4f0d20c654 100644 (file)
@@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id  l_btree_id,
 {
        return (cmp_int(l_btree_id,     r->btree_id) ?:
                cmp_int(l_level,        r->level) ?:
-               bkey_cmp(l_pos, r->k->k.p));
+               bpos_cmp(l_pos, r->k->k.p));
 }
 
 static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
 {
        return (cmp_int(l->btree_id,    r->btree_id) ?:
                cmp_int(l->level,       r->level) ?:
-               bkey_cmp(l->k->k.p,     r->k->k.p));
+               bpos_cmp(l->k->k.p,     r->k->k.p));
 }
 
 static size_t journal_key_search(struct journal_keys *journal_keys,
@@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign
        if (iter->idx > idx ||
            (iter->idx == idx &&
             biter->last &&
-            bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
+            bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
                iter->idx++;
 }
 
@@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
                        bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
 
                if (btree_k.k && journal_k.k) {
-                       int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+                       int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
 
                        if (!cmp)
                                bch2_journal_iter_advance_btree(iter);
@@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
                ret = iter->last == journal ? journal_k : btree_k;
 
                if (iter->b &&
-                   bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+                   bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
                        iter->journal.idx = iter->journal.keys->nr;
                        iter->last = none;
                        return bkey_s_c_null;
@@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 
        return  cmp_int(l->btree_id,    r->btree_id) ?:
                cmp_int(l->level,       r->level) ?:
-               bkey_cmp(l->k->k.p, r->k->k.p) ?:
+               bpos_cmp(l->k->k.p, r->k->k.p) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
 }
@@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                while (src + 1 < keys.d + keys.nr &&
                       src[0].btree_id  == src[1].btree_id &&
                       src[0].level     == src[1].level &&
-                      !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+                      !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
                        src++;
 
                *dst++ = *src++;
@@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
        return  cmp_int(r->level,       l->level) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->btree_id,    r->btree_id) ?:
-               bkey_cmp(l->k->k.p,     r->k->k.p);
+               bpos_cmp(l->k->k.p,     r->k->k.p);
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
@@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        }
 
+       if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) {
+               bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+               ret = -EINVAL;
+               goto err;
+       }
+
        if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
                bch_info(c, "alloc_v2 feature bit not set, fsck required");
                c->opts.fsck = true;
@@ -1338,6 +1345,7 @@ int bch2_fs_initialize(struct bch_fs *c)
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
        bch2_inode_pack(c, &packed_inode, &root_inode);
+       packed_inode.inode.k.p.snapshot = U32_MAX;
 
        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_inodes,
index dfb12fdd481446beb2a4059c4d0476207bf4ace6..7507b6bcc13f2e47e2b57bfabc831e8d56254dcd 100644 (file)
@@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
                goto err;
        }
 err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
                goto err;
        }
 err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -113,7 +115,7 @@ err:
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        BUG_ON(i);
 err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -166,7 +169,7 @@ err:
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        BUG_ON(i);
 err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
                BUG_ON(k.k->p.offset != i);
                i += 2;
        }
-       bch2_trans_iter_free(&trans, iter);
+       bch2_trans_iter_put(&trans, iter);
 
        BUG_ON(i != nr * 2);
 
@@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
                if (i == nr * 2)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 err:
        bch2_trans_exit(&trans);
        return ret;
@@ -318,7 +323,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                BUG_ON(k.k->size != 8);
                i += 16;
        }
-       bch2_trans_iter_free(&trans, iter);
+       bch2_trans_iter_put(&trans, iter);
 
        BUG_ON(i != nr);
 
@@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                if (i == nr)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 err:
        bch2_trans_exit(&trans);
        return 0;
@@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
        k = bch2_btree_iter_peek(iter);
        BUG_ON(k.k);
 
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
        return 0;
 }
@@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
        k = bch2_btree_iter_peek(iter);
        BUG_ON(k.k);
 
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
        return 0;
 }
@@ -473,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr)
        for (i = 0; i < nr; i++) {
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = test_rand();
+               k.k.p.snapshot = U32_MAX;
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
@@ -508,7 +519,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
                }
        }
 
-       bch2_trans_iter_free(&trans, iter);
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -549,7 +560,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
                }
        }
 
-       bch2_trans_iter_free(&trans, iter);
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -630,6 +641,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
                if (++i == nr)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -645,6 +658,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 
        for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
                ;
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -671,6 +686,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
                        break;
                }
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
        return ret;
 }
index 351eac7946a4950398c85d87648119ee788daedf..ba2196fc4ac1eb887c4ae66588999bf6f8d9fa07 100644 (file)
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
@@ -8,27 +9,29 @@
  * Code partially derived from nft_hash
  * Rewritten with rehash code from br_multicast plus single list
  * pointer as suggested by Josh Triplett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/atomic.h>
-#include <linux/cpumask.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
 #include <linux/sched.h>
+#include <linux/rculist.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/jhash.h>
+#include <linux/overflow.h>
 #include <linux/random.h>
 #include <linux/rhashtable.h>
 #include <linux/err.h>
+#include <linux/export.h>
 
 #define HASH_DEFAULT_SIZE      64UL
 #define HASH_MIN_SIZE          4U
-#define BUCKET_LOCKS_PER_CPU   32UL
+
+union nested_table {
+       union nested_table __rcu *table;
+       struct rhash_lock_head __rcu *bucket;
+};
 
 static u32 head_hashfn(struct rhashtable *ht,
                       const struct bucket_table *tbl,
@@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht,
        return rht_head_hashfn(ht, tbl, he, ht->p);
 }
 
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
-                             gfp_t gfp)
-{
-       unsigned int i, size;
-       unsigned int nr_pcpus = num_possible_cpus();
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
 
-       nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
-       size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+       return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 
-       /* Never allocate more than 0.5 locks per bucket */
-       size = min_t(unsigned int, size, tbl->size >> 1);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+       if (!debug_locks)
+               return 1;
+       if (unlikely(tbl->nest))
+               return 1;
+       return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
 
-       if (sizeof(spinlock_t) != 0) {
-               tbl->locks = NULL;
-               if (gfp != GFP_KERNEL)
-                       gfp |= __GFP_NOWARN | __GFP_NORETRY;
+static inline union nested_table *nested_table_top(
+       const struct bucket_table *tbl)
+{
+       /* The top-level bucket entry does not need RCU protection
+        * because it's set at the same time as tbl->nest.
+        */
+       return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
+}
 
-               if (!tbl->locks)
-                       tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
-                                                  gfp);
-               if (!tbl->locks)
-                       return -ENOMEM;
-               for (i = 0; i < size; i++)
-                       spin_lock_init(&tbl->locks[i]);
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       const unsigned int len = 1 << shift;
+       unsigned int i;
+
+       ntbl = rcu_dereference_protected(ntbl->table, 1);
+       if (!ntbl)
+               return;
+
+       if (size > len) {
+               size >>= shift;
+               for (i = 0; i < len; i++)
+                       nested_table_free(ntbl + i, size);
        }
-       tbl->locks_mask = size - 1;
 
-       return 0;
+       kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+       unsigned int size = tbl->size >> tbl->nest;
+       unsigned int len = 1 << tbl->nest;
+       union nested_table *ntbl;
+       unsigned int i;
+
+       ntbl = nested_table_top(tbl);
+
+       for (i = 0; i < len; i++)
+               nested_table_free(ntbl + i, size);
+
+       kfree(ntbl);
 }
 
 static void bucket_table_free(struct bucket_table *tbl)
 {
-       if (tbl)
-               kvfree(tbl->locks);
+       if (tbl->nest)
+               nested_bucket_table_free(tbl);
 
        kvfree(tbl);
 }
@@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
        bucket_table_free(container_of(head, struct bucket_table, rcu));
 }
 
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+                                             union nested_table __rcu **prev,
+                                             bool leaf)
+{
+       union nested_table *ntbl;
+       int i;
+
+       ntbl = rcu_dereference(*prev);
+       if (ntbl)
+               return ntbl;
+
+       ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+       if (ntbl && leaf) {
+               for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
+                       INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
+       }
+
+       if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
+               return ntbl;
+       /* Raced with another thread. */
+       kfree(ntbl);
+       return rcu_dereference(*prev);
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+                                                     size_t nbuckets,
+                                                     gfp_t gfp)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       struct bucket_table *tbl;
+       size_t size;
+
+       if (nbuckets < (1 << (shift + 1)))
+               return NULL;
+
+       size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+       tbl = kzalloc(size, gfp);
+       if (!tbl)
+               return NULL;
+
+       if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+                               false)) {
+               kfree(tbl);
+               return NULL;
+       }
+
+       tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+       return tbl;
+}
+
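
Some worked numbers for the nested-table geometry above, assuming a common configuration of 4 KiB pages and 8-byte pointers (both assumptions, not taken from this commit):

/*
 *   shift = PAGE_SHIFT - ilog2(sizeof(void *)) = 12 - 3 = 9
 *
 * so each nested table is exactly one page holding 1 << 9 = 512 bucket
 * pointers, and nested_bucket_table_alloc() bails out for anything
 * smaller than 1 << (shift + 1) = 1024 buckets.
 */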
 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
                                               size_t nbuckets,
                                               gfp_t gfp)
@@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
        size_t size;
        int i;
 
-       size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
-       if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
-           gfp != GFP_KERNEL)
-               tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
-       if (tbl == NULL && gfp == GFP_KERNEL)
-               tbl = vzalloc(size);
-       if (tbl == NULL)
-               return NULL;
+       tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
 
-       tbl->size = nbuckets;
+       size = nbuckets;
 
-       if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
-               bucket_table_free(tbl);
-               return NULL;
+       if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
+               tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+               nbuckets = 0;
        }
 
+       if (tbl == NULL)
+               return NULL;
+
+       tbl->size = size;
+
+       rcu_head_init(&tbl->rcu);
        INIT_LIST_HEAD(&tbl->walkers);
 
-       get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+       tbl->hash_rnd = get_random_u32();
 
        for (i = 0; i < nbuckets; i++)
-               INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+               INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
 
        return tbl;
 }
@@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
        return new_tbl;
 }
 
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+                                struct rhash_lock_head __rcu **bkt,
+                                unsigned int old_hash)
 {
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-       struct bucket_table *new_tbl = rhashtable_last_table(ht,
-               rht_dereference_rcu(old_tbl->future_tbl, ht));
-       struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
-       int err = -ENOENT;
+       struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+       int err = -EAGAIN;
        struct rhash_head *head, *next, *entry;
-       spinlock_t *new_bucket_lock;
+       struct rhash_head __rcu **pprev = NULL;
        unsigned int new_hash;
 
-       rht_for_each(entry, old_tbl, old_hash) {
+       if (new_tbl->nest)
+               goto out;
+
+       err = -ENOENT;
+
+       rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+                         old_tbl, old_hash) {
                err = 0;
                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
 
@@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 
        new_hash = head_hashfn(ht, new_tbl, entry);
 
-       new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+       rht_lock(new_tbl, &new_tbl->buckets[new_hash]);
 
-       spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
-       head = rht_dereference_bucket(new_tbl->buckets[new_hash],
-                                     new_tbl, new_hash);
+       head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
 
        RCU_INIT_POINTER(entry->next, head);
 
-       rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
-       spin_unlock(new_bucket_lock);
+       rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
 
-       rcu_assign_pointer(*pprev, next);
+       if (pprev)
+               rcu_assign_pointer(*pprev, next);
+       else
+                       /* Need to preserve the bit lock. */
+               rht_assign_locked(bkt, next);
 
 out:
        return err;
 }
 
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
                                    unsigned int old_hash)
 {
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-       spinlock_t *old_bucket_lock;
+       struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+       int err;
 
-       old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+       if (!bkt)
+               return 0;
+       rht_lock(old_tbl, bkt);
 
-       spin_lock_bh(old_bucket_lock);
-       while (!rhashtable_rehash_one(ht, old_hash))
+       while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
                ;
-       old_tbl->rehash++;
-       spin_unlock_bh(old_bucket_lock);
+
+       if (err == -ENOENT)
+               err = 0;
+       rht_unlock(old_tbl, bkt);
+
+       return err;
 }
 
 static int rhashtable_rehash_attach(struct rhashtable *ht,
                                    struct bucket_table *old_tbl,
                                    struct bucket_table *new_tbl)
 {
-       /* Protect future_tbl using the first bucket lock. */
-       spin_lock_bh(old_tbl->locks);
-
-       /* Did somebody beat us to it? */
-       if (rcu_access_pointer(old_tbl->future_tbl)) {
-               spin_unlock_bh(old_tbl->locks);
-               return -EEXIST;
-       }
-
        /* Make insertions go into the new, empty table right away. Deletions
         * and lookups will be attempted in both tables until we synchronize.
+        * As cmpxchg() provides strong barriers, we do not need
+        * rcu_assign_pointer().
         */
-       rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
 
-       spin_unlock_bh(old_tbl->locks);
+       if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+                   new_tbl) != NULL)
+               return -EEXIST;
 
        return 0;
 }
@@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
        struct bucket_table *new_tbl;
        struct rhashtable_walker *walker;
        unsigned int old_hash;
+       int err;
 
        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
        if (!new_tbl)
                return 0;
 
-       for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
-               rhashtable_rehash_chain(ht, old_hash);
+       for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+               err = rhashtable_rehash_chain(ht, old_hash);
+               if (err)
+                       return err;
+               cond_resched();
+       }
 
        /* Publish the new table pointer. */
        rcu_assign_pointer(ht->tbl, new_tbl);
@@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
        spin_lock(&ht->lock);
        list_for_each_entry(walker, &old_tbl->walkers, list)
                walker->tbl = NULL;
-       spin_unlock(&ht->lock);
 
        /* Wait for readers. All new readers will see the new
         * table, and thus no references to the old table will
         * remain.
+        * We do this inside the locked region so that
+        * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+        * to check if it should not re-link the table.
         */
        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+       spin_unlock(&ht->lock);
 
        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
 
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+                                  struct bucket_table *old_tbl,
+                                  unsigned int size)
 {
-       struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+       struct bucket_table *new_tbl;
        int err;
 
-       old_tbl = rhashtable_last_table(ht, old_tbl);
+       ASSERT_RHT_MUTEX(ht);
 
-       new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+       new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (new_tbl == NULL)
                return -ENOMEM;
 
@@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht)
        return err;
 }
 
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht:                the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., the smallest
+ * size would not cause it to expand right away automatically.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
 static int rhashtable_shrink(struct rhashtable *ht)
 {
-       struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+       struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        unsigned int nelems = atomic_read(&ht->nelems);
        unsigned int size = 0;
-       int err;
 
        if (nelems)
                size = roundup_pow_of_two(nelems * 3 / 2);
@@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
        if (rht_dereference(old_tbl->future_tbl, ht))
                return -EEXIST;
 
-       new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-       if (new_tbl == NULL)
-               return -ENOMEM;
-
-       err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
-       if (err)
-               bucket_table_free(new_tbl);
-
-       return err;
+       return rhashtable_rehash_alloc(ht, old_tbl, size);
 }
 
 static void rht_deferred_worker(struct work_struct *work)
@@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work)
        tbl = rhashtable_last_table(ht, tbl);
 
        if (rht_grow_above_75(ht, tbl))
-               rhashtable_expand(ht);
+               err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
        else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-               rhashtable_shrink(ht);
+               err = rhashtable_shrink(ht);
+       else if (tbl->nest)
+               err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+
+       if (!err || err == -EEXIST) {
+               int nerr;
 
-       err = rhashtable_rehash_table(ht);
+               nerr = rhashtable_rehash_table(ht);
+               err = err ?: nerr;
+       }
 
        mutex_unlock(&ht->mutex);
 
@@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work)
                schedule_work(&ht->run_work);
 }
 
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
-                                       struct bucket_table *tbl,
-                                       unsigned int hash)
-{
-       unsigned int elasticity = ht->elasticity;
-       struct rhash_head *head;
-
-       rht_for_each(head, tbl, hash)
-               if (!--elasticity)
-                       return true;
-
-       return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
-                            struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+                                   struct bucket_table *tbl)
 {
        struct bucket_table *old_tbl;
        struct bucket_table *new_tbl;
@@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
 
        err = -ENOMEM;
 
-       new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+       new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
        if (new_tbl == NULL)
                goto fail;
 
@@ -363,7 +470,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
 
 fail:
        /* Do not fail the insert if someone else did a rehash. */
-       if (likely(rcu_dereference_raw(tbl->future_tbl)))
+       if (likely(rcu_access_pointer(tbl->future_tbl)))
                return 0;
 
        /* Schedule async rehash to retry allocation in process context. */
@@ -373,57 +480,485 @@ fail:
        return err;
 }
 
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
-                                           const void *key,
-                                           struct rhash_head *obj,
-                                           struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+                                  struct rhash_lock_head __rcu **bkt,
+                                  struct bucket_table *tbl, unsigned int hash,
+                                  const void *key, struct rhash_head *obj)
 {
+       struct rhashtable_compare_arg arg = {
+               .ht = ht,
+               .key = key,
+       };
+       struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *head;
-       unsigned int hash;
-       int err;
+       int elasticity;
+
+       elasticity = RHT_ELASTICITY;
+       rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               struct rhlist_head *list;
+               struct rhlist_head *plist;
+
+               elasticity--;
+               if (!key ||
+                   (ht->p.obj_cmpfn ?
+                    ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+                    rhashtable_compare(&arg, rht_obj(ht, head)))) {
+                       pprev = &head->next;
+                       continue;
+               }
 
-       tbl = rhashtable_last_table(ht, tbl);
-       hash = head_hashfn(ht, tbl, obj);
-       spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+               if (!ht->rhlist)
+                       return rht_obj(ht, head);
 
-       err = -EEXIST;
-       if (key && rhashtable_lookup_fast(ht, key, ht->p))
-               goto exit;
+               list = container_of(obj, struct rhlist_head, rhead);
+               plist = container_of(head, struct rhlist_head, rhead);
 
-       err = -E2BIG;
-       if (unlikely(rht_grow_above_max(ht, tbl)))
-               goto exit;
+               RCU_INIT_POINTER(list->next, plist);
+               head = rht_dereference_bucket(head->next, tbl, hash);
+               RCU_INIT_POINTER(list->rhead.next, head);
+               if (pprev)
+                       rcu_assign_pointer(*pprev, obj);
+               else
+                       /* Need to preserve the bit lock */
+                       rht_assign_locked(bkt, obj);
+
+               return NULL;
+       }
+
+       if (elasticity <= 0)
+               return ERR_PTR(-EAGAIN);
+
+       return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(
+       struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
+       struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
+       void *data)
+{
+       struct bucket_table *new_tbl;
+       struct rhash_head *head;
+
+       if (!IS_ERR_OR_NULL(data))
+               return ERR_PTR(-EEXIST);
+
+       if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+               return ERR_CAST(data);
+
+       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+       if (new_tbl)
+               return new_tbl;
+
+       if (PTR_ERR(data) != -ENOENT)
+               return ERR_CAST(data);
 
-       err = -EAGAIN;
-       if (rhashtable_check_elasticity(ht, tbl, hash) ||
-           rht_grow_above_100(ht, tbl))
-               goto exit;
+       if (unlikely(rht_grow_above_max(ht, tbl)))
+               return ERR_PTR(-E2BIG);
 
-       err = 0;
+       if (unlikely(rht_grow_above_100(ht, tbl)))
+               return ERR_PTR(-EAGAIN);
 
-       head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+       head = rht_ptr(bkt, tbl, hash);
 
        RCU_INIT_POINTER(obj->next, head);
+       if (ht->rhlist) {
+               struct rhlist_head *list;
 
-       rcu_assign_pointer(tbl->buckets[hash], obj);
+               list = container_of(obj, struct rhlist_head, rhead);
+               RCU_INIT_POINTER(list->next, NULL);
+       }
+
+       /* bkt is always the head of the list, so it holds
+        * the lock, which we need to preserve
+        */
+       rht_assign_locked(bkt, obj);
 
        atomic_inc(&ht->nelems);
+       if (rht_grow_above_75(ht, tbl))
+               schedule_work(&ht->run_work);
+
+       return NULL;
+}
+
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+                                  struct rhash_head *obj)
+{
+       struct bucket_table *new_tbl;
+       struct bucket_table *tbl;
+       struct rhash_lock_head __rcu **bkt;
+       unsigned int hash;
+       void *data;
+
+       new_tbl = rcu_dereference(ht->tbl);
+
+       do {
+               tbl = new_tbl;
+               hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+               if (rcu_access_pointer(tbl->future_tbl))
+                       /* Failure is OK */
+                       bkt = rht_bucket_var(tbl, hash);
+               else
+                       bkt = rht_bucket_insert(ht, tbl, hash);
+               if (bkt == NULL) {
+                       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+                       data = ERR_PTR(-EAGAIN);
+               } else {
+                       rht_lock(tbl, bkt);
+                       data = rhashtable_lookup_one(ht, bkt, tbl,
+                                                    hash, key, obj);
+                       new_tbl = rhashtable_insert_one(ht, bkt, tbl,
+                                                       hash, obj, data);
+                       if (PTR_ERR(new_tbl) != -EEXIST)
+                               data = ERR_CAST(new_tbl);
+
+                       rht_unlock(tbl, bkt);
+               }
+       } while (!IS_ERR_OR_NULL(new_tbl));
+
+       if (PTR_ERR(data) == -EAGAIN)
+               data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+                              -EAGAIN);
+
+       return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+                            struct rhash_head *obj)
+{
+       void *data;
+
+       do {
+               rcu_read_lock();
+               data = rhashtable_try_insert(ht, key, obj);
+               rcu_read_unlock();
+       } while (PTR_ERR(data) == -EAGAIN);
 
-exit:
-       spin_unlock(rht_bucket_lock(tbl, hash));
+       return data;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 
-       if (err == 0)
+/**
+ * rhashtable_walk_enter - Initialise an iterator
+ * @ht:                Table to walk over
+ * @iter:      Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice.  Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+       iter->ht = ht;
+       iter->p = NULL;
+       iter->slot = 0;
+       iter->skip = 0;
+       iter->end_of_table = 0;
+
+       spin_lock(&ht->lock);
+       iter->walker.tbl =
+               rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+       list_add(&iter->walker.list, &iter->walker.tbl->walkers);
+       spin_unlock(&ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter:      Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_enter.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+       spin_lock(&iter->ht->lock);
+       if (iter->walker.tbl)
+               list_del(&iter->walker.list);
+       spin_unlock(&iter->ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start_check - Start a hash table walk
+ * @iter:      Hash table iterator
+ *
+ * Start a hash table walk at the current iterator position.  Note that we take
+ * the RCU lock in all cases including when we return an error.  So you must
+ * always call rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
+ */
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
+       __acquires(RCU)
+{
+       struct rhashtable *ht = iter->ht;
+       bool rhlist = ht->rhlist;
+
+       rcu_read_lock();
+
+       spin_lock(&ht->lock);
+       if (iter->walker.tbl)
+               list_del(&iter->walker.list);
+       spin_unlock(&ht->lock);
+
+       if (iter->end_of_table)
+               return 0;
+       if (!iter->walker.tbl) {
+               iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+               iter->slot = 0;
+               iter->skip = 0;
+               return -EAGAIN;
+       }
+
+       if (iter->p && !rhlist) {
+               /*
+                * We need to validate that 'p' is still in the table, and
+                * if so, update 'skip'
+                */
+               struct rhash_head *p;
+               int skip = 0;
+               rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+                       skip++;
+                       if (p == iter->p) {
+                               iter->skip = skip;
+                               goto found;
+                       }
+               }
+               iter->p = NULL;
+       } else if (iter->p && rhlist) {
+               /* Need to validate that 'list' is still in the table, and
+                * if so, update 'skip' and 'p'.
+                */
+               struct rhash_head *p;
+               struct rhlist_head *list;
+               int skip = 0;
+               rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+                       for (list = container_of(p, struct rhlist_head, rhead);
+                            list;
+                            list = rcu_dereference(list->next)) {
+                               skip++;
+                               if (list == iter->list) {
+                                       iter->p = p;
+                                       iter->skip = skip;
+                                       goto found;
+                               }
+                       }
+               }
+               iter->p = NULL;
+       }
+found:
+       return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
+
+/**
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
+ * @iter:      Hash table iterator
+ *
+ * Returns the found object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred.
+ */
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
+{
+       struct bucket_table *tbl = iter->walker.tbl;
+       struct rhlist_head *list = iter->list;
+       struct rhashtable *ht = iter->ht;
+       struct rhash_head *p = iter->p;
+       bool rhlist = ht->rhlist;
+
+       if (!tbl)
                return NULL;
-       else if (err == -EAGAIN)
-               return tbl;
+
+       for (; iter->slot < tbl->size; iter->slot++) {
+               int skip = iter->skip;
+
+               rht_for_each_rcu(p, tbl, iter->slot) {
+                       if (rhlist) {
+                               list = container_of(p, struct rhlist_head,
+                                                   rhead);
+                               do {
+                                       if (!skip)
+                                               goto next;
+                                       skip--;
+                                       list = rcu_dereference(list->next);
+                               } while (list);
+
+                               continue;
+                       }
+                       if (!skip)
+                               break;
+                       skip--;
+               }
+
+next:
+               if (!rht_is_a_nulls(p)) {
+                       iter->skip++;
+                       iter->p = p;
+                       iter->list = list;
+                       return rht_obj(ht, rhlist ? &list->rhead : p);
+               }
+
+               iter->skip = 0;
+       }
+
+       iter->p = NULL;
+
+       /* Ensure we see any new tables. */
+       smp_rmb();
+
+       iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+       if (iter->walker.tbl) {
+               iter->slot = 0;
+               iter->skip = 0;
+               return ERR_PTR(-EAGAIN);
+       } else {
+               iter->end_of_table = true;
+       }
+
+       return NULL;
+}
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter:      Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+       struct rhlist_head *list = iter->list;
+       struct rhashtable *ht = iter->ht;
+       struct rhash_head *p = iter->p;
+       bool rhlist = ht->rhlist;
+
+       if (p) {
+               if (!rhlist || !(list = rcu_dereference(list->next))) {
+                       p = rcu_dereference(p->next);
+                       list = container_of(p, struct rhlist_head, rhead);
+               }
+               if (!rht_is_a_nulls(p)) {
+                       iter->skip++;
+                       iter->p = p;
+                       iter->list = list;
+                       return rht_obj(ht, rhlist ? &list->rhead : p);
+               }
+
+               /* At the end of this slot, switch to the next one and then
+                * find the next entry from that point.
+                */
+               iter->skip = 0;
+               iter->slot++;
+       }
+
+       return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter:      Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+       struct rhlist_head *list = iter->list;
+       struct rhashtable *ht = iter->ht;
+       struct rhash_head *p = iter->p;
+
+       if (p)
+               return rht_obj(ht, ht->rhlist ? &list->rhead : p);
+
+       /* No object found in current iter, find next one in the table. */
+
+       if (iter->skip) {
+               /* A nonzero skip value points to the next entry in the table
+                * beyond the last one that was found. Decrement skip so
+                * we find the current value. __rhashtable_walk_find_next
+                * will restore the original value of skip assuming that
+                * the table hasn't changed.
+                */
+               iter->skip--;
+       }
+
+       return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter:      Hash table iterator
+ *
+ * Finish a hash table walk.  Does not reset the iterator to the start of the
+ * hash table.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+       __releases(RCU)
+{
+       struct rhashtable *ht;
+       struct bucket_table *tbl = iter->walker.tbl;
+
+       if (!tbl)
+               goto out;
+
+       ht = iter->ht;
+
+       spin_lock(&ht->lock);
+       if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+               /* This bucket table is being freed, don't re-link it. */
+               iter->walker.tbl = NULL;
        else
-               return ERR_PTR(err);
+               list_add(&iter->walker.list, &tbl->walkers);
+       spin_unlock(&ht->lock);
+
+out:
+       rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
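As an illustration, a minimal usage sketch of the walker API above, assuming an already-initialised table whose params describe a hypothetical struct my_obj (none of this is part of the patch). rhashtable_walk_next() returns ERR_PTR(-EAGAIN) after a resize; the iterator has then rewound, so the loop simply continues:

        struct my_obj {
                u32                     key;
                struct rhash_head       node;
        };

        static void my_walk(struct rhashtable *ht)
        {
                struct rhashtable_iter iter;
                struct my_obj *obj;

                rhashtable_walk_enter(ht, &iter);
                rhashtable_walk_start(&iter);

                while ((obj = rhashtable_walk_next(&iter)) != NULL) {
                        if (IS_ERR(obj)) {
                                if (PTR_ERR(obj) == -EAGAIN)
                                        continue;       /* resized; iterator rewound */
                                break;
                        }
                        /* use obj; duplicates/misses are possible across stop/start */
                }

                rhashtable_walk_stop(&iter);
                rhashtable_walk_exit(&iter);
        }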
 
 static size_t rounded_hashtable_size(const struct rhashtable_params *params)
 {
-       return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
-                  (unsigned long)params->min_size);
+       size_t retsize;
+
+       if (params->nelem_hint)
+               retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+                             (unsigned long)params->min_size);
+       else
+               retsize = max(HASH_DEFAULT_SIZE,
+                             (unsigned long)params->min_size);
+
+       return retsize;
 }
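Worked example for the sizing above (illustrative numbers only): with nelem_hint = 100 and min_size = 16, the table starts at roundup_pow_of_two(100 * 4 / 3) = roundup_pow_of_two(133) = 256 buckets; with no hint it simply starts at max(HASH_DEFAULT_SIZE, min_size).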
 
 static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
@@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
        return jhash2(key, length, seed);
 }
 
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht:                hash table to be initialized
+ * @params:    configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ *     int                     key;
+ *     void *                  my_member;
+ *     struct rhash_head       node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ *     .head_offset = offsetof(struct test_obj, node),
+ *     .key_offset = offsetof(struct test_obj, key),
+ *     .key_len = sizeof(int),
+ *     .hashfn = jhash,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ *     [...]
+ *     struct rhash_head       node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ *     struct test_obj *obj = data;
+ *
+ *     return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ *     .head_offset = offsetof(struct test_obj, node),
+ *     .hashfn = jhash,
+ *     .obj_hashfn = my_hash_fn,
+ * };
+ */
 int rhashtable_init(struct rhashtable *ht,
                    const struct rhashtable_params *params)
 {
        struct bucket_table *tbl;
        size_t size;
 
-       size = HASH_DEFAULT_SIZE;
-
        if ((!params->key_len && !params->obj_hashfn) ||
            (params->obj_hashfn && !params->obj_cmpfn))
                return -EINVAL;
 
-       if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
-               return -EINVAL;
-
        memset(ht, 0, sizeof(*ht));
        mutex_init(&ht->mutex);
        spin_lock_init(&ht->lock);
@@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht,
        if (params->min_size)
                ht->p.min_size = roundup_pow_of_two(params->min_size);
 
-       if (params->max_size)
-               ht->p.max_size = rounddown_pow_of_two(params->max_size);
+       /* Cap total entries at 2^31 to avoid nelems overflow. */
+       ht->max_elems = 1u << 31;
 
-       if (params->insecure_max_entries)
-               ht->p.insecure_max_entries =
-                       rounddown_pow_of_two(params->insecure_max_entries);
-       else
-               ht->p.insecure_max_entries = ht->p.max_size * 2;
-
-       ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+       if (params->max_size) {
+               ht->p.max_size = rounddown_pow_of_two(params->max_size);
+               if (ht->p.max_size < ht->max_elems / 2)
+                       ht->max_elems = ht->p.max_size * 2;
+       }
 
-       if (params->nelem_hint)
-               size = rounded_hashtable_size(&ht->p);
-
-       /* The maximum (not average) chain length grows with the
-        * size of the hash table, at a rate of (log N)/(log log N).
-        * The value of 16 is selected so that even if the hash
-        * table grew to 2^32 you would not expect the maximum
-        * chain length to exceed it unless we are under attack
-        * (or extremely unlucky).
-        *
-        * As this limit is only to detect attacks, we don't need
-        * to set it to a lower value as you'd need the chain
-        * length to vastly exceed 16 to have any real effect
-        * on the system.
-        */
-       if (!params->insecure_elasticity)
-               ht->elasticity = 16;
+       ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
 
-       if (params->locks_mul)
-               ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
-       else
-               ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+       size = rounded_hashtable_size(&ht->p);
 
        ht->key_len = ht->p.key_len;
        if (!params->hashfn) {
@@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht,
                }
        }
 
+       /*
+        * This is API initialization and thus we need to guarantee the
+        * initial rhashtable allocation. Upon failure, retry with the
+        * smallest possible size with __GFP_NOFAIL semantics.
+        */
        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-       if (tbl == NULL)
-               return -ENOMEM;
+       if (unlikely(tbl == NULL)) {
+               size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+               tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+       }
 
        atomic_set(&ht->nelems, 0);
 
@@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(rhashtable_init);
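Building on the kernel-doc configuration example above and reusing struct my_obj from the walk sketch earlier, a hedged end-to-end sketch (my_obj_params and my_table_demo are illustrative names; needs <linux/rhashtable.h> and <linux/slab.h>):

        static const struct rhashtable_params my_obj_params = {
                .head_offset    = offsetof(struct my_obj, node),
                .key_offset     = offsetof(struct my_obj, key),
                .key_len        = sizeof(u32),
        };

        static int my_table_demo(void)
        {
                struct rhashtable ht;
                struct my_obj *obj;
                u32 key = 42;
                int ret;

                ret = rhashtable_init(&ht, &my_obj_params);
                if (ret)
                        return ret;

                obj = kzalloc(sizeof(*obj), GFP_KERNEL);
                if (!obj) {
                        ret = -ENOMEM;
                        goto out;
                }
                obj->key = key;

                ret = rhashtable_insert_fast(&ht, &obj->node, my_obj_params);
                if (ret) {
                        kfree(obj);
                        goto out;
                }

                /* lookup returns the containing object, or NULL if absent */
                WARN_ON(rhashtable_lookup_fast(&ht, &key, my_obj_params) != obj);

                ret = rhashtable_remove_fast(&ht, &obj->node, my_obj_params);
                kfree(obj);
        out:
                rhashtable_destroy(&ht);
                return ret;
        }

With no explicit .hashfn and a key length that is a multiple of sizeof(u32), the table falls back to the jhash/jhash2 default set up in rhashtable_init() above.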
 
-void rhashtable_destroy(struct rhashtable *ht)
+/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt:       hash list table to be initialized
+ * @params:    configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
 {
-       struct bucket_table *tbl;
+       int err;
+
+       err = rhashtable_init(&hlt->ht, params);
+       hlt->ht.rhlist = true;
+       return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
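For the list variant just initialised here, a brief hedged sketch: an rhltable allows multiple entries with the same key, and each entry embeds struct rhlist_head instead of struct rhash_head (my_dup_obj, my_dup_params and my_dup_demo are illustrative assumptions):

        struct my_dup_obj {
                u32                     key;
                struct rhlist_head      list;   /* rhlist_head, not rhash_head */
        };

        static const struct rhashtable_params my_dup_params = {
                .head_offset    = offsetof(struct my_dup_obj, list),
                .key_offset     = offsetof(struct my_dup_obj, key),
                .key_len        = sizeof(u32),
        };

        static int my_dup_demo(struct my_dup_obj *obj)
        {
                struct rhltable hlt;
                int err;

                err = rhltable_init(&hlt, &my_dup_params);
                if (err)
                        return err;

                /* unlike rhashtable_insert_fast(), duplicate keys are accepted */
                err = rhltable_insert(&hlt, &obj->list, my_dup_params);
                if (!err)
                        rhltable_remove(&hlt, &obj->list, my_dup_params);

                rhltable_destroy(&hlt);
                return err;
        }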
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+                               void (*free_fn)(void *ptr, void *arg),
+                               void *arg)
+{
+       struct rhlist_head *list;
+
+       if (!ht->rhlist) {
+               free_fn(rht_obj(ht, obj), arg);
+               return;
+       }
+
+       list = container_of(obj, struct rhlist_head, rhead);
+       do {
+               obj = &list->rhead;
+               list = rht_dereference(list->next, ht);
+               free_fn(rht_obj(ht, obj), arg);
+       } while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht:                the hash table to destroy
+ * @free_fn:   callback to release resources of element
+ * @arg:       pointer passed to free_fn
+ *
+ * Stops any in-flight async resize. If defined, invokes free_fn for each
+ * element to release its resources. Please note that RCU protected
+ * readers may still be accessing the elements; releasing of resources
+ * must occur in a compatible manner. The bucket arrays are then freed.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller is responsible for ensuring that no further
+ * write operations occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+                                void (*free_fn)(void *ptr, void *arg),
+                                void *arg)
+{
+       struct bucket_table *tbl, *next_tbl;
+       unsigned int i;
 
        cancel_work_sync(&ht->run_work);
 
        mutex_lock(&ht->mutex);
        tbl = rht_dereference(ht->tbl, ht);
+restart:
+       if (free_fn) {
+               for (i = 0; i < tbl->size; i++) {
+                       struct rhash_head *pos, *next;
+
+                       cond_resched();
+                       for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+                            next = !rht_is_a_nulls(pos) ?
+                                       rht_dereference(pos->next, ht) : NULL;
+                            !rht_is_a_nulls(pos);
+                            pos = next,
+                            next = !rht_is_a_nulls(pos) ?
+                                       rht_dereference(pos->next, ht) : NULL)
+                               rhashtable_free_one(ht, pos, free_fn, arg);
+               }
+       }
+
+       next_tbl = rht_dereference(tbl->future_tbl, ht);
        bucket_table_free(tbl);
+       if (next_tbl) {
+               tbl = next_tbl;
+               goto restart;
+       }
        mutex_unlock(&ht->mutex);
 }
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
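A short sketch of typical teardown with a free callback, reusing the hypothetical my_obj table from the earlier sketches; the callback receives each remaining element exactly as rht_obj() resolves it:

        static void my_free_obj(void *ptr, void *arg)
        {
                kfree(ptr);     /* ptr is the containing struct my_obj */
        }

        static void my_teardown(struct rhashtable *ht)
        {
                /* all writers must have stopped before this point */
                rhashtable_free_and_destroy(ht, my_free_obj, NULL);
        }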
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+       return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       unsigned int index = hash & ((1 << tbl->nest) - 1);
+       unsigned int size = tbl->size >> tbl->nest;
+       unsigned int subhash = hash;
+       union nested_table *ntbl;
+
+       ntbl = nested_table_top(tbl);
+       ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+       subhash >>= tbl->nest;
+
+       while (ntbl && size > (1 << shift)) {
+               index = subhash & ((1 << shift) - 1);
+               ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+                                                 tbl, hash);
+               size >>= shift;
+               subhash >>= shift;
+       }
+
+       if (!ntbl)
+               return NULL;
+
+       return &ntbl[subhash].bucket;
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash)
+{
+       static struct rhash_lock_head __rcu *rhnull;
+
+       if (!rhnull)
+               INIT_RHT_NULLS_HEAD(rhnull);
+       return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+       struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       unsigned int index = hash & ((1 << tbl->nest) - 1);
+       unsigned int size = tbl->size >> tbl->nest;
+       union nested_table *ntbl;
+
+       ntbl = nested_table_top(tbl);
+       hash >>= tbl->nest;
+       ntbl = nested_table_alloc(ht, &ntbl[index].table,
+                                 size <= (1 << shift));
+
+       while (ntbl && size > (1 << shift)) {
+               index = hash & ((1 << shift) - 1);
+               size >>= shift;
+               hash >>= shift;
+               ntbl = nested_table_alloc(ht, &ntbl[index].table,
+                                         size <= (1 << shift));
+       }
+
+       if (!ntbl)
+               return NULL;
+
+       return &ntbl[hash].bucket;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
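Worked example for the nested-bucket math above (assuming 4 KiB pages and 8-byte pointers): shift = PAGE_SHIFT - ilog2(sizeof(void *)) = 12 - 3 = 9, so each nested level is one page holding 512 bucket pointers; the low tbl->nest bits of the hash pick the top-level slot, and the remaining hash bits are consumed 9 at a time while descending, exactly as in the loops of __rht_bucket_nested() and rht_bucket_nested_insert().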
index 532800443398ccef305ada1287bc781116b2aa47..fe721891a23852c49eeb8fc56cdb37ea6e5e5674 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
 #include <linux/six.h>
+#include <linux/slab.h>
 
 #ifdef DEBUG
 #define EBUG_ON(cond)          BUG_ON(cond)
@@ -309,6 +310,9 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
                                wake_up_process(p);
                }
 
+               if (ret)
+                       six_acquire(&lock->dep_map, 1);
+
                return ret;
        }
 
@@ -560,6 +564,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
            lock->readers) {
                smp_mb(); /* unlock barrier */
                this_cpu_dec(*lock->readers);
+               smp_mb(); /* between unlocking and checking for waiters */
                state.v = READ_ONCE(lock->state.v);
        } else {
                EBUG_ON(!(lock->state.v & l[type].held_mask));
@@ -705,6 +710,34 @@ void six_lock_wakeup_all(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
 
+struct free_pcpu_rcu {
+       struct rcu_head         rcu;
+       void __percpu           *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+       struct free_pcpu_rcu *rcu =
+               container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+       free_percpu(rcu->p);
+       kfree(rcu);
+}
+
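+/*
+ * Switch @lock back to non-percpu reader counting: detach the percpu
+ * reader counts now, but defer the free_percpu() to an RCU callback,
+ * since concurrent readers may still be dereferencing lock->readers.
+ * If the small wrapper allocation fails, the lock is left unchanged.
+ */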
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+       struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+       if (!rcu)
+               return;
+
+       rcu->p = lock->readers;
+       lock->readers = NULL;
+
+       call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
+
 void six_lock_pcpu_free(struct six_lock *lock)
 {
        BUG_ON(lock->readers && pcpu_read_count(lock));
@@ -717,8 +750,6 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
 
 void six_lock_pcpu_alloc(struct six_lock *lock)
 {
-       BUG_ON(lock->readers && pcpu_read_count(lock));
-       BUG_ON(lock->state.read_lock);
 #ifdef __KERNEL__
        if (!lock->readers)
                lock->readers = alloc_percpu(unsigned);