git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 99750eab4d bcachefs: Persist stripe blocks_used
author Kent Overstreet <kent.overstreet@gmail.com>
Wed, 23 Jan 2019 20:49:44 +0000 (15:49 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Wed, 23 Jan 2019 20:53:32 +0000 (15:53 -0500)
37 files changed:
.bcachefs_revision
include/asm/page.h [new file with mode: 0644]
include/linux/atomic.h
include/linux/generic-radix-tree.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bkey.c
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_update.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/chardev.c
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/ec_types.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/eytzinger.h
libbcachefs/fs-io.c
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/util.c
libbcachefs/util.h
linux/generic-radix-tree.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 088f645c750a9757612fc963fb3505e651a283b3..8eca05930b099268f419ee266d226d8ce7c08777 100644
@@ -1 +1 @@
-bcca1c557b1897ecc3aeb1f89ab91865487d91ab
+99750eab4d583132cf61f071082c7cf21f5295c0
diff --git a/include/asm/page.h b/include/asm/page.h
new file mode 100644
index 0000000..e69de29
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 7471bd976dc244c75144d54329db3f4107a5cb3a..38a364c07c1b619668f7823fa9d546944022369e 100644
@@ -37,6 +37,7 @@ typedef struct {
 #define xchg_acquire(p, v)             uatomic_xchg(p, v)
 #define cmpxchg(p, old, new)           uatomic_cmpxchg(p, old, new)
 #define cmpxchg_acquire(p, old, new)   uatomic_cmpxchg(p, old, new)
+#define cmpxchg_release(p, old, new)   uatomic_cmpxchg(p, old, new)
 
 #define smp_mb__before_atomic()                cmm_smp_mb__before_uatomic_add()
 #define smp_mb__after_atomic()         cmm_smp_mb__after_uatomic_add()
@@ -77,6 +78,16 @@ typedef struct {
        __old;                                                  \
 })
 
+#define cmpxchg_release(p, old, new)                           \
+({                                                             \
+       typeof(*(p)) __old = (old);                             \
+                                                               \
+       __atomic_compare_exchange_n((p), &__old, new, false,    \
+                                   __ATOMIC_RELEASE,           \
+                                   __ATOMIC_RELEASE);          \
+       __old;                                                  \
+})
+
 #define smp_mb__before_atomic()        __atomic_thread_fence(__ATOMIC_SEQ_CST)
 #define smp_mb__after_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
 #define smp_wmb()              __atomic_thread_fence(__ATOMIC_SEQ_CST)
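
The two cmpxchg_release definitions added above (the liburcu variant and the GCC-builtin variant) both map a release-ordered compare-and-swap onto an underlying primitive that returns the prior value. A standalone sketch of the same pattern using the GCC builtin directly (illustrative only, not part of the commit; note that __atomic_compare_exchange_n requires a failure memory order no stronger than acquire, so the sketch passes __ATOMIC_RELAXED there):

#include <stdio.h>

/* Sketch: a cmpxchg_release()-style helper returning the prior value. */
static long release_cmpxchg(long *p, long old, long new)
{
	long expected = old;

	__atomic_compare_exchange_n(p, &expected, new, false,
				    __ATOMIC_RELEASE,	/* success order */
				    __ATOMIC_RELAXED);	/* failure order */
	return expected;	/* like cmpxchg(): the value seen at *p */
}

int main(void)
{
	long v = 1;

	if (release_cmpxchg(&v, 1, 2) == 1)
		printf("swapped, v = %ld\n", v);	/* prints "swapped, v = 2" */
	return 0;
}
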
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 7f637e17bfed4f4baea3522d026232c956e2cb12..3a91130a4fbd54040301f7da2baf4a242b14ab67 100644
@@ -1,34 +1,60 @@
 #ifndef _LINUX_GENERIC_RADIX_TREE_H
 #define _LINUX_GENERIC_RADIX_TREE_H
 
-/*
- * Generic radix trees/sparse arrays:
+/**
+ * DOC: Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ *   reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ *   NULL if that entry does not exist
  *
- * A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
- * interior nodes.
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ *   allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
  */
 
+#include <asm/page.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
 
-struct genradix_node;
+struct genradix_root;
 
 struct __genradix {
-       struct genradix_node            *root;
-       size_t                          depth;
+       struct genradix_root __rcu      *root;
 };
 
 /*
- * NOTE: currently, sizeof(_type) must be a power of two and not larger than
- * PAGE_SIZE:
+ * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
  */
 
 #define __GENRADIX_INITIALIZER                                 \
        {                                                       \
                .tree = {                                       \
                        .root = NULL,                           \
-                       .depth = 0,                             \
                }                                               \
        }
 
@@ -49,6 +75,12 @@ struct {                                                     \
 #define DEFINE_GENRADIX(_name, _type)                          \
        GENRADIX(_type) _name = __GENRADIX_INITIALIZER
 
+/**
+ * genradix_init - initialize a genradix
+ * @_radix:    genradix to initialize
+ *
+ * Does not fail
+ */
 #define genradix_init(_radix)                                  \
 do {                                                           \
        *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER;   \
@@ -56,11 +88,20 @@ do {                                                                \
 
 void __genradix_free(struct __genradix *);
 
+/**
+ * genradix_free - free all memory owned by a genradix
+ * @_radix: the genradix to free
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
 #define genradix_free(_radix)  __genradix_free(&(_radix)->tree)
 
 static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 {
-       BUILD_BUG_ON(obj_size > PAGE_SIZE);
+       if (__builtin_constant_p(obj_size))
+               BUILD_BUG_ON(obj_size > PAGE_SIZE);
+       else
+               BUG_ON(obj_size > PAGE_SIZE);
 
        if (!is_power_of_2(obj_size)) {
                size_t objs_per_page = PAGE_SIZE / obj_size;
@@ -79,7 +120,13 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 
 void *__genradix_ptr(struct __genradix *, size_t);
 
-/* Returns a pointer to element at @_idx */
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix:    genradix to access
+ * @_idx:      index to fetch
+ *
+ * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+ */
 #define genradix_ptr(_radix, _idx)                             \
        (__genradix_cast(_radix)                                \
         __genradix_ptr(&(_radix)->tree,                        \
@@ -87,7 +134,15 @@ void *__genradix_ptr(struct __genradix *, size_t);
 
 void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
 
-/* Returns a pointer to element at @_idx, allocating it if necessary */
+/**
+ * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+ *                     if necessary
+ * @_radix:    genradix to access
+ * @_idx:      index to fetch
+ * @_gfp:      gfp mask
+ *
+ * Returns a pointer to entry at @_idx, or NULL on allocation failure
+ */
 #define genradix_ptr_alloc(_radix, _idx, _gfp)                 \
        (__genradix_cast(_radix)                                \
         __genradix_ptr_alloc(&(_radix)->tree,                  \
@@ -99,6 +154,11 @@ struct genradix_iter {
        size_t                  pos;
 };
 
+/**
+ * genradix_iter_init - initialize a genradix_iter
+ * @_radix:    genradix that will be iterated over
+ * @_idx:      index to start iterating from
+ */
 #define genradix_iter_init(_radix, _idx)                       \
        ((struct genradix_iter) {                               \
                .pos    = (_idx),                               \
@@ -107,6 +167,14 @@ struct genradix_iter {
 
 void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 
+/**
+ * genradix_iter_peek - get first entry at or above iterator's current
+ *                     position
+ * @_iter:     a genradix_iter
+ * @_radix:    genradix being iterated over
+ *
+ * If no more entries exist at or above @_iter's current position, returns NULL
+ */
 #define genradix_iter_peek(_iter, _radix)                      \
        (__genradix_cast(_radix)                                \
         __genradix_iter_peek(_iter, &(_radix)->tree,           \
@@ -127,4 +195,37 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_iter_advance(_iter, _radix)                   \
        __genradix_iter_advance(_iter, __genradix_obj_size(_radix))
 
+#define genradix_for_each_from(_radix, _iter, _p, _start)      \
+       for (_iter = genradix_iter_init(_radix, _start);        \
+            (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
+            genradix_iter_advance(&_iter, _radix))
+
+/**
+ * genradix_for_each - iterate over each entry in a genradix
+ * @_radix:    genradix to iterate over
+ * @_iter:     a genradix_iter to track current position
+ * @_p:                pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each(_radix, _iter, _p)                   \
+       genradix_for_each_from(_radix, _iter, _p, 0)
+
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_prealloc - preallocate entries in a generic radix tree
+ * @_radix:    genradix to preallocate
+ * @_nr:       number of entries to preallocate
+ * @_gfp:      gfp mask
+ *
+ * Returns 0 on success, -ENOMEM on failure
+ */
+#define genradix_prealloc(_radix, _nr, _gfp)                   \
+        __genradix_prealloc(&(_radix)->tree,                   \
+                       __genradix_idx_to_offset(_radix, _nr + 1),\
+                       _gfp)
+
+
 #endif /* _LINUX_GENERIC_RADIX_TREE_H */
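
The DOC comment above describes the genradix API this header now exports; as a quick illustration, a minimal usage sketch composing the documented operations (struct foo and the surrounding function are hypothetical, not from this commit):

/* Hypothetical example of the genradix operations documented above. */
struct foo {
	u64	id;
};

static GENRADIX(struct foo) foo_genradix;

static int foo_example(void)
{
	struct genradix_iter iter;
	struct foo *f;

	genradix_init(&foo_genradix);

	/* Allocate the page holding index 42 if necessary, then fill it: */
	f = genradix_ptr_alloc(&foo_genradix, 42, GFP_KERNEL);
	if (!f)
		return -ENOMEM;
	f->id = 42;

	/* Lookup without allocating; returns NULL if the entry doesn't exist: */
	f = genradix_ptr(&foo_genradix, 42);
	BUG_ON(!f || f->id != 42);

	/*
	 * Iterate; entries in allocated pages that were never written read
	 * back as zeroes, so iteration may also see zeroed entries:
	 */
	genradix_for_each(&foo_genradix, iter, f)
		pr_info("idx %zu: id %llu\n", iter.pos,
			(unsigned long long) f->id);

	genradix_free(&foo_genradix);	/* frees and reinitializes */
	return 0;
}
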
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 6de6e26384b21e3135e658540c7b9651da155144..2552d45799ca08ce2c96ecc4ab00288c6a169140 100644
@@ -249,6 +249,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
                                bch2_alloc_read_key(c, bkey_i_to_s_c(k));
        }
 
+       for_each_member_device(ca, c, i)
+               bch2_dev_usage_from_buckets(c, ca);
+
        mutex_lock(&c->bucket_clock[READ].lock);
        for_each_member_device(ca, c, i) {
                down_read(&ca->bucket_lock);
@@ -280,35 +283,51 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 #endif
        struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
        struct bucket *g;
-       struct bucket_mark m;
+       struct bucket_mark m, new;
        int ret;
 
        BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
 
        a->k.p = POS(ca->dev_idx, b);
 
+       bch2_btree_iter_set_pos(iter, a->k.p);
+
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               return ret;
+
        percpu_down_read_preempt_disable(&c->mark_lock);
        g = bucket(ca, b);
-       m = bucket_cmpxchg(g, m, m.dirty = false);
+       m = READ_ONCE(g->mark);
+
+       if (!m.dirty) {
+               percpu_up_read_preempt_enable(&c->mark_lock);
+               return 0;
+       }
 
        __alloc_write_key(a, g, m);
        percpu_up_read_preempt_enable(&c->mark_lock);
 
        bch2_btree_iter_cond_resched(iter);
 
-       bch2_btree_iter_set_pos(iter, a->k.p);
-
        ret = bch2_btree_insert_at(c, NULL, journal_seq,
+                                  BTREE_INSERT_NOCHECK_RW|
                                   BTREE_INSERT_NOFAIL|
                                   BTREE_INSERT_USE_RESERVE|
                                   BTREE_INSERT_USE_ALLOC_RESERVE|
                                   flags,
                                   BTREE_INSERT_ENTRY(iter, &a->k_i));
+       if (ret)
+               return ret;
+
+       new = m;
+       new.dirty = false;
+       atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
 
-       if (!ret && ca->buckets_written)
+       if (ca->buckets_written)
                set_bit(b, ca->buckets_written);
 
-       return ret;
+       return 0;
 }
 
 int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
@@ -898,10 +917,19 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
                for (i = 0; i < RESERVE_NR; i++)
                        if (fifo_push(&ca->free[i], bucket)) {
                                fifo_pop(&ca->free_inc, bucket);
+
                                closure_wake_up(&c->freelist_wait);
+                               ca->allocator_blocked_full = false;
+
                                spin_unlock(&c->freelist_lock);
                                goto out;
                        }
+
+               if (!ca->allocator_blocked_full) {
+                       ca->allocator_blocked_full = true;
+                       closure_wake_up(&c->freelist_wait);
+               }
+
                spin_unlock(&c->freelist_lock);
 
                if ((current->flags & PF_KTHREAD) &&
@@ -1226,6 +1254,11 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
                        set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
+{
+       closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
+}
+
 /* stop allocator thread: */
 void bch2_dev_allocator_stop(struct bch_dev *ca)
 {
@@ -1333,6 +1366,24 @@ static void allocator_start_issue_discards(struct bch_fs *c)
                                             ca->mi.bucket_size, GFP_NOIO, 0);
 }
 
+static int resize_free_inc(struct bch_dev *ca)
+{
+       alloc_fifo free_inc;
+
+       if (!fifo_full(&ca->free_inc))
+               return 0;
+
+       if (!init_fifo(&free_inc,
+                      ca->free_inc.size * 2,
+                      GFP_KERNEL))
+               return -ENOMEM;
+
+       fifo_move(&free_inc, &ca->free_inc);
+       swap(free_inc, ca->free_inc);
+       free_fifo(&free_inc);
+       return 0;
+}
+
 static int __bch2_fs_allocator_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
@@ -1408,6 +1459,12 @@ not_enough:
 
                        while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
                               (bu = next_alloc_bucket(ca)) >= 0) {
+                               ret = resize_free_inc(ca);
+                               if (ret) {
+                                       percpu_ref_put(&ca->io_ref);
+                                       return ret;
+                               }
+
                                bch2_invalidate_one_bucket(c, ca, bu,
                                                           &journal_seq);
 
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index a0c08e347ad7d7469b4a577c59b82f281d0366cc..26561b3bafb8336cfde59bcea506d77795064923 100644
@@ -51,6 +51,7 @@ void bch2_recalc_capacity(struct bch_fs *);
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 14e6453bc3dab0455d0f60ccc184795789c43960..f2f9015dbb00ca144fe0eb2cf31d2f4e53f2b909 100644
@@ -106,6 +106,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
        bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
                               false, gc_pos_alloc(c, ob), 0);
        ob->valid = false;
+       ob->type = 0;
 
        spin_unlock(&ob->lock);
        percpu_up_read_preempt_enable(&c->mark_lock);
@@ -141,6 +142,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
        ob = c->open_buckets + c->open_buckets_freelist;
        c->open_buckets_freelist = ob->freelist;
        atomic_set(&ob->pin, 1);
+       ob->type = 0;
 
        c->open_buckets_nr_free--;
        return ob;
@@ -209,9 +211,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
        case RESERVE_ALLOC:
                return 0;
        case RESERVE_BTREE:
-               return BTREE_NODE_RESERVE / 2;
+               return BTREE_NODE_OPEN_BUCKET_RESERVE;
        default:
-               return BTREE_NODE_RESERVE;
+               return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
        }
 }
 
@@ -837,15 +839,17 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 {
        struct write_point *wp;
        struct open_bucket *ob;
-       unsigned nr_effective = 0;
-       struct open_buckets ptrs = { .nr = 0 };
-       bool have_cache = false;
-       unsigned write_points_nr;
-       int ret = 0, i;
+       struct open_buckets ptrs;
+       unsigned nr_effective, write_points_nr;
+       bool have_cache;
+       int ret, i;
 
        BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
+       ptrs.nr         = 0;
+       nr_effective    = 0;
        write_points_nr = c->write_points_nr;
+       have_cache      = false;
 
        wp = writepoint_find(c, write_point.v);
 
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index b0e44f758d7f368cc8e9b751d55786f7a28aa0ec..5224a52f8bebbee3731d75f9567b7f55e9331b63 100644
@@ -85,6 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
        unsigned i;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
+               ob->type = wp->type;
                atomic_inc(&ob->pin);
                ob_push(c, ptrs, ob);
        }
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 6f17f094c21e47f6547d61fed45097c0f766649e..66457fc722fd5ec2cbb1ec9312e515661ae5f7b8 100644
@@ -55,9 +55,10 @@ struct open_bucket {
        spinlock_t              lock;
        atomic_t                pin;
        u8                      freelist;
-       bool                    valid;
-       bool                    on_partial_list;
        u8                      ec_idx;
+       u8                      type;
+       unsigned                valid:1;
+       unsigned                on_partial_list:1;
        unsigned                sectors_free;
        struct bch_extent_ptr   ptr;
        struct ec_stripe_new    *ec;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 449eb0c1ce6116e682c38a0a64c8da622fd2e6d7..f42b2f9052c636286d435f7a96957e8405ecfec8 100644
@@ -330,6 +330,8 @@ enum bch_time_stats {
 /* Size of the freelist we allocate btree nodes from: */
 #define BTREE_NODE_RESERVE     BTREE_RESERVE_MAX
 
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
 struct btree;
 
 enum gc_phase {
@@ -426,7 +428,13 @@ struct bch_dev {
 
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
+
+       /*
+        * XXX: this should be an enum for allocator state, so as to include
+        * error state
+        */
        bool                    allocator_blocked;
+       bool                    allocator_blocked_full;
 
        alloc_heap              alloc_heap;
 
@@ -597,6 +605,7 @@ struct bch_fs {
        struct workqueue_struct *wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
+       struct workqueue_struct *journal_reclaim_wq;
 
        /* ALLOCATION */
        struct delayed_work     pd_controllers_update;
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 25725e423055c7228dae8bb443a195892faee237..40ce33a40d7927e85b7d4ce7476487ed66ac106f 100644
@@ -1010,11 +1010,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
                        nr_key_bits -= 64;
                }
 
-               if (l_v != r_v)
-                       return l_v < r_v ? -1 : 1;
-
-               if (!nr_key_bits)
-                       return 0;
+               if (!nr_key_bits || l_v != r_v)
+                       break;
 
                l = next_word(l);
                r = next_word(r);
@@ -1022,6 +1019,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
                l_v = *l;
                r_v = *r;
        }
+
+       return (l_v > r_v) - (l_v < r_v);
 }
 #endif
 
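In the rewritten __bkey_cmp_bits above, every exit now funnels through `(l_v > r_v) - (l_v < r_v)`, the standard branch-free three-way comparison. A tiny standalone illustration (not from the commit):

#include <assert.h>

/* (a > b) - (a < b) evaluates to -1, 0 or 1 with no conditional branch: */
static int cmp3(unsigned long long a, unsigned long long b)
{
	return (a > b) - (a < b);
}

int main(void)
{
	assert(cmp3(1, 2) == -1);
	assert(cmp3(2, 2) ==  0);
	assert(cmp3(3, 2) ==  1);
	return 0;
}
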
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 23013fbb6fb2b17270e332357096fe9f92215566..433e8f22fd1904607dd168e515c2c866e3d4bf63 100644
@@ -483,31 +483,6 @@ static void bch2_gc_free(struct bch_fs *c)
        percpu_up_write(&c->mark_lock);
 }
 
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when access
- * against any percpu counter is guarded against
- */
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-       u64 *ret;
-       int cpu;
-
-       preempt_disable();
-       ret = this_cpu_ptr(p);
-       preempt_enable();
-
-       for_each_possible_cpu(cpu) {
-               u64 *i = per_cpu_ptr(p, cpu);
-
-               if (i != ret) {
-                       acc_u64s(ret, i, nr);
-                       memset(i, 0, nr * sizeof(u64));
-               }
-       }
-
-       return ret;
-}
-
 static void bch2_gc_done_nocheck(struct bch_fs *c)
 {
        struct bch_dev *ca;
@@ -543,9 +518,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
        for_each_member_device(ca, c, i) {
                unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
                struct bch_dev_usage *dst = (void *)
-                       acc_percpu_u64s((void *) ca->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
                struct bch_dev_usage *src = (void *)
-                       acc_percpu_u64s((void *) ca->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 
                *dst = *src;
        }
@@ -554,9 +529,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
                unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
                        c->replicas.nr;
                struct bch_fs_usage *dst = (void *)
-                       acc_percpu_u64s((void *) c->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[0], nr);
                struct bch_fs_usage *src = (void *)
-                       acc_percpu_u64s((void *) c->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
                memcpy(&dst->s.gc_start[0],
                       &src->s.gc_start[0],
@@ -582,6 +557,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
                        dst_iter.pos, ##__VA_ARGS__,                    \
                        dst->_f, src->_f);                              \
                dst->_f = src->_f;                                      \
+               dst->dirty = true;                                      \
        }
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
@@ -612,16 +588,18 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 
                while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
                       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+                       BUG_ON(src_iter.pos != dst_iter.pos);
+
                        copy_stripe_field(alive,        "alive");
                        copy_stripe_field(sectors,      "sectors");
                        copy_stripe_field(algorithm,    "algorithm");
                        copy_stripe_field(nr_blocks,    "nr_blocks");
                        copy_stripe_field(nr_redundant, "nr_redundant");
-                       copy_stripe_field(blocks_nonempty.counter,
+                       copy_stripe_field(blocks_nonempty,
                                          "blocks_nonempty");
 
                        for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
-                               copy_stripe_field(block_sectors[i].counter,
+                               copy_stripe_field(block_sectors[i],
                                                  "block_sectors[%u]", i);
 
                        if (dst->alive)
@@ -656,9 +634,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
        for_each_member_device(ca, c, i) {
                unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
                struct bch_dev_usage *dst = (void *)
-                       acc_percpu_u64s((void *) ca->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
                struct bch_dev_usage *src = (void *)
-                       acc_percpu_u64s((void *) ca->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
                unsigned b;
 
                for (b = 0; b < BCH_DATA_NR; b++)
@@ -678,9 +656,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
                unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
                        c->replicas.nr;
                struct bch_fs_usage *dst = (void *)
-                       acc_percpu_u64s((void *) c->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[0], nr);
                struct bch_fs_usage *src = (void *)
-                       acc_percpu_u64s((void *) c->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
                copy_fs_field(s.hidden,         "hidden");
                copy_fs_field(s.data,           "data");
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 8af5f841a537f8f33271671a15b51ba4516a2349..1905acfa028a535f82dbdc43e0e408323bacf96d 100644
@@ -109,7 +109,7 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
 
        do {
                seq = read_seqcount_begin(&c->gc_pos_lock);
-               ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
+               ret = gc_pos_cmp(pos, c->gc_pos) < 0;
        } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
        return ret;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index dd9d255952d0821a04063cfb5a9231047f79ea90..4bd0725846d002229748fecd387577a0d02b209e 100644
@@ -77,6 +77,7 @@ enum {
        __BTREE_INSERT_ATOMIC,
        __BTREE_INSERT_NOUNLOCK,
        __BTREE_INSERT_NOFAIL,
+       __BTREE_INSERT_NOCHECK_RW,
        __BTREE_INSERT_USE_RESERVE,
        __BTREE_INSERT_USE_ALLOC_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
@@ -100,6 +101,8 @@ enum {
 /* Don't check for -ENOSPC: */
 #define BTREE_INSERT_NOFAIL            (1 << __BTREE_INSERT_NOFAIL)
 
+#define BTREE_INSERT_NOCHECK_RW                (1 << __BTREE_INSERT_NOCHECK_RW)
+
 /* for copygc, or when merging btree nodes */
 #define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
 #define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 7eca9203be01a1e54c8e62c2951d56a410aebc3c..0df894fcf1ae67682760a7948c9175f0cefefad7 100644
@@ -628,7 +628,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
        trans_for_each_entry(trans, i)
                btree_insert_entry_checks(c, i);
 
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
+       if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+                    !percpu_ref_tryget(&c->writes)))
                return -EROFS;
 retry:
        trans_for_each_iter(trans, i) {
@@ -658,7 +659,8 @@ retry:
        trans_for_each_iter(trans, i)
                bch2_btree_iter_downgrade(i->iter);
 out:
-       percpu_ref_put(&c->writes);
+       if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+               percpu_ref_put(&c->writes);
 
        /* make sure we didn't drop or screw up locks: */
        trans_for_each_iter(trans, i) {
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index d33d0bf048f07b6cb8acb5be7361e1036b0b152c..ea71acb5f8cfc6cb4b0d74b34ec9557542a51605 100644
@@ -151,7 +151,6 @@ retry:
        acc_u64s_percpu((u64 *) ret,
                        (u64 __percpu *) c->usage[0],
                        sizeof(*ret) / sizeof(u64) + nr);
-       percpu_up_read_preempt_enable(&c->mark_lock);
 
        return ret;
 }
@@ -223,13 +222,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
               !is_available_bucket(new);
 }
 
-void bch2_fs_usage_apply(struct bch_fs *c,
-                        struct bch_fs_usage *fs_usage,
-                        struct disk_reservation *disk_res,
-                        struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+                       struct bch_fs_usage *fs_usage,
+                       struct disk_reservation *disk_res,
+                       struct gc_pos gc_pos)
 {
        s64 added = fs_usage->s.data + fs_usage->s.reserved;
        s64 should_not_have_added;
+       int ret = 0;
 
        percpu_rwsem_assert_held(&c->mark_lock);
 
@@ -242,6 +242,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
                      "disk usage increased without a reservation")) {
                atomic64_sub(should_not_have_added, &c->sectors_available);
                added -= should_not_have_added;
+               ret = -1;
        }
 
        if (added > 0) {
@@ -259,6 +260,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
                         (u64 *) fs_usage,
                         sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
        }
+
+       return ret;
 }
 
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -363,10 +366,7 @@ static inline void update_cached_sectors(struct bch_fs *c,
 {
        struct bch_replicas_padded r;
 
-       r.e.data_type   = BCH_DATA_CACHED;
-       r.e.nr_devs     = 1;
-       r.e.nr_required = 1;
-       r.e.devs[0]     = dev;
+       bch2_replicas_entry_cached(&r.e, dev);
 
        update_replicas(c, fs_usage, &r.e, sectors);
 }
@@ -382,7 +382,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
        *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
                BUG_ON(!is_available_bucket(new));
 
-               new.owned_by_allocator  = 1;
+               new.owned_by_allocator  = true;
+               new.dirty               = true;
                new.data_type           = 0;
                new.cached_sectors      = 0;
                new.dirty_sectors       = 0;
@@ -455,6 +456,7 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
               type != BCH_DATA_JOURNAL);
 
        bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+               new.dirty       = true;
                new.data_type   = type;
                checked_add(new.dirty_sectors, sectors);
        }));
@@ -480,13 +482,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                                                    true);
        } else {
                struct bucket *g;
-               struct bucket_mark old, new;
+               struct bucket_mark new;
 
                rcu_read_lock();
 
                g = bucket(ca, b);
-               old = bucket_cmpxchg(g, new, ({
-                       new.data_type = type;
+               bucket_cmpxchg(g, new, ({
+                       new.dirty       = true;
+                       new.data_type   = type;
                        checked_add(new.dirty_sectors, sectors);
                }));
 
@@ -537,6 +540,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
        do {
                new.v.counter = old.v.counter = v;
 
+               new.dirty = true;
+
                /*
                 * Check this after reading bucket mark to guard against
                 * the allocator invalidating a bucket after we've already
@@ -591,9 +596,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
        int blocks_nonempty_delta;
        s64 parity_sectors;
 
+       BUG_ON(!sectors);
+
        m = genradix_ptr(&c->stripes[gc], p.idx);
 
+       spin_lock(&c->ec_stripes_heap_lock);
+
        if (!m || !m->alive) {
+               spin_unlock(&c->ec_stripes_heap_lock);
                bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
                                    (u64) p.idx);
                return -1;
@@ -609,19 +619,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
                parity_sectors = -parity_sectors;
        sectors += parity_sectors;
 
-       new = atomic_add_return(sectors, &m->block_sectors[p.block]);
-       old = new - sectors;
+       old = m->block_sectors[p.block];
+       m->block_sectors[p.block] += sectors;
+       new = m->block_sectors[p.block];
 
        blocks_nonempty_delta = (int) !!new - (int) !!old;
-       if (!blocks_nonempty_delta)
-               return 0;
+       if (blocks_nonempty_delta) {
+               m->blocks_nonempty += blocks_nonempty_delta;
 
-       atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+               if (!gc)
+                       bch2_stripes_heap_update(c, m, p.idx);
+       }
 
-       BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
+       m->dirty = true;
 
-       if (!gc)
-               bch2_stripes_heap_update(c, m, p.idx);
+       spin_unlock(&c->ec_stripes_heap_lock);
 
        update_replicas(c, fs_usage, &m->r.e, sectors);
 
@@ -629,8 +641,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 }
 
 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
-                           s64 sectors,
-                           enum bch_data_type data_type,
+                           s64 sectors, enum bch_data_type data_type,
                            struct bch_fs_usage *fs_usage,
                            unsigned journal_seq, unsigned flags,
                            bool gc)
@@ -701,14 +712,13 @@ static void bucket_set_stripe(struct bch_fs *c,
                BUG_ON(ptr_stale(ca, ptr));
 
                old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+                       new.dirty                       = true;
                        new.stripe                      = enabled;
                        if (journal_seq) {
                                new.journal_seq_valid   = 1;
                                new.journal_seq         = journal_seq;
                        }
                }));
-
-               BUG_ON(old.stripe == enabled);
        }
 }
 
@@ -723,22 +733,19 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
        struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
        unsigned i;
 
+       spin_lock(&c->ec_stripes_heap_lock);
+
        if (!m || (!inserting && !m->alive)) {
+               spin_unlock(&c->ec_stripes_heap_lock);
                bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
                                    idx);
                return -1;
        }
 
-       if (inserting && m->alive) {
-               bch_err_ratelimited(c, "error marking stripe %zu: already exists",
-                                   idx);
-               return -1;
-       }
-
-       BUG_ON(atomic_read(&m->blocks_nonempty));
+       if (m->alive)
+               bch2_stripes_heap_del(c, m, idx);
 
-       for (i = 0; i < EC_STRIPE_MAX; i++)
-               BUG_ON(atomic_read(&m->block_sectors[i]));
+       memset(m, 0, sizeof(*m));
 
        if (inserting) {
                m->sectors      = le16_to_cpu(s.v->sectors);
@@ -754,7 +761,6 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 
                for (i = 0; i < s.v->nr_blocks; i++)
                        m->r.e.devs[i] = s.v->ptrs[i].dev;
-       }
 
        /*
         * XXX: account for stripes somehow here
@@ -763,15 +769,23 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
        update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
 #endif
 
-       if (!gc) {
-               if (inserting)
+               /* gc recalculates these fields: */
+               if (!(flags & BCH_BUCKET_MARK_GC)) {
+                       for (i = 0; i < s.v->nr_blocks; i++) {
+                               m->block_sectors[i] =
+                                       stripe_blockcount_get(s.v, i);
+                               m->blocks_nonempty += !!m->block_sectors[i];
+                       }
+               }
+
+               if (!gc)
                        bch2_stripes_heap_insert(c, m, idx);
                else
-                       bch2_stripes_heap_del(c, m, idx);
-       } else {
-               m->alive = inserting;
+                       m->alive = true;
        }
 
+       spin_unlock(&c->ec_stripes_heap_lock);
+
        bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
        return 0;
 }
@@ -879,6 +893,8 @@ void bch2_mark_update(struct btree_insert *trans,
        struct bch_fs_usage     *fs_usage;
        struct gc_pos           pos = gc_pos_btree_node(b);
        struct bkey_packed      *_k;
+       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+       static int warned_disk_usage = 0;
 
        if (!btree_node_type_needs_gc(iter->btree_id))
                return;
@@ -939,7 +955,37 @@ void bch2_mark_update(struct btree_insert *trans,
                bch2_btree_node_iter_advance(&node_iter, b);
        }
 
-       bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
+       if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+           !warned_disk_usage &&
+           !xchg(&warned_disk_usage, 1)) {
+               char buf[200];
+
+               pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+
+               pr_err("while inserting");
+               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+               pr_err("%s", buf);
+               pr_err("overlapping with");
+
+               node_iter = iter->l[0].iter;
+               while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+                                                             KEY_TYPE_discard))) {
+                       struct bkey             unpacked;
+                       struct bkey_s_c         k;
+
+                       k = bkey_disassemble(b, _k, &unpacked);
+
+                       if (btree_node_is_extents(b)
+                           ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+                           : bkey_cmp(insert->k->k.p, k.k->p))
+                               break;
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       pr_err("%s", buf);
+
+                       bch2_btree_node_iter_advance(&node_iter, b);
+               }
+       }
 
        percpu_up_read_preempt_enable(&c->mark_lock);
 }
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index ebd39e85fad4c6aeb31d04ab0f5592b34d135f71..6f3681728f0a3029016f9209dc64d2e40d199d99 100644
@@ -181,6 +181,8 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
 
+void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *);
+
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
                                          struct bch_dev_usage stats)
 {
@@ -264,8 +266,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
                  bool, s64, struct gc_pos,
                  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-                        struct disk_reservation *, struct gc_pos);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+                       struct disk_reservation *, struct gc_pos);
 
 /* disk reservations: */
 
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 56ceb260b2bc6840a8c55135060ec15fb219a343..b84ae5c937e5ebae81c769279adb2d311d25efd3 100644
@@ -402,6 +402,8 @@ static long bch2_ioctl_usage(struct bch_fs *c,
                if (!src)
                        return -ENOMEM;
 
+               percpu_up_read_preempt_enable(&c->mark_lock);
+
                dst.used                = bch2_fs_sectors_used(c, *src);
                dst.online_reserved     = src->s.online_reserved;
 
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 755a26039cccf8c88d67c3065ba0797d8bdc89f7..8018c2bc2b5e6111e0ecf3a62558d2a75fe2877d 100644
@@ -11,6 +11,7 @@
 #include "ec.h"
 #include "error.h"
 #include "io.h"
+#include "journal_io.h"
 #include "keylist.h"
 #include "super-io.h"
 #include "util.h"
@@ -98,40 +99,6 @@ struct ec_bio {
 
 /* Stripes btree keys: */
 
-static unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
-       return DIV_ROUND_UP(le16_to_cpu(s->sectors),
-                           1 << s->csum_granularity_bits);
-}
-
-static unsigned stripe_csum_offset(const struct bch_stripe *s,
-                                  unsigned dev, unsigned csum_idx)
-{
-       unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
-       return sizeof(struct bch_stripe) +
-               sizeof(struct bch_extent_ptr) * s->nr_blocks +
-               (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static unsigned stripe_blockcount_offset(const struct bch_stripe *s,
-                                        unsigned idx)
-{
-       return stripe_csum_offset(s, s->nr_blocks, 0) +
-               sizeof(16) * idx;
-}
-
-static unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
-       return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
-                           sizeof(u64));
-}
-
-static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
-{
-       return (void *) s + stripe_csum_offset(s, dev, csum_idx);
-}
-
 const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@@ -164,8 +131,9 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
               1U << s->csum_granularity_bits);
 
        for (i = 0; i < s->nr_blocks; i++)
-               pr_buf(out, " %u:%llu", s->ptrs[i].dev,
-                      (u64) s->ptrs[i].offset);
+               pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
+                      (u64) s->ptrs[i].offset,
+                      stripe_blockcount_get(s, i));
 }
 
 static int ptr_matches_stripe(struct bch_fs *c,
@@ -609,29 +577,15 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
        BUG_ON(h->data[m->heap_idx].idx != idx);
 }
 
-static inline unsigned stripe_entry_blocks(struct stripe *m)
-{
-       return atomic_read(&m->blocks_nonempty);
-}
-
 void bch2_stripes_heap_update(struct bch_fs *c,
                              struct stripe *m, size_t idx)
 {
        ec_stripes_heap *h = &c->ec_stripes_heap;
-       bool queue_delete;
        size_t i;
 
-       spin_lock(&c->ec_stripes_heap_lock);
-
-       if (!m->alive) {
-               spin_unlock(&c->ec_stripes_heap_lock);
-               return;
-       }
-
        heap_verify_backpointer(c, idx);
 
-       h->data[m->heap_idx].blocks_nonempty =
-               stripe_entry_blocks(m);
+       h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
 
        i = m->heap_idx;
        heap_sift_up(h,   i, ec_stripes_heap_cmp,
@@ -641,44 +595,35 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 
        heap_verify_backpointer(c, idx);
 
-       queue_delete = stripe_idx_to_delete(c) >= 0;
-       spin_unlock(&c->ec_stripes_heap_lock);
-
-       if (queue_delete)
+       if (stripe_idx_to_delete(c) >= 0)
                schedule_work(&c->ec_stripe_delete_work);
 }
 
 void bch2_stripes_heap_del(struct bch_fs *c,
                           struct stripe *m, size_t idx)
 {
-       spin_lock(&c->ec_stripes_heap_lock);
        heap_verify_backpointer(c, idx);
 
        m->alive = false;
        heap_del(&c->ec_stripes_heap, m->heap_idx,
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);
-       spin_unlock(&c->ec_stripes_heap_lock);
 }
 
 void bch2_stripes_heap_insert(struct bch_fs *c,
                              struct stripe *m, size_t idx)
 {
-       spin_lock(&c->ec_stripes_heap_lock);
-
        BUG_ON(heap_full(&c->ec_stripes_heap));
 
        heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
                        .idx = idx,
-                       .blocks_nonempty = stripe_entry_blocks(m),
+                       .blocks_nonempty = m->blocks_nonempty,
                }),
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);
        m->alive = true;
 
        heap_verify_backpointer(c, idx);
-
-       spin_unlock(&c->ec_stripes_heap_lock);
 }
 
 /* stripe deletion */
@@ -1217,6 +1162,116 @@ unlock:
        mutex_unlock(&c->ec_new_stripe_lock);
 }
 
+static int __bch2_stripe_write_key(struct bch_fs *c,
+                                  struct btree_iter *iter,
+                                  struct stripe *m,
+                                  size_t idx,
+                                  struct bkey_i_stripe *new_key,
+                                  unsigned flags)
+{
+       struct bkey_s_c k;
+       unsigned i;
+       int ret;
+
+       bch2_btree_iter_set_pos(iter, POS(0, idx));
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = btree_iter_err(k);
+       if (ret)
+               return ret;
+
+       if (k.k->type != KEY_TYPE_stripe)
+               return -EIO;
+
+       bkey_reassemble(&new_key->k_i, k);
+
+       spin_lock(&c->ec_stripes_heap_lock);
+
+       for (i = 0; i < new_key->v.nr_blocks; i++)
+               stripe_blockcount_set(&new_key->v, i,
+                                     m->block_sectors[i]);
+       m->dirty = false;
+
+       spin_unlock(&c->ec_stripes_heap_lock);
+
+       return bch2_btree_insert_at(c, NULL, NULL,
+                                  BTREE_INSERT_NOFAIL|flags,
+                                  BTREE_INSERT_ENTRY(iter, &new_key->k_i));
+}
+
+int bch2_stripes_write(struct bch_fs *c, bool *wrote)
+{
+       struct btree_iter iter;
+       struct genradix_iter giter;
+       struct bkey_i_stripe *new_key;
+       struct stripe *m;
+       int ret = 0;
+
+       new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
+       BUG_ON(!new_key);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+       genradix_for_each(&c->stripes[0], giter, m) {
+               if (!m->dirty)
+                       continue;
+
+               ret = __bch2_stripe_write_key(c, &iter, m, giter.pos,
+                                       new_key, BTREE_INSERT_NOCHECK_RW);
+               if (ret)
+                       break;
+
+               *wrote = true;
+       }
+
+       bch2_btree_iter_unlock(&iter);
+
+       kfree(new_key);
+
+       return ret;
+}
+
+static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+
+       struct gc_pos pos = { 0 };
+
+       bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
+}
+
+int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+       struct journal_replay *r;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       ret = bch2_fs_ec_start(c);
+       if (ret)
+               return ret;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) {
+               bch2_stripe_read_key(c, k);
+               bch2_btree_iter_cond_resched(&iter);
+       }
+
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
+
+       list_for_each_entry(r, journal_replay_list, list) {
+               struct bkey_i *k, *n;
+               struct jset_entry *entry;
+
+               for_each_jset_key(k, n, entry, &r->j)
+                       if (entry->btree_id == BTREE_ID_EC)
+                               bch2_stripe_read_key(c, bkey_i_to_s_c(k));
+       }
+
+       return 0;
+}
+
 int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
 {
        struct btree_iter iter;
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index c728c52c73dbab63c6cc67500cfb2f801c5f4310..2817833086f0a761145de208e730bea54ca32cfb 100644
@@ -13,6 +13,55 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
        .val_to_text    = bch2_stripe_to_text,          \
 }
 
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+       return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+                           1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+                                         unsigned dev, unsigned csum_idx)
+{
+       unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+       return sizeof(struct bch_stripe) +
+               sizeof(struct bch_extent_ptr) * s->nr_blocks +
+               (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+                                               unsigned idx)
+{
+       return stripe_csum_offset(s, s->nr_blocks, 0) +
+               sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+                                            unsigned idx)
+{
+       return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+                                        unsigned idx, unsigned v)
+{
+       __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+       *p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+       return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+                           sizeof(u64));
+}
+
+static inline void *stripe_csum(struct bch_stripe *s,
+                               unsigned dev, unsigned csum_idx)
+{
+       return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+}
+
 struct bch_read_bio;
 
 struct ec_stripe_buf {
@@ -100,6 +149,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
 
 void bch2_ec_flush_new_stripes(struct bch_fs *);
 
+int bch2_stripes_read(struct bch_fs *, struct list_head *);
+int bch2_stripes_write(struct bch_fs *, bool *);
+
 int bch2_ec_mem_alloc(struct bch_fs *, bool);
 
 int bch2_fs_ec_start(struct bch_fs *);
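
Taken together, the inline helpers moved into ec.h above pin down the on-disk stripe value layout that now carries the persisted per-block sector counts; a summary sketch, with the layout inferred purely from those offset helpers (it is not spelled out elsewhere in the commit):

/*
 * Stripe value layout implied by the offset helpers above:
 *
 *   struct bch_stripe                        header
 *   struct bch_extent_ptr[nr_blocks]         block pointers
 *   csum[nr_blocks][csums_per_device]        per-block checksums
 *   __le16[nr_blocks]                        persisted block sector counts
 *
 * so stripe_blockcount_offset(s, i) ==
 *         stripe_csum_offset(s, s->nr_blocks, 0) + sizeof(u16) * i,
 * and stripe_blockcount_get()/_set() read and write those __le16 counts.
 */
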
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 44c5d3821a3868066f45975c595637d3ecc38a90..b4d377053c875d409084904cf6e6337d514dcfb2 100644
@@ -19,9 +19,10 @@ struct stripe {
        u8                      nr_blocks;
        u8                      nr_redundant;
 
-       u8                      alive;
-       atomic_t                blocks_nonempty;
-       atomic_t                block_sectors[EC_STRIPE_MAX];
+       unsigned                alive:1;
+       unsigned                dirty:1;
+       u8                      blocks_nonempty;
+       u16                     block_sectors[EC_STRIPE_MAX];
 
        struct bch_replicas_padded r;
 };
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 2980416871d8791360cc3c02455d27435b00116c..0f075fa1d3600b7cb4153c0c3198b1de3325dcc1 100644
@@ -1664,12 +1664,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
        return ret == BCH_MERGE_MERGE;
 }
 
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+                              unsigned nr_replicas)
 {
        struct btree_iter iter;
        struct bpos end = pos;
        struct bkey_s_c k;
-       int ret = 0;
+       bool ret = true;
 
        end.offset += size;
 
@@ -1678,8 +1679,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
-               if (!bch2_extent_is_fully_allocated(k)) {
-                       ret = -ENOSPC;
+               if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+                       ret = false;
                        break;
                }
        }
@@ -1688,6 +1689,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
        return ret;
 }
 
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+       unsigned ret = 0;
+
+       switch (k.k->type) {
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+
+               extent_for_each_ptr_decode(e, p, entry)
+                       ret += !p.ptr.cached &&
+                               p.crc.compression_type == BCH_COMPRESSION_NONE;
+               break;
+       }
+       case KEY_TYPE_reservation:
+               ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+               break;
+       }
+
+       return ret;
+}
+
 /* KEY_TYPE_reservation: */
 
 const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 0e6f4a0bbcabd7bd676ecc5e9afc497d6aa9dd67..698b25818afbaeda89954a42f7258579eaddd0f2 100644
@@ -571,6 +571,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
                BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 
 #endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 66fa227c552d326f22260b868cd5f1bee4eca98f..d19d809c7580d8fbd61dc3107ae224622eb63e55 100644
@@ -262,18 +262,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
        }
 }
 
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
-                                    eytzinger_cmp_fn cmp, const void *search)
-{
-       size_t i = 0;
-       int res;
-
-       while (i < nr &&
-              (res = cmp(search, base + i * size, size)))
-               i = eytzinger0_child(i, res > 0);
-
-       return i;
-}
+#define eytzinger0_find(base, nr, size, _cmp, search)                  \
+({                                                                     \
+       void *_base     = (base);                                       \
+       void *_search   = (search);                                     \
+       size_t _nr      = (nr);                                         \
+       size_t _size    = (size);                                       \
+       size_t _i       = 0;                                            \
+       int _res;                                                       \
+                                                                       \
+       while (_i < _nr &&                                              \
+              (_res = _cmp(_search, _base + _i * _size, _size)))       \
+               _i = eytzinger0_child(_i, _res > 0);                    \
+       _i;                                                             \
+})
 
 void eytzinger0_sort(void *, size_t, size_t,
                    int (*cmp_func)(const void *, const void *, size_t),
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index c1739f53a9f24a99b5d8b1bb22415d8a30a9b8ed..2cfc2d9e7db5137f7944032e9e13a74d30185235 100644
@@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
                BUG_ON(btree_iter_err(old));
 
                if (allocating &&
-                   !bch2_extent_is_fully_allocated(old))
+                   !*allocating &&
+                   bch2_bkey_nr_ptrs_allocated(old) <
+                   bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
                        *allocating = true;
 
                delta += (min(new->k.p.offset,
@@ -858,9 +860,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
-       unsigned nr_ptrs = !bch2_extent_is_compressed(k)
-               ? bch2_bkey_nr_dirty_ptrs(k)
-               : 0;
+       unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
 
        bio_for_each_segment(bv, bio, iter) {
                /* brand new pages, don't need to be locked: */
@@ -1759,6 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
        struct bch_inode_info *inode = dio->iop.inode;
        struct bio *bio = &dio->iop.op.wbio.bio;
        struct bio_vec *bv;
+       loff_t offset;
        bool sync;
        long ret;
        int i;
@@ -1770,12 +1771,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
        __pagecache_block_get(&mapping->add_lock);
 
        /* Write and invalidate pagecache range that we're writing to: */
-       ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
-                               req->ki_pos + iov_iter_count(&dio->iter) - 1);
+       offset = req->ki_pos + (dio->iop.op.written << 9);
+       ret = write_invalidate_inode_pages_range(mapping,
+                                       offset,
+                                       offset + iov_iter_count(&dio->iter) - 1);
        if (unlikely(ret))
                goto err;
 
        while (1) {
+               offset = req->ki_pos + (dio->iop.op.written << 9);
+
                BUG_ON(current->pagecache_lock);
                current->pagecache_lock = &mapping->add_lock;
                if (kthread)
@@ -1792,13 +1797,12 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 
                /* gup might have faulted pages back in: */
                ret = write_invalidate_inode_pages_range(mapping,
-                               req->ki_pos + (dio->iop.op.written << 9),
-                               req->ki_pos + iov_iter_count(&dio->iter) - 1);
+                               offset,
+                               offset + bio->bi_iter.bi_size - 1);
                if (unlikely(ret))
                        goto err;
 
-               dio->iop.op.pos = POS(inode->v.i_ino,
-                               (req->ki_pos >> 9) + dio->iop.op.written);
+               dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9);
 
                task_io_account_write(bio->bi_iter.bi_size);
 
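The loop now recomputes the byte offset once per iteration from the sectors already written (op.written counts 512-byte sectors), instead of threading sector arithmetic through each call. A standalone illustration of the conversion, with made-up values:

        #include <stdio.h>

        int main(void)
        {
                long long ki_pos  = 1 << 20;    /* write began at 1 MiB */
                unsigned  written = 24;         /* 512-byte sectors completed so far */
                long long offset  = ki_pos + ((long long) written << 9);

                printf("resume at byte %lld\n", offset);        /* 1048576 + 12288 = 1060864 */
                return 0;
        }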
@@ -1878,7 +1882,6 @@ static int bch2_direct_IO_write(struct kiocb *req,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct dio_write *dio;
        struct bio *bio;
-       loff_t offset = req->ki_pos;
        ssize_t ret;
 
        lockdep_assert_held(&inode->v.i_rwsem);
@@ -1886,7 +1889,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
        if (unlikely(!iter->count))
                return 0;
 
-       if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+       if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
                return -EINVAL;
 
        bio = bio_alloc_bioset(GFP_KERNEL,
@@ -1898,7 +1901,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
        dio->mm                 = current->mm;
        dio->loop               = false;
        dio->sync               = is_sync_kiocb(req) ||
-               offset + iter->count > inode->v.i_size;
+               req->ki_pos + iter->count > inode->v.i_size;
        dio->free_iov           = false;
        dio->quota_res.sectors  = 0;
        dio->iter               = *iter;
@@ -1915,19 +1918,20 @@ static int bch2_direct_IO_write(struct kiocb *req,
        if (unlikely(ret))
                goto err;
 
+       dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas;
+
        ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
                                        dio->iop.op.opts.data_replicas, 0);
        if (unlikely(ret)) {
-               if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
-                                                     offset >> 9),
-                                              iter->count >> 9))
+               if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
+                                                      req->ki_pos >> 9),
+                                               iter->count >> 9,
+                                               dio->iop.op.opts.data_replicas))
                        goto err;
 
                dio->iop.unalloc = true;
        }
 
-       dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
-
        return bch2_dio_write_loop(dio);
 err:
        bch2_disk_reservation_put(c, &dio->iop.op.res);
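The reservation-failure fallback above also flips polarity: bch2_check_range_allocated() now answers "is this range already allocated with at least this many replicas", and only then may the write proceed without a reservation. The shape of that fallback, reduced to a sketch with hypothetical helper names:

        ret = reserve_space(c, sectors, nr_replicas);           /* hypothetical */
        if (ret) {
                if (!range_fully_allocated(c, pos, sectors, nr_replicas))
                        goto err;               /* genuinely out of space */
                /* pure overwrite: nothing new to allocate, skip the reservation */
                dio->iop.unalloc = true;
        }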
index 52498627f70e4dbcf36eb53a63c1d1fe365a50af..5cc0651c7449f761750a553ccd70d87b2e92da3c 100644 (file)
@@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        }
 
        list_for_each_entry(i, list, list) {
+               struct bch_replicas_padded replicas;
+               char buf[80];
+
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
@@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                    fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
-                                                      i->devs, false), c,
-                                "superblock not marked as containing replicas (type %u)",
-                                BCH_DATA_JOURNAL))) {
-                       ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+                                "superblock not marked as containing replicas %s",
+                                (bch2_replicas_entry_to_text(&PBUF(buf),
+                                                             &replicas.e), buf)))) {
+                       ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                return ret;
                }
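bch2_mark_replicas() now takes a bch_replicas_entry built up front instead of a (data type, device list) pair; bch_replicas_padded supplies worst-case storage so the entry can live on the stack. A usage sketch (devs stands in for a journal entry's device list; the buffer size mirrors the code above, and the printed format is whatever bch2_replicas_entry_to_text() emits):

        struct bch_replicas_padded r;
        char buf[80];

        bch2_devlist_to_replicas(&r.e, BCH_DATA_JOURNAL, devs);

        if (!bch2_replicas_marked(c, &r.e, false)) {
                bch2_replicas_entry_to_text(&PBUF(buf), &r.e);
                bch_err(c, "journal replicas not marked: %s", buf);
        }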
@@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
        struct journal_buf *w = journal_prev_buf(j);
        struct bch_devs_list devs =
                bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+       struct bch_replicas_padded replicas;
        u64 seq = le64_to_cpu(w->data->seq);
        u64 last_seq = le64_to_cpu(w->data->last_seq);
 
@@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
                goto err;
        }
 
-       if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+       bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+       if (bch2_mark_replicas(c, &replicas.e))
                goto err;
 
        spin_lock(&j->lock);
index 4a9973664a3667fbcdd9c40efdbd81ffcc07ada6..a795e888c56b4d699641db79efea1019ca9855f5 100644 (file)
@@ -335,7 +335,7 @@ void bch2_journal_reclaim_work(struct work_struct *work)
                mutex_unlock(&j->reclaim_lock);
 
        if (!test_bit(BCH_FS_RO, &c->flags))
-               queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+               queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
 }
 
@@ -387,7 +387,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
-       struct bch_devs_list devs;
        u64 iter, seq = 0;
        int ret = 0;
 
@@ -412,12 +411,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 
        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
+               struct bch_replicas_padded replicas;
+
                seq = max(seq, journal_last_seq(j));
-               devs = journal_seq_pin(j, seq)->devs;
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+                                        journal_seq_pin(j, seq)->devs);
                seq++;
 
                spin_unlock(&j->lock);
-               ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+               ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);
index b21986514d66335f49791a3184bc7149c92e94f0..bb425d88a84a653439dd76404e386e250c57d6a8 100644 (file)
@@ -4,6 +4,7 @@
 
 #include "bcachefs.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "extents.h"
 #include "io.h"
@@ -152,6 +153,16 @@ retry:
                bch2_btree_iter_unlock(&iter);
        }
 
+       /* flush relevant btree updates */
+       while (1) {
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_nr_pending(c) ||
+                                  c->btree_roots_dirty);
+               if (!bch2_btree_interior_updates_nr_pending(c))
+                       break;
+               bch2_journal_meta(&c->journal);
+       }
+
        ret = 0;
 out:
        ret = bch2_replicas_gc_end(c, ret);
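The flush loop added above (and duplicated in bch2_data_job() in move.c below) waits for in-flight btree interior updates; when dirty btree roots are what is holding them up, a journal meta write flushes the roots so the wait can make progress. It could be factored into a helper, sketched here under an invented name:

        static void bch2_flush_btree_interior_updates(struct bch_fs *c)
        {
                while (1) {
                        closure_wait_event(&c->btree_interior_update_wait,
                                           !bch2_btree_interior_updates_nr_pending(c) ||
                                           c->btree_roots_dirty);
                        if (!bch2_btree_interior_updates_nr_pending(c))
                                break;
                        /* dirty roots pin the journal; a meta write flushes them */
                        bch2_journal_meta(&c->journal);
                }
        }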
index 80909ae4fb6dee401d0067ff1202156bb0728c82..98cfcefd9cdfb9aa543a0cc336117f2ebb71ae36 100644 (file)
@@ -3,6 +3,7 @@
 #include "alloc_foreground.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "disk_groups.h"
 #include "inode.h"
@@ -763,6 +764,16 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
                ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
+               while (1) {
+                       closure_wait_event(&c->btree_interior_update_wait,
+                                          !bch2_btree_interior_updates_nr_pending(c) ||
+                                          c->btree_roots_dirty);
+                       if (!bch2_btree_interior_updates_nr_pending(c))
+                               break;
+                       bch2_journal_meta(&c->journal);
+               }
+
                ret = bch2_gc_btree_replicas(c) ?: ret;
 
                ret = bch2_move_data(c, NULL,
index eae38ea7d5be9ec1b1ffbf0a47665655e80c7f50..f5f3f94ea44af8d32d135b4b8280070d1545a8e8 100644 (file)
@@ -214,12 +214,12 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (ret)
                goto err;
 
-       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
-       err = "cannot allocate memory";
-       ret = bch2_fs_ec_start(c);
+       ret = bch2_stripes_read(c, &journal);
        if (ret)
                goto err;
+       pr_info("stripes_read done");
+
+       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
        bch_verbose(c, "starting mark and sweep:");
        err = "error in recovery";
index 66ca13aab4e88d12f5a0065828333c03a2ee1666..230f807bdf107cd8ea84843bc2639d15cc6d7b42 100644 (file)
@@ -13,6 +13,16 @@ static inline int u8_cmp(u8 l, u8 r)
        return (l > r) - (l < r);
 }
 
+static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       unsigned i;
+
+       for (i = 0; i + 1 < e->nr_devs; i++)
+               BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
 static void replicas_entry_sort(struct bch_replicas_entry *e)
 {
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
@@ -23,19 +33,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
             (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
             _i = (void *) (_i) + (_r)->entry_size)
 
-static inline struct bch_replicas_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-       return (void *) r->entries + r->entry_size * i;
-}
-
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
 }
 
-static void replicas_entry_to_text(struct printbuf *out,
-                                 struct bch_replicas_entry *e)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+                                struct bch_replicas_entry *e)
 {
        unsigned i;
 
@@ -60,7 +64,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
                        pr_buf(out, " ");
                first = false;
 
-               replicas_entry_to_text(out, e);
+               bch2_replicas_entry_to_text(out, e);
        }
 }
 
@@ -100,8 +104,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
                r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(struct bkey_s_c k,
-                            struct bch_replicas_entry *e)
+static void bkey_to_replicas(struct bch_replicas_entry *e,
+                            struct bkey_s_c k)
 {
        e->nr_devs = 0;
 
@@ -119,11 +123,13 @@ static void bkey_to_replicas(struct bkey_s_c k,
                stripe_to_replicas(k, e);
                break;
        }
+
+       replicas_entry_sort(e);
 }
 
-static inline void devlist_to_replicas(struct bch_devs_list devs,
-                                      enum bch_data_type data_type,
-                                      struct bch_replicas_entry *e)
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+                             enum bch_data_type data_type,
+                             struct bch_devs_list devs)
 {
        unsigned i;
 
@@ -137,6 +143,8 @@ static inline void devlist_to_replicas(struct bch_devs_list devs,
 
        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];
+
+       replicas_entry_sort(e);
 }
 
 static struct bch_replicas_cpu
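Both conversion helpers now end by sorting the device array, so every bch_replicas_entry reaching the lookup paths is already canonical. As an illustration (field values inferred from the code above, not literal output), a journal entry built from devices {3, 1} comes out as:

        e->data_type   = BCH_DATA_JOURNAL;
        e->nr_devs     = 2;
        e->nr_required = 1;
        e->devs[0]     = 1;     /* replicas_entry_sort() orders devices ascending */
        e->devs[1]     = 3;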
@@ -150,6 +158,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
                                        replicas_entry_bytes(new_entry)),
        };
 
+       BUG_ON(!new_entry->data_type);
+       verify_replicas_entry_sorted(new_entry);
+
        new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
        if (!new.entries)
                return new;
@@ -175,13 +186,12 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
        if (unlikely(entry_size > r->entry_size))
                return -1;
 
-       replicas_entry_sort(search);
-
-       while (entry_size < r->entry_size)
-               ((char *) search)[entry_size++] = 0;
+       verify_replicas_entry_sorted(search);
 
+#define entry_cmp(_l, _r, size)        memcmp(_l, _r, entry_size)
        idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
-                             memcmp, search);
+                             entry_cmp, search);
+#undef entry_cmp
 
        return idx < r->nr ? idx : -1;
 }
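This is the payoff of the eytzinger0_find() macro conversion: entry_cmp ignores its size argument and closes over the local entry_size, so a search entry shorter than r->entry_size no longer has to be zero-padded before comparing. A plain function pointer could only do the same with an explicit context parameter, roughly:

        struct entry_cmp_ctx { size_t entry_size; };    /* hypothetical */

        static int entry_cmp_fn(const void *l, const void *r, void *ctx)
        {
                return memcmp(l, r, ((struct entry_cmp_ctx *) ctx)->entry_size);
        }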
@@ -189,6 +199,8 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 int bch2_replicas_entry_idx(struct bch_fs *c,
                            struct bch_replicas_entry *search)
 {
+       replicas_entry_sort(search);
+
        return __replicas_entry_idx(&c->replicas, search);
 }
 
@@ -198,12 +210,17 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
        return __replicas_entry_idx(r, search) >= 0;
 }
 
-static bool replicas_has_entry(struct bch_fs *c,
-                              struct bch_replicas_entry *search,
-                              bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+                         struct bch_replicas_entry *search,
+                         bool check_gc_replicas)
 {
        bool marked;
 
+       if (!search->nr_devs)
+               return true;
+
+       verify_replicas_entry_sorted(search);
+
        percpu_down_read_preempt_disable(&c->mark_lock);
        marked = __replicas_has_entry(&c->replicas, search) &&
                (!check_gc_replicas ||
@@ -214,35 +231,31 @@ static bool replicas_has_entry(struct bch_fs *c,
        return marked;
 }
 
-static void __replicas_table_update(struct bch_fs_usage __percpu *dst,
+static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
                                    struct bch_replicas_cpu *dst_r,
-                                   struct bch_fs_usage __percpu *src,
+                                   struct bch_fs_usage __percpu *src_p,
                                    struct bch_replicas_cpu *src_r)
 {
-       int src_idx, dst_idx, cpu;
+       unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+       struct bch_fs_usage *dst, *src = (void *)
+               bch2_acc_percpu_u64s((void *) src_p, src_nr);
+       int src_idx, dst_idx;
 
-       for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
-               u64 *dst_v, src_v = 0;
+       preempt_disable();
+       dst = this_cpu_ptr(dst_p);
+       preempt_enable();
 
-               for_each_possible_cpu(cpu)
-                       src_v += *per_cpu_ptr(&src->data[src_idx], cpu);
+       *dst = *src;
 
-               dst_idx = __replicas_entry_idx(dst_r,
-                               cpu_replicas_entry(src_r, src_idx));
-
-               if (dst_idx < 0) {
-                       BUG_ON(src_v);
+       for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+               if (!src->data[src_idx])
                        continue;
-               }
-
-               preempt_disable();
 
-               dst_v = this_cpu_ptr(&dst->data[dst_idx]);
-               BUG_ON(*dst_v);
-
-               *dst_v = src_v;
+               dst_idx = __replicas_entry_idx(dst_r,
+                               cpu_replicas_entry(src_r, src_idx));
+               BUG_ON(dst_idx < 0);
 
-               preempt_enable();
+               dst->data[dst_idx] = src->data[src_idx];
        }
 }
 
@@ -344,30 +357,32 @@ err:
        return ret;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-                               struct bch_replicas_entry *devs)
+int bch2_mark_replicas(struct bch_fs *c,
+                      struct bch_replicas_entry *r)
 {
-       return likely(replicas_has_entry(c, devs, true))
+       return likely(bch2_replicas_marked(c, r, true))
                ? 0
-               : bch2_mark_replicas_slowpath(c, devs);
+               : bch2_mark_replicas_slowpath(c, r);
 }
 
-int bch2_mark_replicas(struct bch_fs *c,
-                      enum bch_data_type data_type,
-                      struct bch_devs_list devs)
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+                              struct bkey_s_c k,
+                              bool check_gc_replicas)
 {
        struct bch_replicas_padded search;
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
 
-       if (!devs.nr)
-               return 0;
-
-       memset(&search, 0, sizeof(search));
+       for (i = 0; i < cached.nr; i++) {
+               bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-       BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+               if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
+                       return false;
+       }
 
-       devlist_to_replicas(devs, data_type, &search.e);
+       bkey_to_replicas(&search.e, k);
 
-       return __bch2_mark_replicas(c, &search.e);
+       return bch2_replicas_marked(c, &search.e, check_gc_replicas);
 }
 
 int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@@ -377,18 +392,17 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
        unsigned i;
        int ret;
 
-       memset(&search, 0, sizeof(search));
+       for (i = 0; i < cached.nr; i++) {
+               bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-       for (i = 0; i < cached.nr; i++)
-               if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-                                             bch2_dev_list_single(cached.devs[i]))))
+               ret = bch2_mark_replicas(c, &search.e);
+               if (ret)
                        return ret;
+       }
 
-       bkey_to_replicas(k, &search.e);
+       bkey_to_replicas(&search.e, k);
 
-       return search.e.nr_devs
-               ? __bch2_mark_replicas(c, &search.e)
-               : 0;
+       return bch2_mark_replicas(c, &search.e);
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
@@ -749,7 +763,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
                        pr_buf(out, " ");
                first = false;
 
-               replicas_entry_to_text(out, e);
+               bch2_replicas_entry_to_text(out, e);
        }
 }
 
@@ -798,46 +812,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
 
 /* Query replicas: */
 
-bool bch2_replicas_marked(struct bch_fs *c,
-                         enum bch_data_type data_type,
-                         struct bch_devs_list devs,
-                         bool check_gc_replicas)
-{
-       struct bch_replicas_padded search;
-
-       if (!devs.nr)
-               return true;
-
-       memset(&search, 0, sizeof(search));
-
-       devlist_to_replicas(devs, data_type, &search.e);
-
-       return replicas_has_entry(c, &search.e, check_gc_replicas);
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-                              struct bkey_s_c k,
-                              bool check_gc_replicas)
-{
-       struct bch_replicas_padded search;
-       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-       unsigned i;
-
-       memset(&search, 0, sizeof(search));
-
-       for (i = 0; i < cached.nr; i++)
-               if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-                                         bch2_dev_list_single(cached.devs[i]),
-                                         check_gc_replicas))
-                       return false;
-
-       bkey_to_replicas(k, &search.e);
-
-       return search.e.nr_devs
-               ? replicas_has_entry(c, &search.e, check_gc_replicas)
-               : true;
-}
-
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
 {
index fc833653b8590399d5be56a93e06a2e1c75dc48d..0ac2b8e082cdda0be8991c74be8d79f0ecca09ec 100644 (file)
@@ -4,17 +4,39 @@
 #include "eytzinger.h"
 #include "replicas_types.h"
 
+void bch2_replicas_entry_to_text(struct printbuf *,
+                                struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+       return (void *) r->entries + r->entry_size * i;
+}
+
 int bch2_replicas_entry_idx(struct bch_fs *,
                            struct bch_replicas_entry *);
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
-                         struct bch_devs_list, bool);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+                             enum bch_data_type,
+                             struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+                         struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+                      struct bch_replicas_entry *);
+
 bool bch2_bkey_replicas_marked(struct bch_fs *,
                               struct bkey_s_c, bool);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
-                      struct bch_devs_list);
 int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+                                             unsigned dev)
+{
+       e->data_type    = BCH_DATA_CACHED;
+       e->nr_devs      = 1;
+       e->nr_required  = 1;
+       e->devs[0]      = dev;
+}
 
 struct replicas_status {
        struct {
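bch2_replicas_entry_cached() builds the one-device entry that used to be spelled as bch2_dev_list_single() plus a conversion, which is why stack-allocated bch_replicas_padded entries now cover the cached-pointer checks too. A usage sketch (the device index is made up):

        struct bch_replicas_padded search;

        bch2_replicas_entry_cached(&search.e, 2);       /* device index 2 */

        if (!bch2_replicas_marked(c, &search.e, true))
                /* cached copy on device 2 not yet recorded */
                ret = bch2_mark_replicas(c, &search.e);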
index a539f2a82a7325ad7c3c51161110a55d8de7c437..1835b5355c67df3a9d9e16d92ad1cb531bbc9b26 100644 (file)
@@ -205,7 +205,9 @@ int bch2_congested(void *data, int bdi_bits)
 static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
+       bool wrote;
        unsigned i;
+       int ret;
 
        bch2_rebalance_stop(c);
 
@@ -220,23 +222,42 @@ static void __bch2_fs_read_only(struct bch_fs *c)
         */
        bch2_journal_flush_all_pins(&c->journal);
 
-       for_each_member_device(ca, c, i)
-               bch2_dev_allocator_stop(ca);
+       do {
+               ret = bch2_alloc_write(c, false, &wrote);
+               if (ret) {
+                       bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+                       break;
+               }
 
-       bch2_journal_flush_all_pins(&c->journal);
+               ret = bch2_stripes_write(c, &wrote);
+               if (ret) {
+                       bch2_fs_inconsistent(c, "error writing out stripes %i", ret);
+                       break;
+               }
 
-       /*
-        * We need to explicitly wait on btree interior updates to complete
-        * before stopping the journal, flushing all journal pins isn't
-        * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
-        * interior updates have to drop their journal pin before they're
-        * fully complete:
-        */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+               for_each_member_device(ca, c, i)
+                       bch2_dev_allocator_quiesce(c, ca);
+
+               bch2_journal_flush_all_pins(&c->journal);
+
+               /*
+                * We need to explicitly wait on btree interior updates to complete
+                * before stopping the journal, flushing all journal pins isn't
+                * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
+                * interior updates have to drop their journal pin before they're
+                * fully complete:
+                */
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_nr_pending(c));
+       } while (wrote);
+
+       for_each_member_device(ca, c, i)
+               bch2_dev_allocator_stop(ca);
 
        bch2_fs_journal_stop(&c->journal);
 
+       /* XXX: mark super that alloc info is persistent */
+
        /*
         * the journal kicks off btree writes via reclaim - wait for in flight
         * writes after stopping journal:
@@ -420,6 +441,8 @@ static void bch2_fs_free(struct bch_fs *c)
        kfree(c->replicas_gc.entries);
        kfree(rcu_dereference_protected(c->disk_groups, 1));
 
+       if (c->journal_reclaim_wq)
+               destroy_workqueue(c->journal_reclaim_wq);
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
        if (c->wq)
@@ -638,6 +661,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcache_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+           !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
            mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
                                      sizeof(struct btree_reserve)) ||
@@ -1297,8 +1322,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
        if (data) {
                char data_has_str[100];
 
-               bch2_string_opt_to_text(&PBUF(data_has_str),
-                                       bch2_data_types, data);
+               bch2_flags_to_text(&PBUF(data_has_str),
+                                  bch2_data_types, data);
                bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
                ret = -EBUSY;
                goto err;
index 2e6e9bd587ee4cf5a7408be0e237c2e25debdc59..40384e7e5af8314ffa3cbb021ffa2555214214e2 100644 (file)
@@ -234,17 +234,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
        struct printbuf out = _PBUF(buf, PAGE_SIZE);
        struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
-       unsigned replicas;
+       unsigned i;
 
        if (!fs_usage)
                return -ENOMEM;
 
        pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
 
-       for (replicas = 0;
-            replicas < ARRAY_SIZE(fs_usage->persistent_reserved);
-            replicas++) {
-               pr_buf(&out, "%u replicas:\n", replicas + 1);
+       for (i = 0;
+            i < ARRAY_SIZE(fs_usage->persistent_reserved);
+            i++) {
+               pr_buf(&out, "%u replicas:\n", i + 1);
 #if 0
                for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
                        pr_buf(&out, "\t%s:\t\t%llu\n",
@@ -254,12 +254,23 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
                       stats.replicas[replicas].ec_data);
 #endif
                pr_buf(&out, "\treserved:\t%llu\n",
-                      fs_usage->persistent_reserved[replicas]);
+                      fs_usage->persistent_reserved[i]);
        }
 
        pr_buf(&out, "online reserved:\t%llu\n",
               fs_usage->s.online_reserved);
 
+       for (i = 0; i < c->replicas.nr; i++) {
+               struct bch_replicas_entry *e =
+                       cpu_replicas_entry(&c->replicas, i);
+
+               pr_buf(&out, "\t");
+               bch2_replicas_entry_to_text(&out, e);
+               pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+       }
+
+       percpu_up_read_preempt_enable(&c->mark_lock);
+
        kfree(fs_usage);
 
        return out.pos - buf;
@@ -797,6 +808,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 {
        struct bch_fs *c = ca->fs;
        struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+       unsigned i, nr[BCH_DATA_NR];
+
+       memset(nr, 0, sizeof(nr));
+
+       for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+               nr[c->open_buckets[i].type]++;
 
        return scnprintf(buf, PAGE_SIZE,
                "free_inc:               %zu/%zu\n"
@@ -823,7 +840,10 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                "    copygc threshold:   %llu\n"
                "freelist_wait:          %s\n"
                "open buckets:           %u/%u (reserved %u)\n"
-               "open_buckets_wait:      %s\n",
+               "open_buckets_wait:      %s\n"
+               "open_buckets_btree:     %u\n"
+               "open_buckets_user:      %u\n"
+               "btree reserve cache:    %u\n",
                fifo_used(&ca->free_inc),               ca->free_inc.size,
                fifo_used(&ca->free[RESERVE_BTREE]),    ca->free[RESERVE_BTREE].size,
                fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
@@ -845,8 +865,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                stats.sectors_fragmented,
                ca->copygc_threshold,
                c->freelist_wait.list.first             ? "waiting" : "empty",
-               c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
-               c->open_buckets_wait.list.first         ? "waiting" : "empty");
+               c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+               BTREE_NODE_OPEN_BUCKET_RESERVE,
+               c->open_buckets_wait.list.first         ? "waiting" : "empty",
+               nr[BCH_DATA_BTREE],
+               nr[BCH_DATA_USER],
+               c->btree_reserve_cache_nr);
 }
 
 static const char * const bch2_rw[] = {
index 5c060e77fe0fef0cdb24f8381e8a12fe19dbcc49..fea80e248667c5efc3786aeb1676c89ad7cca218 100644 (file)
@@ -133,6 +133,7 @@ void bch2_flags_to_text(struct printbuf *out,
                        const char * const list[], u64 flags)
 {
        unsigned bit, nr = 0;
+       bool first = true;
 
        if (out->pos != out->end)
                *out->pos = '\0';
@@ -141,7 +142,10 @@ void bch2_flags_to_text(struct printbuf *out,
                nr++;
 
        while (flags && (bit = __ffs(flags)) < nr) {
-               pr_buf(out, "%s,", list[bit]);
+               pr_buf(out, "%s", list[bit]);
+               if (!first)
+                       pr_buf(out, ",");
+               first = false;
                flags ^= 1 << bit;
        }
 }
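With the separator emitted before every item after the first, flags 0x5 against the list { "a", "b", "c" } print as "a,c" with no leading or trailing comma. A standalone rendition of the loop, with POSIX ffs() standing in for the kernel's __ffs():

        #include <stdio.h>
        #include <strings.h>    /* ffs() */

        int main(void)
        {
                static const char * const list[] = { "a", "b", "c" };
                unsigned long flags = 0x5;      /* bits 0 and 2 set */
                int first = 1;

                while (flags) {
                        int bit = ffs(flags) - 1;

                        if (!first)
                                printf(",");
                        first = 0;
                        printf("%s", list[bit]);
                        flags ^= 1UL << bit;
                }
                printf("\n");   /* prints "a,c" */
                return 0;
        }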
@@ -894,3 +898,28 @@ void eytzinger0_find_test(void)
        kfree(test_array);
 }
 #endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when the
+ * caller has excluded all concurrent access to the counters
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+       u64 *ret;
+       int cpu;
+
+       preempt_disable();
+       ret = this_cpu_ptr(p);
+       preempt_enable();
+
+       for_each_possible_cpu(cpu) {
+               u64 *i = per_cpu_ptr(p, cpu);
+
+               if (i != ret) {
+                       acc_u64s(ret, i, nr);
+                       memset(i, 0, nr * sizeof(u64));
+               }
+       }
+
+       return ret;
+}
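bch2_acc_percpu_u64s() folds every CPU's copy into the local CPU's and zeroes the rest, which is what lets __replicas_table_update() above read the result as a plain array. A usage sketch (the counter array and its length are hypothetical; all writers must be excluded for the duration, per the comment):

        u64 __percpu *counters;         /* nr u64s per CPU, hypothetical */
        u64 *sum = bch2_acc_percpu_u64s(counters, nr);

        /* sum[i] now holds the machine-wide total for counter i;
         * every other CPU's copy has been reset to zero */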
index 25d6750915e30661d7c588b29491e1fda99a54e3..fbfb2085801c94310c4e9767b776eb96904cbe35 100644 (file)
@@ -715,4 +715,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
                acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
 }
 
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
 #endif /* _BCACHEFS_UTIL_H */
index 5c4a275ea3f57fcd295b9b0fab306780efc28d91..4f43d0bb4e9a8a668ed6f718336da69db4998bee 100644 (file)
@@ -1,4 +1,5 @@
 
+#include <linux/atomic.h>
 #include <linux/export.h>
 #include <linux/generic-radix-tree.h>
 #include <linux/gfp.h>
@@ -16,7 +17,7 @@ struct genradix_node {
        };
 };
 
-static inline unsigned genradix_depth_shift(unsigned depth)
+static inline int genradix_depth_shift(unsigned depth)
 {
        return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
 }
@@ -29,16 +30,34 @@ static inline size_t genradix_depth_size(unsigned depth)
        return 1UL << genradix_depth_shift(depth);
 }
 
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH     \
+       DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK                            \
+       ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+       return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+       return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
 /*
  * Returns pointer to the specified byte @offset within @radix, or NULL if not
  * allocated
  */
 void *__genradix_ptr(struct __genradix *radix, size_t offset)
 {
-       size_t level = radix->depth;
-       struct genradix_node *n = radix->root;
+       struct genradix_root *r = READ_ONCE(radix->root);
+       struct genradix_node *n = genradix_root_to_node(r);
+       unsigned level          = genradix_root_to_depth(r);
 
-       if (offset >= genradix_depth_size(radix->depth))
+       if (ilog2(offset) >= genradix_depth_shift(level))
                return NULL;
 
        while (1) {
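The root is now a tagged pointer: genradix nodes come from __get_free_page() and are page-aligned, so the low bits are free to carry the tree depth, and a single READ_ONCE() hands readers a consistent (node, depth) pair. The matching encode step, written out as a sketch (the helper name is invented; the allocation path below open-codes it):

        static inline struct genradix_root *
        genradix_root_pack(struct genradix_node *n, unsigned depth)
        {
                BUILD_BUG_ON(GENRADIX_MAX_DEPTH > GENRADIX_DEPTH_MASK);

                return (void *) ((unsigned long) n | depth);
        }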
@@ -64,43 +83,60 @@ EXPORT_SYMBOL(__genradix_ptr);
 void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
                           gfp_t gfp_mask)
 {
-       struct genradix_node **n;
-       size_t level;
+       struct genradix_root *v = READ_ONCE(radix->root);
+       struct genradix_node *n, *new_node = NULL;
+       unsigned level;
 
        /* Increase tree depth if necessary: */
+       while (1) {
+               struct genradix_root *r = v, *new_root;
 
-       while (offset >= genradix_depth_size(radix->depth)) {
-               struct genradix_node *new_root =
-                       (void *) __get_free_page(gfp_mask|__GFP_ZERO);
-
-               if (!new_root)
-                       return NULL;
-
-               new_root->children[0] = radix->root;
-               radix->root = new_root;
-               radix->depth++;
-       }
+               n       = genradix_root_to_node(r);
+               level   = genradix_root_to_depth(r);
 
-       n = &radix->root;
-       level = radix->depth;
+               if (n && ilog2(offset) < genradix_depth_shift(level))
+                       break;
 
-       while (1) {
-               if (!*n) {
-                       *n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
-                       if (!*n)
+               if (!new_node) {
+                       new_node = (void *)
+                               __get_free_page(gfp_mask|__GFP_ZERO);
+                       if (!new_node)
                                return NULL;
                }
 
-               if (!level)
-                       break;
+               new_node->children[0] = n;
+               new_root = ((struct genradix_root *)
+                           ((unsigned long) new_node | (n ? level + 1 : 0)));
 
-               level--;
+               if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
+                       v = new_root;
+                       new_node = NULL;
+               }
+       }
 
-               n = &(*n)->children[offset >> genradix_depth_shift(level)];
+       while (level--) {
+               struct genradix_node **p =
+                       &n->children[offset >> genradix_depth_shift(level)];
                offset &= genradix_depth_size(level) - 1;
+
+               n = READ_ONCE(*p);
+               if (!n) {
+                       if (!new_node) {
+                               new_node = (void *)
+                                       __get_free_page(gfp_mask|__GFP_ZERO);
+                               if (!new_node)
+                                       return NULL;
+                       }
+
+                       if (!(n = cmpxchg_release(p, NULL, new_node)))
+                               swap(n, new_node);
+               }
        }
 
-       return &(*n)->data[offset];
+       if (new_node)
+               free_page((unsigned long) new_node);
+
+       return &n->data[offset];
 }
 EXPORT_SYMBOL(__genradix_ptr_alloc);
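Child pointers are installed locklessly with the same pattern throughout: allocate a zeroed page off to the side, publish it with cmpxchg_release() (the release ordering guarantees the zeroed contents are visible before the pointer), and hand the page back if another CPU won the race. Reduced to its essentials (the code above additionally keeps a lost page cached for the next level rather than freeing it immediately):

        n = READ_ONCE(*p);
        if (!n) {
                struct genradix_node *new =
                        (void *) __get_free_page(gfp_mask|__GFP_ZERO);

                if (!new)
                        return NULL;

                n = cmpxchg_release(p, NULL, new);
                if (!n)
                        n = new;                        /* we published it */
                else
                        free_page((unsigned long) new); /* raced; use the winner's */
        }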
 
@@ -108,17 +144,19 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
                           struct __genradix *radix,
                           size_t objs_per_page)
 {
+       struct genradix_root *r;
        struct genradix_node *n;
-       size_t level, i;
-
-       if (!radix->root)
-               return NULL;
+       unsigned level, i;
 restart:
-       if (iter->offset >= genradix_depth_size(radix->depth))
+       r = READ_ONCE(radix->root);
+       if (!r)
                return NULL;
 
-       n       = radix->root;
-       level   = radix->depth;
+       n       = genradix_root_to_node(r);
+       level   = genradix_root_to_depth(r);
+
+       if (ilog2(iter->offset) >= genradix_depth_shift(level))
+               return NULL;
 
        while (level) {
                level--;
@@ -157,11 +195,24 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
        free_page((unsigned long) n);
 }
 
+int __genradix_prealloc(struct __genradix *radix, size_t size,
+                       gfp_t gfp_mask)
+{
+       size_t offset;
+
+       for (offset = 0; offset < size; offset += PAGE_SIZE)
+               if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+                       return -ENOMEM;
+
+       return 0;
+}
+EXPORT_SYMBOL(__genradix_prealloc);
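__genradix_prealloc() touches the range a page at a time so every later lookup within it is guaranteed to find an allocated node. A usage sketch, assuming a genradix_prealloc(radix, nr, gfp) wrapper in the header along the lines of the other genradix_* macros (struct foo is hypothetical):

        static GENRADIX(struct foo) foo_genradix;

        /* make entries 0..1023 safe to access from contexts that cannot allocate: */
        if (genradix_prealloc(&foo_genradix, 1024, GFP_KERNEL))
                return -ENOMEM;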
+
 void __genradix_free(struct __genradix *radix)
 {
-       genradix_free_recurse(radix->root, radix->depth);
+       struct genradix_root *r = xchg(&radix->root, NULL);
 
-       radix->root = NULL;
-       radix->depth = 0;
+       genradix_free_recurse(genradix_root_to_node(r),
+                             genradix_root_to_depth(r));
 }
 EXPORT_SYMBOL(__genradix_free);