git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 14e9ac5016 bcachefs: btree_iter fastpath
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 13 May 2017 02:45:15 +0000 (18:45 -0800)
committer Kent Overstreet <kent.overstreet@gmail.com>
Sat, 13 May 2017 07:14:24 +0000 (23:14 -0800)
39 files changed:
.bcachefs_revision
cmd_migrate.c
include/linux/compiler.h
include/linux/mempool.h
include/linux/slab.h
include/linux/vmalloc.h
libbcachefs/alloc.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_types.h
libbcachefs/btree_update.c
libbcachefs/btree_update.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/util.c
libbcachefs/util.h
linux/sched.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8bc4e35a485017cc2e85dadffdc4dd3dd7f43eb6..d2d0c51968b99816412bea41419ed2eed8bdb7a6 100644 (file)
@@ -1 +1 @@
-43e3159567958ea70c8a95d98fdb6e881153a656
+14e9ac5016803fc63c1216608c866bef16b4053e
diff --git a/cmd_migrate.c b/cmd_migrate.c
index bf8f0bea7a2520cda9252512fd15e48a4c5fbf62..82fa0f125be3400a965475db7587a301a93a189a 100644 (file)
@@ -250,7 +250,6 @@ static void write_data(struct bch_fs *c,
 {
        struct disk_reservation res;
        struct bch_write_op op;
-       struct bch_write_bio bio;
        struct bio_vec bv;
        struct closure cl;
 
@@ -259,15 +258,15 @@ static void write_data(struct bch_fs *c,
 
        closure_init_stack(&cl);
 
-       bio_init(&bio.bio, &bv, 1);
-       bio.bio.bi_iter.bi_size = len;
-       bch2_bio_map(&bio.bio, buf);
+       bio_init(&op.wbio.bio, &bv, 1);
+       op.wbio.bio.bi_iter.bi_size = len;
+       bch2_bio_map(&op.wbio.bio, buf);
 
        int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
        if (ret)
                die("error reserving space in new filesystem: %s", strerror(-ret));
 
-       bch2_write_op_init(&op, c, &bio, res, c->write_points,
+       bch2_write_op_init(&op, c, res, c->write_points,
                           POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
        closure_call(&op.cl, bch2_write, NULL, &cl);
        closure_sync(&cl);
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e5c31a6ce26fae866984f9a90158fdfed47d866d..915a6f88643fc6571656eff48d681747ca31514a 100644 (file)
@@ -166,4 +166,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int size)
 #define flush_cache_vmap(start, end)           do { } while (0)
 #define flush_cache_vunmap(start, end)         do { } while (0)
 
+#ifdef __x86_64
+#define CONFIG_X86_64  y
+#endif
+
 #endif /* _TOOLS_LINUX_COMPILER_H */
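
Defining CONFIG_X86_64 from the compiler's builtin __x86_64 macro lets kernel code gated on that Kconfig symbol (such as the compiled bkey unpack fast path touched in bkey.c below) build unchanged in userspace. A minimal standalone sketch of the pattern, illustrative rather than from this tree:

    #include <stdio.h>

    /* mirrors the gate added to include/linux/compiler.h above */
    #ifdef __x86_64
    #define CONFIG_X86_64 y
    #endif

    int main(void)
    {
    #ifdef CONFIG_X86_64
            printf("x86-64: specialized (compiled) bkey unpack available\n");
    #else
            printf("other arch: generic bkey unpack only\n");
    #endif
            return 0;
    }
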
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index ddf6f94130afa11bf40819986d04f38ba11e4af4..37d81492edf6de0e3d173a4fb9b4c06572ade791 100644 (file)
 
 struct kmem_cache;
 
+typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
+typedef void (mempool_free_t)(void *element, void *pool_data);
+
 typedef struct mempool_s {
-       size_t          elem_size;
+       size_t                  elem_size;
+       void                    *pool_data;
+       mempool_alloc_t         *alloc;
+       mempool_free_t          *free;
 } mempool_t;
 
 static inline bool mempool_initialized(mempool_t *pool)
@@ -60,24 +66,22 @@ static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t
        return 0;
 }
 
-static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
-{
-       mempool_t *pool = malloc(sizeof(*pool));
-       pool->elem_size = size;
-       return pool;
-}
-
 static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
 {
        pool->elem_size = PAGE_SIZE << order;
        return 0;
 }
 
-static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
+static inline int mempool_init(mempool_t *pool, int min_nr,
+                              mempool_alloc_t *alloc_fn,
+                              mempool_free_t *free_fn,
+                              void *pool_data)
 {
-       mempool_t *pool = malloc(sizeof(*pool));
-       pool->elem_size = PAGE_SIZE << order;
-       return pool;
+       pool->elem_size = (size_t) pool_data;
+       pool->pool_data = pool_data;
+       pool->alloc     = alloc_fn;
+       pool->free      = free_fn;
+       return 0;
 }
 
 #endif /* _LINUX_MEMPOOL_H */
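
The shim now carries the kernel mempool's callback form: mempool_init() records an element allocator, a destructor and opaque pool_data, and (a shim-specific twist) reuses pool_data as elem_size, which fits a caller that passes its buffer size as the pool data, as the btree bounce pool later in this diff appears to do now that mempool_alloc()/mempool_free() traffic in plain buffers. A hedged initialization sketch against this shim; the demo_* names are hypothetical:

    #include <stdlib.h>

    /* hypothetical allocator/destructor matching mempool_alloc_t /
     * mempool_free_t; pool_data carries the element size */
    static void *demo_alloc(gfp_t gfp_mask, void *pool_data)
    {
            return malloc((size_t) pool_data);
    }

    static void demo_free(void *element, void *pool_data)
    {
            free(element);
    }

    static mempool_t demo_pool;

    static int demo_pool_init(void)
    {
            return mempool_init(&demo_pool, 1, demo_alloc, demo_free,
                                (void *) (size_t) 4096);
    }
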
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 58fb73edc3ee800a2ae04af3a3bca661dba924b8..d0d8790db080d1c7e1e3c5be063ec5bcbdc91c64 100644 (file)
@@ -43,9 +43,6 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
 #define kcalloc(n, size, flags)                calloc(n, size)
 #define kmalloc_array(n, size, flags)  calloc(n, size)
 
-#define vmalloc(size)                  malloc(size)
-#define vzalloc(size)                  calloc(1, size)
-
 #define kfree(p)                       free(p)
 #define kvfree(p)                      free(p)
 #define kzfree(p)                      free(p)
@@ -89,8 +86,6 @@ do {                                                  \
 #define VM_NO_GUARD            0x00000040      /* don't add guard page */
 #define VM_KASAN               0x00000080      /* has allocated kasan shadow memory */
 
-#define PAGE_KERNEL            0
-
 static inline void vunmap(const void *addr) {}
 
 static inline void *vmap(struct page **pages, unsigned int count,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index eb6284d70e303292d9bc99286a2729518d5d591e..debdcedbea6d05fd520f3af82e37911bf1ec668c 100644 (file)
@@ -1,8 +1,41 @@
 #ifndef __TOOLS_LINUX_VMALLOC_H
 #define __TOOLS_LINUX_VMALLOC_H
 
-#define vmalloc(size)          malloc(size)
-#define __vmalloc(size, flags, prot)   malloc(size)
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tools-util.h"
+
+#define PAGE_KERNEL            0
+#define PAGE_KERNEL_EXEC       1
+
 #define vfree(p)               free(p)
 
+static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask, unsigned prot)
+{
+       void *p = aligned_alloc(PAGE_SIZE, size);
+
+       if (p && prot == PAGE_KERNEL_EXEC) {
+               if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+                       vfree(p);
+                       p = NULL;
+               }
+       }
+
+       if (p && (gfp_mask & __GFP_ZERO))
+               memset(p, 0, size);
+
+       return p;
+}
+
+static inline void *vmalloc(unsigned long size)
+{
+       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+}
+
+static inline void *vzalloc(unsigned long size)
+{
+       return __vmalloc(size, GFP_KERNEL|__GFP_ZERO, PAGE_KERNEL);
+}
+
 #endif /* __TOOLS_LINUX_VMALLOC_H */
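
In userspace, __vmalloc() becomes aligned_alloc() plus, for PAGE_KERNEL_EXEC, an mprotect() to PROT_READ|PROT_WRITE|PROT_EXEC, which is what dynamically compiled bkey unpack functions need for their generated machine code. A hedged sketch of the executable case, assuming this shim header is in scope (PAGE_SIZE, GFP_KERNEL and ENOMEM coming from the tree's other userspace headers):

    #include <string.h>

    /* sketch: carve out one executable page via the shim above */
    static int demo_exec_page(void)
    {
            void *code = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC);

            if (!code)
                    return -ENOMEM;

            memset(code, 0xc3, 1);          /* x86 'ret': a do-nothing stub */
            ((void (*)(void)) code)();      /* call the generated stub */

            vfree(code);
            return 0;
    }
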
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index a12c5d3e3a02c647dc7e9cb3d73509b35826ab45..36dc947c06091a2783c21d47059f05aa18b575a4 100644 (file)
@@ -361,7 +361,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
                                  struct bucket *g, struct btree_iter *iter,
                                  u64 *journal_seq)
 {
-       struct bucket_mark m = READ_ONCE(g->mark);
+       struct bucket_mark m;
        __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
        struct bkey_i_alloc *a;
        u8 *d;
@@ -374,6 +374,8 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
                if (ret)
                        break;
 
+               /* read mark under btree node lock: */
+               m = READ_ONCE(g->mark);
                a = bkey_alloc_init(&alloc_key.k);
                a->k.p          = iter->pos;
                a->v.fields     = 0;
@@ -407,8 +409,6 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
        struct btree_iter iter;
        int ret;
 
-       lockdep_assert_held(&c->state_lock);
-
        if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
                return 0;
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 9d04e894175b3e73c2d847bf1410f8f730f84236..4d0fc62c7e01f818d91c412744a675b9ca19fcde 100644 (file)
@@ -725,6 +725,10 @@ struct bch_fs {
        struct work_struct      read_retry_work;
        spinlock_t              read_retry_lock;
 
+       struct bio_list         btree_write_error_list;
+       struct work_struct      btree_write_error_work;
+       spinlock_t              btree_write_error_lock;
+
        /* ERRORS */
        struct list_head        fsck_errors;
        struct mutex            fsck_error_lock;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 3f6d51acb3b6fea23f07400fec5df5fcf35fb8fb..125b6fabb38b6f1dc8c49877489df4fab64d29ed 100644 (file)
@@ -1082,7 +1082,8 @@ struct jset_entry {
        __le16                  u64s;
        __u8                    btree_id;
        __u8                    level;
-       __le32                  flags; /* designates what this jset holds */
+       __u8                    type; /* designates what this jset holds */
+       __u8                    pad[3];
 
        union {
                struct bkey_i   start[0];
@@ -1092,7 +1093,6 @@ struct jset_entry {
 
 #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
 
-LE32_BITMASK(JOURNAL_ENTRY_TYPE,       struct jset_entry, flags, 0, 8);
 enum {
        JOURNAL_ENTRY_BTREE_KEYS                = 0,
        JOURNAL_ENTRY_BTREE_ROOT                = 1,
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index b9ceb6ead6aaf7cd6b06fee09d19ce56f28444aa..cc76257ebb79a3d3b4d80c6cf79ca13386e1cb33 100644 (file)
@@ -791,11 +791,9 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
                              unsigned dst_offset, unsigned dst_size,
                              bool *eax_zeroed)
 {
-       unsigned byte = format->key_u64s * sizeof(u64);
        unsigned bits = format->bits_per_field[field];
        u64 offset = format->field_offset[field];
-       unsigned i, bit_offset = 0;
-       unsigned shl, shr;
+       unsigned i, byte, bit_offset, align, shl, shr;
 
        if (!bits && !offset) {
                if (!*eax_zeroed) {
@@ -842,11 +840,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
                return out;
        }
 
+       bit_offset = format->key_u64s * 64;
        for (i = 0; i <= field; i++)
-               bit_offset += format->bits_per_field[i];
+               bit_offset -= format->bits_per_field[i];
 
-       byte -= DIV_ROUND_UP(bit_offset, 8);
-       bit_offset = round_up(bit_offset, 8) - bit_offset;
+       byte = bit_offset / 8;
+       bit_offset -= byte * 8;
 
        *eax_zeroed = false;
 
@@ -857,6 +856,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
                /* movzx eax, WORD PTR [rsi + imm8] */
                I4(0x0f, 0xb7, 0x46, byte);
        } else if (bit_offset + bits <= 32) {
+               align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+               byte -= align;
+               bit_offset += align * 8;
+
+               BUG_ON(bit_offset + bits > 32);
+
                /* mov eax, [rsi + imm8] */
                I3(0x8b, 0x46, byte);
 
@@ -874,6 +879,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
                        out += 4;
                }
        } else if (bit_offset + bits <= 64) {
+               align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+               byte -= align;
+               bit_offset += align * 8;
+
+               BUG_ON(bit_offset + bits > 64);
+
                /* mov rax, [rsi + imm8] */
                I4(0x48, 0x8b, 0x46, byte);
 
@@ -890,6 +901,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
                        I4(0x48, 0xc1, 0xe8, shr);
                }
        } else {
+               align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+               byte -= align;
+               bit_offset += align * 8;
+
+               BUG_ON(bit_offset + bits > 96);
+
                /* mov rax, [rsi + byte] */
                I4(0x48, 0x8b, 0x46, byte);
 
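
The compiled unpack now locates each field from the high end of the key (fields are packed starting at the most significant bits), derives the byte address and residual bit offset directly, and then in each size class slides the load address down by up to align bytes so that the 16/32/64-bit read stays inside the key while still covering the whole field. A plain-C sketch of the position math; the helper name is hypothetical:

    /* hypothetical helper mirroring the offset computation above */
    static unsigned field_byte_offset(const struct bkey_format *f,
                                      unsigned field, unsigned *bit_offset)
    {
            unsigned bits = f->key_u64s * 64, i;

            /* fields are laid out high-to-low within the packed key */
            for (i = 0; i <= field; i++)
                    bits -= f->bits_per_field[i];

            *bit_offset = bits & 7;         /* residual bits below the byte */
            return bits >> 3;               /* byte offset for the load */
    }
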
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index bdbe21accfc0e7886831c1754331312b4cede1a9..d619f375689b0a83c05c87c1e882e7c71db1fe51 100644 (file)
@@ -41,7 +41,7 @@ static void __mca_data_free(struct bch_fs *c, struct btree *b)
 {
        EBUG_ON(btree_node_write_in_flight(b));
 
-       free_pages((unsigned long) b->data, btree_page_order(c));
+       kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
        bch2_btree_keys_free(b);
 }
@@ -53,8 +53,6 @@ static void mca_data_free(struct bch_fs *c, struct btree *b)
        list_move(&b->list, &c->btree_cache_freed);
 }
 
-#define PTR_HASH(_k)   (bkey_i_to_extent_c(_k)->v._data[0])
-
 static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, key.v),
@@ -63,20 +61,18 @@ static const struct rhashtable_params bch_btree_cache_params = {
 
 static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
-       unsigned order = ilog2(btree_pages(c));
-
-       b->data = (void *) __get_free_pages(gfp, order);
+       b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                goto err;
 
-       if (bch2_btree_keys_alloc(b, order, gfp))
+       if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
                goto err;
 
        c->btree_cache_used++;
        list_move(&b->list, &c->btree_cache_freeable);
        return;
 err:
-       free_pages((unsigned long) b->data, order);
+       kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
        list_move(&b->list, &c->btree_cache_freed);
 }
@@ -91,7 +87,6 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
        six_lock_init(&b->lock);
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
-       INIT_LIST_HEAD(&b->reachable);
 
        mca_data_alloc(c, b, gfp);
        return b->data ? b : NULL;
@@ -101,10 +96,6 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
 
 void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
 {
-       BUG_ON(btree_node_dirty(b));
-
-       b->nsets = 0;
-
        rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
                               bch_btree_cache_params);
 
@@ -112,23 +103,27 @@ void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
        bkey_i_to_extent(&b->key)->v._data[0] = 0;
 }
 
+int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b)
+{
+       return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
+                                            bch_btree_cache_params);
+}
+
 int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b,
                    unsigned level, enum btree_id id)
 {
        int ret;
+
        b->level        = level;
        b->btree_id     = id;
 
-       ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
-                                           bch_btree_cache_params);
-       if (ret)
-               return ret;
-
        mutex_lock(&c->btree_cache_lock);
-       list_add(&b->list, &c->btree_cache);
+       ret = __bch2_btree_node_hash_insert(c, b);
+       if (!ret)
+               list_add(&b->list, &c->btree_cache);
        mutex_unlock(&c->btree_cache_lock);
 
-       return 0;
+       return ret;
 }
 
 __flatten
@@ -155,8 +150,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
        if (!six_trylock_write(&b->lock))
                goto out_unlock_intent;
 
-       if (btree_node_write_error(b) ||
-           btree_node_noevict(b))
+       if (btree_node_noevict(b))
                goto out_unlock;
 
        if (!btree_node_may_write(b))
@@ -328,7 +322,7 @@ void bch2_fs_btree_exit(struct bch_fs *c)
        if (c->verify_data)
                list_move(&c->verify_data->list, &c->btree_cache);
 
-       free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c)));
+       kvpfree(c->verify_ondisk, btree_bytes(c));
 #endif
 
        for (i = 0; i < BTREE_ID_NR; i++)
@@ -384,8 +378,7 @@ int bch2_fs_btree_init(struct bch_fs *c)
 #ifdef CONFIG_BCACHEFS_DEBUG
        mutex_init(&c->verify_lock);
 
-       c->verify_ondisk = (void *)
-               __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c)));
+       c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
        if (!c->verify_ondisk)
                return -ENOMEM;
 
@@ -510,7 +503,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
        BUG_ON(!six_trylock_intent(&b->lock));
        BUG_ON(!six_trylock_write(&b->lock));
 out_unlock:
-       BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key));
+       BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_write_in_flight(b));
 
        list_del_init(&b->list);
@@ -554,6 +547,12 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
        struct bch_fs *c = iter->c;
        struct btree *b;
 
+       /*
+        * Parent node must be locked, else we could read in a btree node that's
+        * been freed:
+        */
+       BUG_ON(!btree_node_locked(iter, level + 1));
+
        b = bch2_btree_node_mem_alloc(c);
        if (IS_ERR(b))
                return b;
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index ca8e3195203e12cef752b63a692db5e0a8bc15c1..ea53d2b20e09f14b0d91c1bb2ef09dc41af68b04 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "bcachefs.h"
 #include "btree_types.h"
+#include "extents.h"
 
 struct btree_iter;
 
@@ -11,6 +12,7 @@ extern const char * const bch2_btree_ids[];
 void bch2_recalc_btree_reserve(struct bch_fs *);
 
 void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *);
+int __bch2_btree_node_hash_insert(struct bch_fs *, struct btree *);
 int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *,
                                unsigned, enum btree_id);
 
@@ -28,6 +30,14 @@ void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *,
 void bch2_fs_btree_exit(struct bch_fs *);
 int bch2_fs_btree_init(struct bch_fs *);
 
+#define PTR_HASH(_k)   (bkey_i_to_extent_c(_k)->v._data[0])
+
+/* is btree node in hash table? */
+static inline bool btree_node_hashed(struct btree *b)
+{
+       return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
+}
+
 #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)               \
        for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl,  \
                                          &(_c)->btree_cache_table),    \
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 815260bc25809de0e1bcf6c2e55958df77d2135d..376edaf3d461226b4dd8100ed06fcb8f807e65f5 100644 (file)
@@ -621,12 +621,10 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
                bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
 
        /* Repack everything with @new_format and sort down to one bset */
-       for (i = 0; i < nr_old_nodes; i++) {
+       for (i = 0; i < nr_old_nodes; i++)
                new_nodes[i] =
                        __bch2_btree_node_alloc_replacement(c, old_nodes[i],
-                                                           new_format, res);
-               list_add(&new_nodes[i]->reachable, &as->reachable_list);
-       }
+                                                           new_format, as, res);
 
        /*
         * Conceptually we concatenate the nodes together and slice them
@@ -663,7 +661,6 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 
                        set_btree_bset_end(n1, n1->set);
 
-                       list_del_init(&n2->reachable);
                        six_unlock_write(&n2->lock);
                        bch2_btree_node_free_never_inserted(c, n2);
                        six_unlock_intent(&n2->lock);
@@ -796,7 +793,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
        memset(merge, 0, sizeof(merge));
 
        __for_each_btree_node(&iter, c, btree_id, POS_MIN,
-                             U8_MAX, 0, BTREE_ITER_PREFETCH, b) {
+                             BTREE_MAX_DEPTH, 0,
+                             BTREE_ITER_PREFETCH, b) {
                memmove(merge + 1, merge,
                        sizeof(merge) - sizeof(merge[0]));
                memmove(lock_seq + 1, lock_seq,
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 571a8140369c58f509f6a9ad75a7e1ad36769772..eeb546efd2754f465862106ed4d83e7c1c5323e1 100644 (file)
@@ -56,9 +56,9 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order,
                              bool used_mempool, void *p)
 {
        if (used_mempool)
-               mempool_free(virt_to_page(p), &c->btree_bounce_pool);
+               mempool_free(p, &c->btree_bounce_pool);
        else
-               free_pages((unsigned long) p, order);
+               vpfree(p, PAGE_SIZE << order);
 }
 
 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
@@ -66,7 +66,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
 {
        void *p;
 
-       BUG_ON(1 << order > btree_pages(c));
+       BUG_ON(order > btree_page_order(c));
 
        *used_mempool = false;
        p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
@@ -74,7 +74,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
                return p;
 
        *used_mempool = true;
-       return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
+       return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
 }
 
 typedef int (*sort_cmp_fn)(struct btree *,
@@ -1183,7 +1183,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
                if (bne->keys.seq == b->data->keys.seq)
                        goto err;
 
-       sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
+       sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
        sorted->keys.u64s = 0;
 
        b->nr = btree_node_is_extents(b)
@@ -1199,7 +1199,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
 
        BUG_ON(b->nr.live_u64s != u64s);
 
-       btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
+       btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
 
        bch2_bset_build_aux_tree(b, b->set, false);
 
@@ -1344,50 +1344,100 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
        struct btree_write *w = btree_prev_write(b);
 
-       /*
-        * Before calling bch2_btree_complete_write() - if the write errored, we
-        * have to halt new journal writes before they see this btree node
-        * write as completed:
-        */
-       if (btree_node_write_error(b))
-               bch2_journal_halt(&c->journal);
-
        bch2_btree_complete_write(c, b, w);
        btree_node_io_unlock(b);
 }
 
-static void btree_node_write_endio(struct bio *bio)
+static void bch2_btree_node_write_error(struct bch_fs *c,
+                                       struct bch_write_bio *wbio)
 {
-       struct btree *b = bio->bi_private;
-       struct bch_write_bio *wbio = to_wbio(bio);
-       struct bch_fs *c        = wbio->c;
-       struct bio *orig        = wbio->split ? wbio->orig : NULL;
-       struct closure *cl      = !wbio->split ? wbio->cl : NULL;
-       struct bch_dev *ca      = wbio->ca;
+       struct btree *b         = wbio->bio.bi_private;
+       struct closure *cl      = wbio->cl;
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+       struct bkey_i_extent *new_key;
+
+       bkey_copy(&tmp.k, &b->key);
+       new_key = bkey_i_to_extent(&tmp.k);
+
+       while (wbio->replicas_failed) {
+               unsigned idx = __fls(wbio->replicas_failed);
+
+               bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
+               wbio->replicas_failed ^= 1 << idx;
+       }
+
+       if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
+           bch2_btree_node_update_key(c, b, new_key)) {
+               set_btree_node_noevict(b);
+               bch2_fatal_error(c);
+       }
+
+       bio_put(&wbio->bio);
+       btree_node_write_done(c, b);
+       if (cl)
+               closure_put(cl);
+}
+
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs,
+                                       btree_write_error_work);
+       struct bio *bio;
+
+       while (1) {
+               spin_lock_irq(&c->btree_write_error_lock);
+               bio = bio_list_pop(&c->btree_write_error_list);
+               spin_unlock_irq(&c->btree_write_error_lock);
 
-       if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
+               if (!bio)
+                       break;
+
+               bch2_btree_node_write_error(c, to_wbio(bio));
+       }
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+       struct btree *b                 = bio->bi_private;
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_write_bio *orig      = parent ?: wbio;
+       struct closure *cl              = !wbio->split ? wbio->cl : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = wbio->ca;
+
+       if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") ||
            bch2_meta_write_fault("btree"))
-               set_btree_node_write_error(b);
+               set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
 
        if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
 
-       if (wbio->bounce)
-               btree_bounce_free(c,
-                       wbio->order,
-                       wbio->used_mempool,
-                       page_address(bio->bi_io_vec[0].bv_page));
-
-       if (wbio->put_bio)
+       if (parent) {
                bio_put(bio);
+               bio_endio(&parent->bio);
+               return;
+       }
 
-       if (orig) {
-               bio_endio(orig);
-       } else {
-               btree_node_write_done(c, b);
-               if (cl)
-                       closure_put(cl);
+       btree_bounce_free(c,
+               wbio->order,
+               wbio->used_mempool,
+               wbio->data);
+
+       if (wbio->replicas_failed) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&c->btree_write_error_lock, flags);
+               bio_list_add(&c->btree_write_error_list, &wbio->bio);
+               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+               queue_work(c->wq, &c->btree_write_error_work);
+               return;
        }
+
+       bio_put(bio);
+       btree_node_write_done(c, b);
+       if (cl)
+               closure_put(cl);
 }
 
 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1411,7 +1461,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                            struct closure *parent,
                            enum six_lock_type lock_type_held)
 {
-       struct bio *bio;
        struct bch_write_bio *wbio;
        struct bset_tree *t;
        struct bset *i;
@@ -1458,7 +1507,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
        BUG_ON(!list_empty(&b->write_blocked));
-       BUG_ON(!list_empty_careful(&b->reachable) != !b->written);
+       BUG_ON((b->will_make_reachable != NULL) != !b->written);
 
        BUG_ON(b->written >= c->sb.btree_node_size);
        BUG_ON(bset_written(b, btree_bset_last(b)));
@@ -1601,23 +1650,20 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        trace_btree_write(b, bytes_to_write, sectors_to_write);
 
-       bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
-
-       wbio                    = to_wbio(bio);
+       wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
        wbio->cl                = parent;
-       wbio->bounce            = true;
-       wbio->put_bio           = true;
        wbio->order             = order;
        wbio->used_mempool      = used_mempool;
-       bio->bi_opf             = REQ_OP_WRITE|REQ_META|REQ_FUA;
-       bio->bi_iter.bi_size    = sectors_to_write << 9;
-       bio->bi_end_io          = btree_node_write_endio;
-       bio->bi_private         = b;
+       wbio->data              = data;
+       wbio->bio.bi_opf        = REQ_OP_WRITE|REQ_META|REQ_FUA;
+       wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
+       wbio->bio.bi_end_io     = btree_node_write_endio;
+       wbio->bio.bi_private    = b;
 
        if (parent)
                closure_get(parent);
 
-       bch2_bio_map(bio, data);
+       bch2_bio_map(&wbio->bio, data);
 
        /*
         * If we're appending to a leaf node, we don't technically need FUA -
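
In the new error path, btree_node_write_endio() records each failed replica as a bit (indexed by ptr_idx) in the parent bio's replicas_failed mask and defers to a workqueue, where bch2_btree_node_write_error() drops the failed pointers from the node's key and rewrites it via bch2_btree_node_update_key(). The walk goes from the highest set bit down because bch2_extent_drop_ptr_idx() removes by index, so lower indices stay valid as pointers are dropped. A standalone sketch of that bit walk, with fls_demo standing in for the kernel's __fls():

    #include <stdio.h>

    /* index of the highest set bit, like the kernel's __fls() */
    static unsigned fls_demo(unsigned v)
    {
            return 31 - __builtin_clz(v);
    }

    int main(void)
    {
            unsigned failed = 0x15;         /* replicas 0, 2 and 4 failed */

            /* drop highest-index-first so remaining indices stay valid */
            while (failed) {
                    unsigned idx = fls_demo(failed);

                    printf("dropping replica %u\n", idx);
                    failed ^= 1u << idx;
            }
            return 0;
    }
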
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 7333f3052c65f2db03b410ff36f54763cc78efdc..91263eeac48e1456c6ecd96cd9439522e33cc9d1 100644 (file)
@@ -37,7 +37,7 @@ static inline void btree_node_wait_on_io(struct btree *b)
 static inline bool btree_node_may_write(struct btree *b)
 {
        return list_empty_careful(&b->write_blocked) &&
-               list_empty_careful(&b->reachable);
+               !b->will_make_reachable;
 }
 
 enum compact_mode {
@@ -79,6 +79,7 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 
 void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
+void bch2_btree_write_error_work(struct work_struct *);
 
 void __bch2_btree_node_write(struct bch_fs *, struct btree *,
                            struct closure *, enum six_lock_type);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index ecad24fa4add315b0e76de4917ef6af39f07af07..46df99fef6aa6fe0fd181b7ade14c900c4c4a225 100644 (file)
@@ -252,6 +252,8 @@ static int __bch2_btree_iter_unlock(struct btree_iter *iter)
        while (iter->nodes_locked)
                btree_node_unlock(iter, __ffs(iter->nodes_locked));
 
+       iter->flags &= ~BTREE_ITER_UPTODATE;
+
        return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
 }
 
@@ -1006,16 +1008,30 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_
                iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
 
        iter->pos = new_pos;
+       iter->flags &= ~BTREE_ITER_UPTODATE;
 }
 
 void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
 {
        EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
        iter->pos = new_pos;
+       iter->flags &= ~BTREE_ITER_UPTODATE;
 }
 
 void bch2_btree_iter_advance_pos(struct btree_iter *iter)
 {
+       if (iter->flags & BTREE_ITER_UPTODATE &&
+           !(iter->flags & BTREE_ITER_WITH_HOLES)) {
+               struct bkey_s_c k;
+
+               __btree_iter_advance(iter);
+               k = __btree_iter_peek(iter);
+               if (likely(k.k)) {
+                       iter->pos = bkey_start_pos(k.k);
+                       return;
+               }
+       }
+
        /*
         * We use iter->k instead of iter->pos for extents: iter->pos will be
         * equal to the start of the extent we returned, but we need to advance
@@ -1032,6 +1048,7 @@ void bch2_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
        BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0);
 
        iter->pos = pos;
+       iter->flags &= ~BTREE_ITER_UPTODATE;
        __btree_iter_init(iter, iter->nodes[iter->level]);
 }
 
@@ -1043,6 +1060,17 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
                (iter->btree_id == BTREE_ID_EXTENTS));
 
+       if (iter->flags & BTREE_ITER_UPTODATE) {
+               struct btree *b = iter->nodes[0];
+               struct bkey_packed *k =
+                       __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b);
+
+               return (struct bkey_s_c) {
+                       .k = &iter->k,
+                       .v = bkeyp_val(&b->format, k)
+               };
+       }
+
        while (1) {
                ret = bch2_btree_iter_traverse(iter);
                if (unlikely(ret)) {
@@ -1058,7 +1086,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                         */
                        if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
                            bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
-                               bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+                               iter->pos = bkey_start_pos(k.k);
+
+                       iter->flags |= BTREE_ITER_UPTODATE;
                        return k;
                }
 
@@ -1083,6 +1113,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter)
        EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
                (iter->btree_id == BTREE_ID_EXTENTS));
 
+       iter->flags &= ~BTREE_ITER_UPTODATE;
+
        while (1) {
                ret = bch2_btree_iter_traverse(iter);
                if (unlikely(ret)) {
@@ -1131,12 +1163,15 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
                            unsigned locks_want, unsigned depth,
                            unsigned flags)
 {
+       EBUG_ON(depth >= BTREE_MAX_DEPTH);
+       EBUG_ON(locks_want > BTREE_MAX_DEPTH);
+
        iter->c                         = c;
        iter->pos                       = pos;
        iter->flags                     = flags;
        iter->btree_id                  = btree_id;
        iter->level                     = depth;
-       iter->locks_want                = min(locks_want, BTREE_MAX_DEPTH);
+       iter->locks_want                = locks_want;
        iter->nodes_locked              = 0;
        iter->nodes_intent_locked       = 0;
        memset(iter->nodes, 0, sizeof(iter->nodes));
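
This is the commit's headline fastpath: bch2_btree_iter_peek() now publishes BTREE_ITER_UPTODATE after a successful lookup, bch2_btree_iter_advance_pos() can then serve the next key straight from the cached node iterator instead of re-traversing, and anything that moves the position or drops node locks clears the flag again. A standalone analogue of the protocol over a plain array, just to show the state machine (not the tree's types):

    #include <stdbool.h>
    #include <stddef.h>

    struct demo_iter {
            const int       *elems;
            size_t          nr, pos;
            bool            uptodate;       /* plays BTREE_ITER_UPTODATE */
    };

    static const int *demo_peek(struct demo_iter *it)
    {
            if (!it->uptodate) {
                    /* slow path: revalidate the position; in the real
                     * code, re-lock and re-search the btree */
                    if (it->pos >= it->nr)
                            return NULL;
                    it->uptodate = true;
            }
            return &it->elems[it->pos];
    }

    static void demo_advance(struct demo_iter *it)
    {
            it->pos++;
            if (it->pos >= it->nr)
                    it->uptodate = false;   /* force the slow path next peek */
    }
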
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 57f38765f72eaba5a48a24fc00eace1739f98c2f..34e5035e3ded56ea9a1e39dc9d6beba21e5e95f1 100644 (file)
@@ -4,19 +4,20 @@
 #include "btree_types.h"
 
 
-#define BTREE_ITER_INTENT              (1 << 0)
+#define BTREE_ITER_UPTODATE            (1 << 0)
 #define BTREE_ITER_WITH_HOLES          (1 << 1)
-#define BTREE_ITER_PREFETCH            (1 << 2)
+#define BTREE_ITER_INTENT              (1 << 2)
+#define BTREE_ITER_PREFETCH            (1 << 3)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
-#define BTREE_ITER_IS_EXTENTS          (1 << 3)
+#define BTREE_ITER_IS_EXTENTS          (1 << 4)
 /*
  * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
  */
-#define BTREE_ITER_AT_END_OF_LEAF      (1 << 4)
-#define BTREE_ITER_ERROR               (1 << 5)
+#define BTREE_ITER_AT_END_OF_LEAF      (1 << 5)
+#define BTREE_ITER_ERROR               (1 << 6)
 
 /*
  * @pos                        - iterator's current position
@@ -223,17 +224,23 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
 #define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b)  \
        __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
 
+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+                                                    unsigned flags)
+{
+       return flags & BTREE_ITER_WITH_HOLES
+               ? bch2_btree_iter_peek_with_holes(iter)
+               : bch2_btree_iter_peek(iter);
+}
+
 #define for_each_btree_key(_iter, _c, _btree_id,  _start, _flags, _k)  \
-       for (bch2_btree_iter_init((_iter), (_c), (_btree_id),   \
-                                 (_start), (_flags));          \
-            !IS_ERR_OR_NULL(((_k) = (((_flags) & BTREE_ITER_WITH_HOLES)\
-                               ? bch2_btree_iter_peek_with_holes(_iter)\
-                               : bch2_btree_iter_peek(_iter))).k);     \
+       for (bch2_btree_iter_init((_iter), (_c), (_btree_id),           \
+                                 (_start), (_flags));                  \
+            !IS_ERR_OR_NULL(((_k) = __bch2_btree_iter_peek(_iter, _flags)).k);\
             bch2_btree_iter_advance_pos(_iter))
 
 static inline int btree_iter_err(struct bkey_s_c k)
 {
-       return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
+       return PTR_ERR_OR_ZERO(k.k);
 }
 
 /*
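
With the dispatch folded into __bch2_btree_iter_peek(), for_each_btree_key() picks the right peek variant from its flags, and btree_iter_err() reduces to PTR_ERR_OR_ZERO(). A sketch of a typical consumer of the macro, assuming a struct bch_fs *c in scope; the btree ID and flags are chosen for illustration:

    struct btree_iter iter;
    struct bkey_s_c k;
    int ret;

    for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
                       BTREE_ITER_PREFETCH, k) {
            /* inspect k.k / k.v here */
    }
    ret = btree_iter_err(k);        /* error from the final peek, or 0 */
    bch2_btree_iter_unlock(&iter);
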
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index c613a7bc8335f4e13a228c6450e6297b331636cd..7085feb8565efe3877aa74fc2e52e1719684a055 100644 (file)
@@ -116,7 +116,7 @@ struct btree {
         * another write - because that write also won't yet be reachable and
         * marking it as completed before it's reachable would be incorrect:
         */
-       struct list_head        reachable;
+       struct btree_interior_update *will_make_reachable;
 
        struct open_bucket      *ob;
 
@@ -143,7 +143,6 @@ static inline void clear_btree_node_ ## flag(struct btree *b)               \
 enum btree_flags {
        BTREE_NODE_read_in_flight,
        BTREE_NODE_read_error,
-       BTREE_NODE_write_error,
        BTREE_NODE_dirty,
        BTREE_NODE_need_write,
        BTREE_NODE_noevict,
@@ -155,7 +154,6 @@ enum btree_flags {
 
 BTREE_FLAG(read_in_flight);
 BTREE_FLAG(read_error);
-BTREE_FLAG(write_error);
 BTREE_FLAG(dirty);
 BTREE_FLAG(need_write);
 BTREE_FLAG(noevict);
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 9794ac3b3d38a3e8c59ebcce3cbd5d29af323552..c7b2018491f090237ab463d47d71401f116bf6fd 100644 (file)
 static void btree_interior_update_updated_root(struct bch_fs *,
                                               struct btree_interior_update *,
                                               enum btree_id);
+static void btree_interior_update_will_make_reachable(struct bch_fs *,
+                               struct btree_interior_update *,
+                               struct btree *);
+static void btree_interior_update_drop_new_node(struct bch_fs *,
+                                               struct btree *);
 
 /* Calculate ideal packed bkey format for new btree nodes: */
 
@@ -166,7 +171,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
        BUG_ON(b == btree_node_root(c, b));
        BUG_ON(b->ob);
        BUG_ON(!list_empty(&b->write_blocked));
-       BUG_ON(!list_empty(&b->reachable));
+       BUG_ON(b->will_make_reachable);
 
        clear_btree_node_noevict(b);
 
@@ -191,6 +196,8 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 {
        struct open_bucket *ob = b->ob;
 
+       btree_interior_update_drop_new_node(c, b);
+
        b->ob = NULL;
 
        clear_btree_node_dirty(b);
@@ -299,6 +306,7 @@ mem_alloc:
 
 static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
                                          unsigned level, enum btree_id id,
+                                         struct btree_interior_update *as,
                                          struct btree_reserve *reserve)
 {
        struct btree *b;
@@ -322,7 +330,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
 
        bch2_btree_build_aux_trees(b);
 
-       bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE);
+       btree_interior_update_will_make_reachable(c, as, b);
 
        trace_btree_node_alloc(c, b);
        return b;
@@ -331,11 +339,12 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
 struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c,
                                                  struct btree *b,
                                                  struct bkey_format format,
+                                                 struct btree_interior_update *as,
                                                  struct btree_reserve *reserve)
 {
        struct btree *n;
 
-       n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
+       n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve);
 
        n->data->min_key        = b->data->min_key;
        n->data->max_key        = b->data->max_key;
@@ -353,6 +362,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c,
 
 static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c,
                                                struct btree *b,
+                                               struct btree_interior_update *as,
                                                struct btree_reserve *reserve)
 {
        struct bkey_format new_f = bch2_btree_calc_format(b);
@@ -364,7 +374,7 @@ static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c,
        if (!bch2_btree_node_format_fits(c, b, &new_f))
                new_f = b->format;
 
-       return __bch2_btree_node_alloc_replacement(c, b, new_f, reserve);
+       return __bch2_btree_node_alloc_replacement(c, b, new_f, as, reserve);
 }
 
 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b,
@@ -478,9 +488,10 @@ static void bch2_btree_set_root(struct btree_iter *iter, struct btree *b,
 
 static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level,
                                        enum btree_id id,
+                                       struct btree_interior_update *as,
                                        struct btree_reserve *reserve)
 {
-       struct btree *b = bch2_btree_node_alloc(c, level, id, reserve);
+       struct btree *b = bch2_btree_node_alloc(c, level, id, as, reserve);
 
        b->data->min_key = POS_MIN;
        b->data->max_key = POS_MAX;
@@ -581,6 +592,11 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
                        goto err_free;
                }
 
+               ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+                                           BCH_DATA_BTREE);
+               if (ret)
+                       goto err_free;
+
                reserve->b[reserve->nr++] = b;
        }
 
@@ -608,11 +624,12 @@ struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
 int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
                         struct closure *writes)
 {
-       struct closure cl;
+       struct btree_interior_update as;
        struct btree_reserve *reserve;
+       struct closure cl;
        struct btree *b;
-       LIST_HEAD(reachable_list);
 
+       memset(&as, 0, sizeof(as));
        closure_init_stack(&cl);
 
        while (1) {
@@ -627,15 +644,14 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
                closure_sync(&cl);
        }
 
-       b = __btree_root_alloc(c, 0, id, reserve);
-       list_add(&b->reachable, &reachable_list);
+       b = __btree_root_alloc(c, 0, id, &as, reserve);
 
        bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
 
        bch2_btree_set_root_initial(c, b, reserve);
-       bch2_btree_open_bucket_put(c, b);
 
-       list_del_init(&b->reachable);
+       btree_interior_update_drop_new_node(c, b);
+       bch2_btree_open_bucket_put(c, b);
        six_unlock_intent(&b->lock);
 
        bch2_btree_reserve_put(c, reserve);
@@ -819,9 +835,12 @@ void bch2_btree_journal_key(struct btree_insert *trans,
                /* ick */
                insert->k.needs_whiteout = false;
                bch2_journal_add_keys(j, &trans->journal_res,
-                                    b->btree_id, insert);
+                                     b->btree_id, insert);
                insert->k.needs_whiteout = needs_whiteout;
 
+               bch2_journal_set_has_inode(j, &trans->journal_res,
+                                          insert->k.p.inode);
+
                if (trans->journal_seq)
                        *trans->journal_seq = seq;
                btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
@@ -891,7 +910,6 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
        as->c           = c;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
        INIT_LIST_HEAD(&as->write_blocked_list);
-       INIT_LIST_HEAD(&as->reachable_list);
 
        bch2_keylist_init(&as->parent_keys, as->inline_keys,
                         ARRAY_SIZE(as->inline_keys));
@@ -916,16 +934,16 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
        struct btree_interior_update *as =
                container_of(cl, struct btree_interior_update, cl);
        struct bch_fs *c = as->c;
-       unsigned i;
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
 
        mutex_lock(&c->btree_interior_update_lock);
 
-       while (!list_empty(&as->reachable_list)) {
-               struct btree *b = list_first_entry(&as->reachable_list,
-                                                  struct btree, reachable);
-               list_del_init(&b->reachable);
+       while (as->nr_new_nodes) {
+               struct btree *b = as->new_nodes[--as->nr_new_nodes];
+
+               BUG_ON(b->will_make_reachable != as);
+               b->will_make_reachable = NULL;
                mutex_unlock(&c->btree_interior_update_lock);
 
                six_lock_read(&b->lock);
@@ -934,9 +952,8 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
                mutex_lock(&c->btree_interior_update_lock);
        }
 
-       for (i = 0; i < as->nr_pending; i++)
-               bch2_btree_node_free_ondisk(c, &as->pending[i]);
-       as->nr_pending = 0;
+       while (as->nr_pending)
+               bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
 
        list_del(&as->list);
        mutex_unlock(&c->btree_interior_update_lock);
@@ -1185,6 +1202,68 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
                    system_freezable_wq);
 }
 
+static void btree_interior_update_will_make_reachable(struct bch_fs *c,
+                               struct btree_interior_update *as,
+                               struct btree *b)
+{
+       mutex_lock(&c->btree_interior_update_lock);
+       BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+       BUG_ON(b->will_make_reachable);
+
+       as->new_nodes[as->nr_new_nodes++] = b;
+       b->will_make_reachable = as;
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void __btree_interior_update_drop_new_node(struct btree *b)
+{
+       struct btree_interior_update *as = b->will_make_reachable;
+       unsigned i;
+
+       BUG_ON(!as);
+
+       for (i = 0; i < as->nr_new_nodes; i++)
+               if (as->new_nodes[i] == b)
+                       goto found;
+
+       BUG();
+found:
+       as->nr_new_nodes--;
+       memmove(&as->new_nodes[i],
+               &as->new_nodes[i + 1],
+               sizeof(struct btree *) * (as->nr_new_nodes - i));
+       b->will_make_reachable = NULL;
+}
+
+static void btree_interior_update_drop_new_node(struct bch_fs *c,
+                                               struct btree *b)
+{
+       mutex_lock(&c->btree_interior_update_lock);
+       __btree_interior_update_drop_new_node(b);
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void bch2_btree_interior_update_add_node_reference(struct bch_fs *c,
+                                                  struct btree_interior_update *as,
+                                                  struct btree *b)
+{
+       struct pending_btree_node_free *d;
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       /* Add this node to the list of nodes being freed: */
+       BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+       d = &as->pending[as->nr_pending++];
+       d->index_update_done    = false;
+       d->seq                  = b->data->keys.seq;
+       d->btree_id             = b->btree_id;
+       d->level                = b->level;
+       bkey_copy(&d->key, &b->key);
+
+       mutex_unlock(&c->btree_interior_update_lock);
+}
+
 /*
  * @b is being split/rewritten: it may have pointers to not-yet-written btree
  * nodes and thus outstanding btree_interior_updates - redirect @b's
@@ -1196,10 +1275,11 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 {
        struct closure *cl, *cl_n;
        struct btree_interior_update *p, *n;
-       struct pending_btree_node_free *d;
        struct btree_write *w;
        struct bset_tree *t;
 
+       bch2_btree_interior_update_add_node_reference(c, as, b);
+
        /*
         * Does this node have data that hasn't been written in the journal?
         *
@@ -1213,16 +1293,6 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 
        mutex_lock(&c->btree_interior_update_lock);
 
-       /* Add this node to the list of nodes being freed: */
-       BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
-       d = &as->pending[as->nr_pending++];
-       d->index_update_done    = false;
-       d->seq                  = b->data->keys.seq;
-       d->btree_id             = b->btree_id;
-       d->level                = b->level;
-       bkey_copy(&d->key, &b->key);
-
        /*
         * Does this node have any btree_interior_update operations preventing
         * it from being written?
@@ -1255,8 +1325,13 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
                                      &as->journal, interior_update_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
-       if (!list_empty(&b->reachable))
-               list_del_init(&b->reachable);
+       w = btree_prev_write(b);
+       bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+                                     &as->journal, interior_update_flush);
+       bch2_journal_pin_drop(&c->journal, &w->journal);
+
+       if (b->will_make_reachable)
+               __btree_interior_update_drop_new_node(b);
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -1301,7 +1376,7 @@ err:
 #endif
 }
 
-static enum btree_insert_ret
+static int
 bch2_btree_insert_keys_interior(struct btree *b,
                               struct btree_iter *iter,
                               struct keylist *insert_keys,
@@ -1324,7 +1399,7 @@ bch2_btree_insert_keys_interior(struct btree *b,
        if (bch_keylist_u64s(insert_keys) >
            bch_btree_keys_u64s_remaining(c, b)) {
                bch2_btree_node_unlock_write(b, iter);
-               return BTREE_INSERT_BTREE_NODE_FULL;
+               return -1;
        }
 
        /* Don't screw up @iter's position: */
@@ -1362,7 +1437,7 @@ bch2_btree_insert_keys_interior(struct btree *b,
        bch2_btree_node_unlock_write(b, iter);
 
        btree_node_interior_verify(b);
-       return BTREE_INSERT_OK;
+       return 0;
 }
 
 /*
@@ -1373,13 +1448,13 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
                                        struct btree_reserve *reserve,
                                        struct btree_interior_update *as)
 {
+       struct bch_fs *c = iter->c;
        size_t nr_packed = 0, nr_unpacked = 0;
        struct btree *n2;
        struct bset *set1, *set2;
        struct bkey_packed *k, *prev = NULL;
 
-       n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
-       list_add(&n2->reachable, &as->reachable_list);
+       n2 = bch2_btree_node_alloc(c, n1->level, iter->btree_id, as, reserve);
 
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
@@ -1528,8 +1603,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
 
        bch2_btree_interior_update_will_free_node(c, as, b);
 
-       n1 = bch2_btree_node_alloc_replacement(c, b, reserve);
-       list_add(&n1->reachable, &as->reachable_list);
+       n1 = bch2_btree_node_alloc_replacement(c, b, as, reserve);
 
        if (b->level)
                btree_split_insert_keys(iter, n1, insert_keys, reserve);
@@ -1558,8 +1632,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
                        /* Depth increases, make a new root */
                        n3 = __btree_root_alloc(c, b->level + 1,
                                                iter->btree_id,
-                                               reserve);
-                       list_add(&n3->reachable, &as->reachable_list);
+                                               as, reserve);
 
                        n3->sib_u64s[0] = U16_MAX;
                        n3->sib_u64s[1] = U16_MAX;
@@ -1641,16 +1714,10 @@ void bch2_btree_insert_node(struct btree *b,
        BUG_ON(!b->level);
        BUG_ON(!reserve || !as);
 
-       switch (bch2_btree_insert_keys_interior(b, iter, insert_keys,
-                                              as, reserve)) {
-       case BTREE_INSERT_OK:
-               break;
-       case BTREE_INSERT_BTREE_NODE_FULL:
+       if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) ||
+           bch2_btree_insert_keys_interior(b, iter, insert_keys,
+                                           as, reserve))
                btree_split(b, iter, insert_keys, reserve, as);
-               break;
-       default:
-               BUG();
-       }
 }
 
 static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags)
@@ -1859,8 +1926,7 @@ retry:
        bch2_btree_interior_update_will_free_node(c, as, b);
        bch2_btree_interior_update_will_free_node(c, as, m);
 
-       n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
-       list_add(&n->reachable, &as->reachable_list);
+       n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve);
 
        n->data->min_key        = prev->data->min_key;
        n->data->max_key        = next->data->max_key;
@@ -1945,6 +2011,8 @@ btree_insert_key(struct btree_insert *trans,
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
+       iter->flags &= ~BTREE_ITER_UPTODATE;
+
        ret = !btree_node_is_extents(b)
                ? bch2_insert_fixup_key(trans, insert)
                : bch2_insert_fixup_extent(trans, insert);
@@ -2383,8 +2451,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        bch2_btree_interior_update_will_free_node(c, as, b);
 
-       n = bch2_btree_node_alloc_replacement(c, b, reserve);
-       list_add(&n->reachable, &as->reachable_list);
+       n = bch2_btree_node_alloc_replacement(c, b, as, reserve);
 
        bch2_btree_build_aux_trees(n);
        six_unlock_write(&n->lock);
@@ -2464,3 +2531,140 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
        closure_sync(&cl);
        return ret;
 }
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
+                              struct bkey_i_extent *new_key)
+{
+       struct btree_interior_update *as;
+       struct btree_reserve *reserve = NULL;
+       struct btree *parent, *new_hash = NULL;
+       struct btree_iter iter;
+       struct closure cl;
+       bool must_rewrite_parent = false;
+       int ret;
+
+       __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+                              BTREE_MAX_DEPTH,
+                              b->level, 0);
+       closure_init_stack(&cl);
+
+       if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+               /* bch2_btree_reserve_get will unlock */
+               do {
+                       ret = bch2_btree_node_cannibalize_lock(c, &cl);
+                       closure_sync(&cl);
+               } while (ret == -EAGAIN);
+
+               BUG_ON(ret);
+
+               new_hash = bch2_btree_node_mem_alloc(c);
+       }
+retry:
+       reserve = bch2_btree_reserve_get(c, b, 0,
+                               BTREE_INSERT_NOFAIL|
+                               BTREE_INSERT_USE_RESERVE|
+                               BTREE_INSERT_USE_ALLOC_RESERVE,
+                               &cl);
+       closure_sync(&cl);
+       if (IS_ERR(reserve)) {
+               ret = PTR_ERR(reserve);
+               if (ret == -EAGAIN || ret == -EINTR)
+                       goto retry;
+               goto err;
+       }
+
+       down_read(&c->gc_lock);
+
+       ret = bch2_btree_iter_traverse(&iter);
+       if (ret)
+               goto err;
+
+       mutex_lock(&c->btree_interior_update_lock);
+
+       /*
+        * Two corner cases that need to be thought about here:
+        *
+        * @b may not be reachable yet - there might be another interior update
+        * operation waiting on @b to be written, and we're gonna deliver the
+        * write completion to that interior update operation _before_
+        * persisting the new_key update
+        *
+        * That ends up working without us having to do anything special here:
+        * the reason is, we do kick off (and do the in memory updates) for the
+        * update for @new_key before we return, creating a new interior_update
+        * operation here.
+        *
+        * The new interior update operation here will in effect override the
+        * previous one. The previous one was going to terminate - make @b
+        * reachable - in one of two ways:
+        * - updating the btree root pointer
+        *   In that case,
+        *   no, this doesn't work. argh.
+        */
+
+       if (b->will_make_reachable)
+               must_rewrite_parent = true;
+
+       /* The other corner case: the btree node might have been freed: */
+       if (iter.nodes[b->level] != b) {
+               BUG_ON(btree_node_hashed(b));
+               mutex_unlock(&c->btree_interior_update_lock);
+               goto err;
+       }
+
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+       if (ret)
+               goto err;
+
+       as = bch2_btree_interior_update_alloc(c);
+
+       if (must_rewrite_parent)
+               as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
+
+       bch2_btree_interior_update_add_node_reference(c, as, b);
+
+       if (new_hash) {
+               bkey_copy(&new_hash->key, &new_key->k_i);
+               BUG_ON(bch2_btree_node_hash_insert(c, new_hash,
+                                       b->level, b->btree_id));
+       }
+
+       parent = iter.nodes[b->level + 1];
+       if (parent) {
+               bch2_btree_insert_node(parent, &iter,
+                                      &keylist_single(&b->key),
+                                      reserve, as);
+       } else {
+               bch2_btree_set_root(&iter, b, as, reserve);
+       }
+
+       if (new_hash) {
+               mutex_lock(&c->btree_cache_lock);
+               bch2_btree_node_hash_remove(c, b);
+
+               bkey_copy(&b->key, &new_key->k_i);
+               __bch2_btree_node_hash_insert(c, b);
+
+               bch2_btree_node_hash_remove(c, new_hash);
+               mutex_unlock(&c->btree_cache_lock);
+       } else {
+               bkey_copy(&b->key, &new_key->k_i);
+       }
+err:
+       if (!IS_ERR_OR_NULL(reserve))
+               bch2_btree_reserve_put(c, reserve);
+       if (new_hash) {
+               mutex_lock(&c->btree_cache_lock);
+               list_move(&b->list, &c->btree_cache_freeable);
+               mutex_unlock(&c->btree_cache_lock);
+
+               six_unlock_write(&new_hash->lock);
+               six_unlock_intent(&new_hash->lock);
+       }
+       bch2_btree_iter_unlock(&iter);
+       up_read(&c->gc_lock);
+       return ret;
+}
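
The new_hash dance above deserves a note: the btree node cache hashes nodes by the pointers embedded in their key (PTR_HASH), so updating the key moves the node to a different bucket, and the swap must never leave a window where neither key finds the node. A toy model of the same placeholder trick, stripped of the locking and the real hash table (illustrative names; a direct-mapped table stands in for the node cache):

	#include <stdint.h>

	#define NR_BUCKETS 16

	struct node { uint64_t key; };

	static struct node *table[NR_BUCKETS];

	/* Re-key @n via @placeholder: the new key resolves before the old
	 * entry is dropped, mirroring new_hash in the function above. */
	static void rekey(struct node *n, struct node *placeholder,
			  uint64_t new_key)
	{
		placeholder->key = new_key;
		table[new_key % NR_BUCKETS] = placeholder;

		table[n->key % NR_BUCKETS] = NULL;	/* drop old-hash entry */
		n->key = new_key;
		table[new_key % NR_BUCKETS] = n;	/* replaces the placeholder */
	}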
index b5cfa890ca0d4bb27b149ab8a066ca375e20ac6b..086077f53895bf9850c74c4189a8a348ed1a8969 100644 (file)
@@ -76,6 +76,9 @@ struct btree_interior_update {
                BTREE_INTERIOR_UPDATING_AS,
        } mode;
 
+       unsigned                        flags;
+       struct btree_reserve            *reserve;
+
        /*
         * BTREE_INTERIOR_UPDATING_NODE:
         * The update that made the new nodes visible was a regular update to an
@@ -86,7 +89,6 @@ struct btree_interior_update {
         */
        struct btree                    *b;
        struct list_head                write_blocked_list;
-       struct list_head                reachable_list;
 
        /*
         * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
@@ -117,6 +119,10 @@ struct btree_interior_update {
        struct pending_btree_node_free  pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
        unsigned                        nr_pending;
 
+       /* New nodes, that will be made reachable by this update: */
+       struct btree                    *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+       unsigned                        nr_new_nodes;
+
        /* Only here to reduce stack usage on recursive splits: */
        struct keylist                  parent_keys;
        /*
@@ -127,6 +133,8 @@ struct btree_interior_update {
        u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
 };
 
+#define BTREE_INTERIOR_UPDATE_MUST_REWRITE     (1 << 0)
+
 #define for_each_pending_btree_node_free(c, as, p)                     \
        list_for_each_entry(as, &c->btree_interior_update_list, list)   \
                for (p = as->pending; p < as->pending + as->nr_pending; p++)
@@ -138,6 +146,7 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *);
 struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *,
                                             struct btree *,
                                             struct bkey_format,
+                                            struct btree_interior_update *,
                                             struct btree_reserve *);
 
 struct btree_interior_update *
@@ -426,6 +435,8 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
                            __le64, unsigned);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
+                              struct bkey_i_extent *);
 
 #endif /* _BCACHE_BTREE_INSERT_H */
 
index 74d54ab172a696e4d9d9500f1d623bd879c7d72b..1b0e3da195ed22fce916fecc80c8ae2abcccb6e1 100644 (file)
@@ -153,6 +153,37 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
        return nr_ptrs;
 }
 
+/* Doesn't clean up redundant crcs */
+void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+       EBUG_ON(ptr < &e.v->start->ptr ||
+               ptr >= &extent_entry_last(e)->ptr);
+       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+       memmove_u64s_down(ptr, ptr + 1,
+                         (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+       e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+       __bch2_extent_drop_ptr(e, ptr);
+       bch2_extent_drop_redundant_crcs(e);
+}
+
+void bch2_extent_drop_ptr_idx(struct bkey_s_extent e, unsigned idx)
+{
+       struct bch_extent_ptr *ptr;
+       unsigned i = 0;
+
+       extent_for_each_ptr(e, ptr)
+               if (i++ == idx)
+                       goto found;
+
+       BUG();
+found:
+       bch2_extent_drop_ptr(e, ptr);
+}
+
 /* returns true if equal */
 static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
 {
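
The pointer-drop helpers moved here boil down to shifting the tail of a packed u64 array down over the dropped entry and shrinking the key's u64 count. A stand-alone illustration with plain memmove (not the bcachefs helpers):

	#include <stdint.h>
	#include <string.h>

	/* Drop entry [idx] from an array of nr_u64s words packed as
	 * fixed-size entries of entry_u64s words each - the same shape as
	 * dropping a bch_extent_ptr and doing e.k->u64s -= sizeof(*ptr)/8: */
	static unsigned drop_entry(uint64_t *v, unsigned nr_u64s,
				   unsigned idx, unsigned entry_u64s)
	{
		uint64_t *entry = v + idx * entry_u64s;
		uint64_t *next  = entry + entry_u64s;

		memmove(entry, next, (v + nr_u64s - next) * sizeof(*v));
		return nr_u64s - entry_u64s;	/* the new u64 count */
	}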
index 3a9524846e276277b3a08831bf1f3a5c493baab0..3dc06cb20aa06c74528c6f307f93c5b0fab4b48e 100644 (file)
@@ -552,24 +552,9 @@ static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
 void bch2_extent_narrow_crcs(struct bkey_s_extent);
 void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
 
-/* Doesn't cleanup redundant crcs */
-static inline void __bch2_extent_drop_ptr(struct bkey_s_extent e,
-                                        struct bch_extent_ptr *ptr)
-{
-       EBUG_ON(ptr < &e.v->start->ptr ||
-               ptr >= &extent_entry_last(e)->ptr);
-       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-       memmove_u64s_down(ptr, ptr + 1,
-                         (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
-       e.k->u64s -= sizeof(*ptr) / sizeof(u64);
-}
-
-static inline void bch2_extent_drop_ptr(struct bkey_s_extent e,
-                                      struct bch_extent_ptr *ptr)
-{
-       __bch2_extent_drop_ptr(e, ptr);
-       bch2_extent_drop_redundant_crcs(e);
-}
+void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
 
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
index 803611d1894c0ede591510c708a8e92caf9d9d8c..079f958b577b2654ccb7666b2b9b7efb75b90f09 100644 (file)
@@ -871,9 +871,8 @@ static void bch2_writepage_io_free(struct closure *cl)
 {
        struct bch_writepage_io *io = container_of(cl,
                                        struct bch_writepage_io, cl);
-       struct bio *bio = &io->bio.bio;
 
-       bio_put(bio);
+       bio_put(&io->op.op.wbio.bio);
 }
 
 static void bch2_writepage_io_done(struct closure *cl)
@@ -881,7 +880,7 @@ static void bch2_writepage_io_done(struct closure *cl)
        struct bch_writepage_io *io = container_of(cl,
                                        struct bch_writepage_io, cl);
        struct bch_fs *c = io->op.op.c;
-       struct bio *bio = &io->bio.bio;
+       struct bio *bio = &io->op.op.wbio.bio;
        struct bio_vec *bvec;
        unsigned i;
 
@@ -940,11 +939,12 @@ static void bch2_writepage_io_done(struct closure *cl)
 static void bch2_writepage_do_io(struct bch_writepage_state *w)
 {
        struct bch_writepage_io *io = w->io;
+       struct bio *bio = &io->op.op.wbio.bio;
 
        w->io = NULL;
-       atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages);
+       atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages);
 
-       io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector;
+       io->op.op.pos.offset = bio->bi_iter.bi_sector;
 
        closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
        continue_at(&io->cl, bch2_writepage_io_done, NULL);
@@ -970,13 +970,13 @@ alloc_io:
                w->io = container_of(bio_alloc_bioset(GFP_NOFS,
                                                      BIO_MAX_PAGES,
                                                      bch2_writepage_bioset),
-                                    struct bch_writepage_io, bio.bio);
+                                    struct bch_writepage_io, op.op.wbio.bio);
 
                closure_init(&w->io->cl, NULL);
                w->io->op.ei            = ei;
                w->io->op.sectors_added = 0;
                w->io->op.is_dio        = false;
-               bch2_write_op_init(&w->io->op.op, c, &w->io->bio,
+               bch2_write_op_init(&w->io->op.op, c,
                                  (struct disk_reservation) {
                                        .nr_replicas = c->opts.data_replicas,
                                  },
@@ -987,7 +987,7 @@ alloc_io:
        }
 
        if (w->io->op.op.res.nr_replicas != nr_replicas ||
-           bio_add_page_contig(&w->io->bio.bio, page)) {
+           bio_add_page_contig(&w->io->op.op.wbio.bio, page)) {
                bch2_writepage_do_io(w);
                goto alloc_io;
        }
@@ -1038,7 +1038,7 @@ do_io:
        w->io->op.new_i_size = i_size;
 
        if (wbc->sync_mode == WB_SYNC_ALL)
-               w->io->bio.bio.bi_opf |= REQ_SYNC;
+               w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
 
        /* Before unlocking the page, transfer reservation to w->io: */
        old = page_state_cmpxchg(page_state(page), new, {
@@ -1110,7 +1110,7 @@ get_pages:
                done_index = page->index;
 
                if (w.io &&
-                   !bio_can_add_page_contig(&w.io->bio.bio, page))
+                   !bio_can_add_page_contig(&w.io->op.op.wbio.bio, page))
                        bch2_writepage_do_io(&w);
 
                if (!w.io &&
@@ -1495,7 +1495,7 @@ static long __bch2_dio_write_complete(struct dio_write *dio)
        if (dio->iovec && dio->iovec != dio->inline_vecs)
                kfree(dio->iovec);
 
-       bio_put(&dio->bio.bio);
+       bio_put(&dio->iop.op.wbio.bio);
        return ret;
 }
 
@@ -1517,11 +1517,11 @@ static void bch2_dio_write_done(struct dio_write *dio)
        if (dio->iop.op.error)
                dio->error = dio->iop.op.error;
 
-       bio_for_each_segment_all(bv, &dio->bio.bio, i)
+       bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
                put_page(bv->bv_page);
 
        if (dio->iter.count)
-               bio_reset(&dio->bio.bio);
+               bio_reset(&dio->iop.op.wbio.bio);
 }
 
 static void bch2_do_direct_IO_write(struct dio_write *dio)
@@ -1529,7 +1529,7 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
        struct file *file = dio->req->ki_filp;
        struct inode *inode = file->f_inode;
        struct bch_inode_info *ei = to_bch_ei(inode);
-       struct bio *bio = &dio->bio.bio;
+       struct bio *bio = &dio->iop.op.wbio.bio;
        unsigned flags = 0;
        int ret;
 
@@ -1537,8 +1537,6 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
            !dio->c->opts.journal_flush_disabled)
                flags |= BCH_WRITE_FLUSH;
 
-       bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9;
-
        ret = bio_iov_iter_get_pages(bio, &dio->iter);
        if (ret < 0) {
                /*
@@ -1555,10 +1553,9 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
        dio->iop.sectors_added  = 0;
        dio->iop.is_dio         = true;
        dio->iop.new_i_size     = U64_MAX;
-       bch2_write_op_init(&dio->iop.op, dio->c, &dio->bio,
-                         dio->res,
+       bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
                          foreground_write_point(dio->c, inode->i_ino),
-                         POS(inode->i_ino, bio->bi_iter.bi_sector),
+                         POS(inode->i_ino, (dio->offset + dio->written) >> 9),
                          &ei->journal_seq, flags);
        dio->iop.op.index_update_fn = bchfs_write_index_update;
 
@@ -1619,7 +1616,7 @@ static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req,
        bio = bio_alloc_bioset(GFP_KERNEL,
                               iov_iter_npages(iter, BIO_MAX_PAGES),
                               bch2_dio_write_bioset);
-       dio = container_of(bio, struct dio_write, bio.bio);
+       dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
        dio->req        = req;
        dio->c          = c;
        dio->written    = 0;
index 3fcc1e7dd8259e866135b2e49858462419933883..252a4039f12d0dbc3f2624aff88eedfcbce6fa4a 100644 (file)
@@ -46,16 +46,16 @@ struct bchfs_write_op {
        s64                     sectors_added;
        bool                    is_dio;
        u64                     new_i_size;
+
+       /* must be last: */
        struct bch_write_op     op;
 };
 
 struct bch_writepage_io {
        struct closure          cl;
 
+       /* must be last: */
        struct bchfs_write_op   op;
-
-       /* must come last: */
-       struct bch_write_bio    bio;
 };
 
 extern struct bio_set *bch2_writepage_bioset;
@@ -76,10 +76,8 @@ struct dio_write {
 
        struct mm_struct        *mm;
 
-       struct bchfs_write_op   iop;
-
        /* must be last: */
-       struct bch_write_bio    bio;
+       struct bchfs_write_op   iop;
 };
 
 extern struct bio_set *bch2_dio_write_bioset;
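
The "must be last" annotations are load-bearing: the bio at the tail of bch_write_op is allocated from a bioset using the offset of the embedded bio (see the bioset_create() calls in fs.c below), and the enclosing ops are recovered from the bio with container_of. A minimal userspace sketch of the pattern, with simplified stand-in types rather than the real structs:

	#include <stddef.h>

	struct bio { int opf; };	/* stand-in; real bios carry inline bvecs */

	struct write_op {
		int		written;
		struct bio	bio;	/* must be last: the allocator returns
					 * storage sized for bio + its bvecs */
	};

	struct writepage_io {
		int		cl;
		struct write_op	op;	/* must be last, for the same reason */
	};

	#define container_of(ptr, type, member) \
		((type *) ((char *) (ptr) - offsetof(type, member)))

	/* A bioset created with offsetof(struct writepage_io, op.bio)
	 * hands out bios preceded by their writepage_io, so: */
	static struct writepage_io *io_from_bio(struct bio *bio)
	{
		return container_of(bio, struct writepage_io, op.bio);
	}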
index 201cdfcb2a3c0770c7c00bebb7d0c0445415cd4d..6c9792e8a9912bbe9563f989c5f4b6ce0c12c878 100644 (file)
@@ -1458,7 +1458,7 @@ int __init bch2_vfs_init(void)
                goto err;
 
        bch2_writepage_bioset =
-               bioset_create(4, offsetof(struct bch_writepage_io, bio.bio));
+               bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
        if (!bch2_writepage_bioset)
                goto err;
 
@@ -1466,7 +1466,8 @@ int __init bch2_vfs_init(void)
        if (!bch2_dio_read_bioset)
                goto err;
 
-       bch2_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio));
+       bch2_dio_write_bioset =
+               bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
        if (!bch2_dio_write_bioset)
                goto err;
 
index 54b523d435dd63e9b561fabfe5ace49ddf131272..78cdaa32c0a19272920ff2afb64117769830a6cd 100644 (file)
@@ -92,12 +92,10 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
        const struct bch_extent_ptr *ptr;
        struct bch_write_bio *n;
        struct bch_dev *ca;
+       unsigned ptr_idx = 0;
 
        BUG_ON(c->opts.nochanges);
 
-       wbio->split = false;
-       wbio->c = c;
-
        extent_for_each_ptr(e, ptr) {
                ca = c->devs[ptr->dev];
 
@@ -107,24 +105,26 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
-                       n->c                    = c;
-                       n->orig                 = &wbio->bio;
-                       n->bounce               = false;
+                       n->parent               = wbio;
                        n->split                = true;
+                       n->bounce               = false;
                        n->put_bio              = true;
                        n->bio.bi_opf           = wbio->bio.bi_opf;
-                       __bio_inc_remaining(n->orig);
+                       __bio_inc_remaining(&wbio->bio);
                } else {
                        n = wbio;
+                       n->split                = false;
                }
 
-               if (!journal_flushes_device(ca))
-                       n->bio.bi_opf |= REQ_FUA;
-
+               n->c                    = c;
                n->ca                   = ca;
+               n->ptr_idx              = ptr_idx++;
                n->submit_time_us       = local_clock_us();
                n->bio.bi_iter.bi_sector = ptr->offset;
 
+               if (!journal_flushes_device(ca))
+                       n->bio.bi_opf |= REQ_FUA;
+
                if (likely(percpu_ref_tryget(&ca->io_ref))) {
                        n->have_io_ref          = true;
                        n->bio.bi_bdev          = ca->disk_sb.bdev;
@@ -250,10 +250,9 @@ static void bch2_write_index(struct closure *cl)
 static void bch2_write_discard(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->bio->bio;
        struct bpos end = op->pos;
 
-       end.offset += bio_sectors(bio);
+       end.offset += bio_sectors(&op->wbio.bio);
 
        op->error = bch2_discard(op->c, op->pos, end, op->version,
                                &op->res, NULL, NULL);
@@ -308,31 +307,28 @@ static void bch2_write_io_error(struct closure *cl)
 
 static void bch2_write_endio(struct bio *bio)
 {
-       struct closure *cl = bio->bi_private;
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bch_write_bio *wbio = to_wbio(bio);
-       struct bch_fs *c = wbio->c;
-       struct bio *orig = wbio->orig;
-       struct bch_dev *ca = wbio->ca;
+       struct closure *cl              = bio->bi_private;
+       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = wbio->ca;
 
        if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca,
-                                      "data write"))
+                                       "data write"))
                set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
 
        if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
 
-       if (bio->bi_error && orig)
-               orig->bi_error = bio->bi_error;
-
        if (wbio->bounce)
                bch2_bio_free_pages_pool(c, bio);
 
        if (wbio->put_bio)
                bio_put(bio);
 
-       if (orig)
-               bio_endio(orig);
+       if (parent)
+               bio_endio(&parent->bio);
        else
                closure_put(cl);
 }
@@ -380,11 +376,10 @@ static void init_append_extent(struct bch_write_op *op,
        bch2_keylist_push(&op->insert_keys);
 }
 
-static int bch2_write_extent(struct bch_write_op *op,
-                           struct open_bucket *ob,
-                           struct bio *orig)
+static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
 {
        struct bch_fs *c = op->c;
+       struct bio *orig = &op->wbio.bio;
        struct bio *bio;
        struct bch_write_bio *wbio;
        unsigned key_to_write_offset = op->insert_keys.top_p -
@@ -392,11 +387,13 @@ static int bch2_write_extent(struct bch_write_op *op,
        struct bkey_i *key_to_write;
        unsigned csum_type = op->csum_type;
        unsigned compression_type = op->compression_type;
-       int ret;
+       int ret, more;
 
        /* don't refetch csum type/compression type */
        barrier();
 
+       BUG_ON(!bio_sectors(orig));
+
        /* Need to decompress data? */
        if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
            (crc_uncompressed_size(NULL, &op->crc) != op->size ||
@@ -421,11 +418,8 @@ static int bch2_write_extent(struct bch_write_op *op,
                                   ob);
 
                bio                     = orig;
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = false;
-               wbio->put_bio           = false;
-               ret                     = 0;
+               wbio                    = wbio_init(bio);
+               more                    = 0;
        } else if (csum_type != BCH_CSUM_NONE ||
                   compression_type != BCH_COMPRESSION_NONE) {
                /* all units here in bytes */
@@ -439,19 +433,18 @@ static int bch2_write_extent(struct bch_write_op *op,
                bio = bio_alloc_bioset(GFP_NOIO,
                                       DIV_ROUND_UP(output_available, PAGE_SIZE),
                                       &c->bio_write);
+               wbio                    = wbio_init(bio);
+               wbio->bounce            = true;
+               wbio->put_bio           = true;
+               /* copy WRITE_SYNC flag */
+               wbio->bio.bi_opf        = orig->bi_opf;
+
                /*
                 * XXX: can't use mempool for more than
                 * BCH_COMPRESSED_EXTENT_MAX worth of pages
                 */
                bch2_bio_alloc_pages_pool(c, bio, output_available);
 
-               /* copy WRITE_SYNC flag */
-               bio->bi_opf             = orig->bi_opf;
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = true;
-               wbio->put_bio           = true;
-
                do {
                        unsigned fragment_compression_type = compression_type;
                        size_t dst_len, src_len;
@@ -504,45 +497,43 @@ static int bch2_write_extent(struct bch_write_op *op,
                        mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
                                     &c->bio_bounce_pages);
 
-               ret = orig->bi_iter.bi_size != 0;
+               more = orig->bi_iter.bi_size != 0;
        } else {
                bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
                                     &c->bio_write);
-
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = false;
+               wbio                    = wbio_init(bio);
                wbio->put_bio           = bio != orig;
 
                init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
                                   compression_type, 0,
                                   (struct bch_csum) { 0 }, csum_type, ob);
 
-               ret = bio != orig;
+               more = bio != orig;
        }
 
+       /* insert_keys may have been reallocated - recompute key_to_write: */
+
+       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+       ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+                                   BCH_DATA_USER);
+       if (ret)
+               return ret;
+
        bio->bi_end_io  = bch2_write_endio;
        bio->bi_private = &op->cl;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 
        closure_get(bio->bi_private);
 
-       /* might have done a realloc... */
-
-       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
-       bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
-                             BCH_DATA_USER);
-
        bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
-       return ret;
+       return more;
 }
 
 static void __bch2_write(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
-       struct bio *bio = &op->bio->bio;
        unsigned open_bucket_nr = 0;
        struct open_bucket *b;
        int ret;
@@ -550,22 +541,12 @@ static void __bch2_write(struct closure *cl)
        memset(op->open_buckets, 0, sizeof(op->open_buckets));
 
        if (op->flags & BCH_WRITE_DISCARD) {
-               op->flags |= BCH_WRITE_DONE;
                bch2_write_discard(cl);
-               bio_put(bio);
+               op->flags |= BCH_WRITE_DONE;
                continue_at(cl, bch2_write_done, index_update_wq(op));
        }
 
-       /*
-        * Journal writes are marked REQ_PREFLUSH; if the original write was a
-        * flush, it'll wait on the journal write.
-        */
-       bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
        do {
-               EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
-               EBUG_ON(!bio_sectors(bio));
-
                if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
                        continue_at(cl, bch2_write_index, index_update_wq(op));
 
@@ -622,7 +603,7 @@ static void __bch2_write(struct closure *cl)
                       b - c->open_buckets > U8_MAX);
                op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
 
-               ret = bch2_write_extent(op, b, bio);
+               ret = bch2_write_extent(op, b);
 
                bch2_alloc_sectors_done(c, op->wp, b);
 
@@ -703,16 +684,13 @@ void bch2_wake_delayed_writes(unsigned long data)
  * after the data is written it calls bch_journal, and after the keys have been
  * added to the next journal write they're inserted into the btree.
  *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->wbio.bio and op->pos.inode.
  */
 void bch2_write(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->bio->bio;
+       struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
        u64 inode = op->pos.inode;
 
@@ -742,7 +720,7 @@ void bch2_write(struct closure *cl)
 
                spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
                bch2_ratelimit_increment(&c->foreground_write_pd.rate,
-                                       bio->bi_iter.bi_size);
+                                        bio->bi_iter.bi_size);
 
                delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
 
@@ -776,15 +754,14 @@ void bch2_write(struct closure *cl)
 }
 
 void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                      struct bch_write_bio *bio, struct disk_reservation res,
-                      struct write_point *wp, struct bpos pos,
-                      u64 *journal_seq, unsigned flags)
+                       struct disk_reservation res,
+                       struct write_point *wp, struct bpos pos,
+                       u64 *journal_seq, unsigned flags)
 {
        EBUG_ON(res.sectors && !res.nr_replicas);
 
        op->c           = c;
        op->io_wq       = index_update_wq(op);
-       op->bio         = bio;
        op->written     = 0;
        op->error       = 0;
        op->flags       = flags;
@@ -983,7 +960,7 @@ static void cache_promote_done(struct closure *cl)
        struct cache_promote_op *op =
                container_of(cl, struct cache_promote_op, cl);
 
-       bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
+       bch2_bio_free_pages_pool(op->write.op.c, &op->write.op.wbio.bio);
        kfree(op);
 }
 
@@ -1020,7 +997,7 @@ static void __bch2_read_endio(struct work_struct *work)
                trace_promote(&rbio->bio);
 
                /* we now own pages: */
-               swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
+               swap(promote->write.op.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
                rbio->promote = NULL;
 
                bch2_rbio_done(rbio);
@@ -1112,7 +1089,7 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
                promote_op = kmalloc(sizeof(*promote_op) +
                                sizeof(struct bio_vec) * pages, GFP_NOIO);
                if (promote_op) {
-                       struct bio *promote_bio = &promote_op->write.wbio.bio;
+                       struct bio *promote_bio = &promote_op->write.op.wbio.bio;
 
                        bio_init(promote_bio,
                                 promote_bio->bi_inline_vecs,
@@ -1204,7 +1181,7 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
        rbio->bio.bi_end_io     = bch2_read_endio;
 
        if (promote_op) {
-               struct bio *promote_bio = &promote_op->write.wbio.bio;
+               struct bio *promote_bio = &promote_op->write.op.wbio.bio;
 
                promote_bio->bi_iter = rbio->bio.bi_iter;
                memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
@@ -1367,12 +1344,11 @@ void bch2_read_retry_work(struct work_struct *work)
                                           read_retry_work);
        struct bch_read_bio *rbio;
        struct bio *bio;
-       unsigned long flags;
 
        while (1) {
-               spin_lock_irqsave(&c->read_retry_lock, flags);
+               spin_lock_irq(&c->read_retry_lock);
                bio = bio_list_pop(&c->read_retry_list);
-               spin_unlock_irqrestore(&c->read_retry_lock, flags);
+               spin_unlock_irq(&c->read_retry_lock);
 
                if (!bio)
                        break;
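
Dropping irqsave above is correct because bch2_read_retry_work always runs from a workqueue, i.e. process context with interrupts enabled, so there is no caller interrupt state to preserve; spin_lock_irq() is enough to exclude the bi_end_io handlers that fill the list from hardirq context. The drain loop is the usual pop-under-lock shape; a sketch with illustrative names (retry_lock, retry_list and resubmit are placeholders):

	static DEFINE_SPINLOCK(retry_lock);
	static struct bio_list retry_list;	/* appended to from bi_end_io */

	static void retry_work_fn(struct work_struct *work)
	{
		struct bio *bio;

		for (;;) {
			spin_lock_irq(&retry_lock);
			bio = bio_list_pop(&retry_list);
			spin_unlock_irq(&retry_lock);

			if (!bio)
				break;

			resubmit(bio);	/* placeholder for the actual retry */
		}
	}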
index fb6f3005ef96447c095a97ad5aa1e5a8a8ba5eb0..619bf56b91e7778af01a3fe71f52f23b6deca702 100644 (file)
@@ -41,11 +41,18 @@ static inline struct write_point *foreground_write_point(struct bch_fs *c,
 }
 
 void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
-                       struct bch_write_bio *,
                        struct disk_reservation, struct write_point *,
                        struct bpos, u64 *, unsigned);
 void bch2_write(struct closure *);
 
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+       struct bch_write_bio *wbio = to_wbio(bio);
+
+       memset(wbio, 0, offsetof(struct bch_write_bio, bio));
+       return wbio;
+}
+
 struct cache_promote_op;
 
 struct extent_pick_ptr;
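
wbio_init() relies on the embedded bio being the last member: it zeroes only the bch_write_bio header fields, stopping at offsetof(..., bio) so the bio that the bioset just initialized is left intact. The same partial-memset idiom in isolation (simplified stand-in struct):

	#include <stddef.h>
	#include <string.h>

	struct bio { int opf; };

	struct write_bio {
		void		*c, *ca, *parent;
		unsigned	split:1, bounce:1, put_bio:1;
		struct bio	bio;	/* must be last */
	};

	/* Zero every field preceding the embedded bio, nothing more: */
	static struct write_bio *wbio_init(struct write_bio *wbio)
	{
		memset(wbio, 0, offsetof(struct write_bio, bio));
		return wbio;
	}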
index d104cb72d113edd9a4780affaefc6927a0ac3189..3b73bcff4be2fab3d642f0ce48f24df6febd8803 100644 (file)
@@ -66,37 +66,30 @@ struct bch_write_bio {
        struct bch_fs           *c;
        struct bch_dev          *ca;
        union {
-               struct bio      *orig;
-               struct closure  *cl;
+               struct bch_write_bio    *parent;
+               struct closure          *cl;
        };
 
-       unsigned                submit_time_us;
+       u8                      ptr_idx;
+       u8                      replicas_failed;
+       u8                      order;
+
        unsigned                split:1,
                                bounce:1,
                                put_bio:1,
-                               have_io_ref:1;
+                               have_io_ref:1,
+                               used_mempool:1;
 
-       /* Only for btree writes: */
-       unsigned                used_mempool:1;
-       u8                      order;
+       unsigned                submit_time_us;
+       void                    *data;
 
        struct bio              bio;
 };
 
-struct bch_replace_info {
-       struct extent_insert_hook       hook;
-       /* How many insertions succeeded */
-       unsigned                        successes;
-       /* How many insertions failed */
-       unsigned                        failures;
-       BKEY_PADDED(key);
-};
-
 struct bch_write_op {
        struct closure          cl;
-       struct bch_fs   *c;
+       struct bch_fs           *c;
        struct workqueue_struct *io_wq;
-       struct bch_write_bio    *bio;
 
        unsigned                written; /* sectors */
 
@@ -141,6 +134,9 @@ struct bch_write_op {
 
        struct keylist          insert_keys;
        u64                     inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+       /* Must be last: */
+       struct bch_write_bio    wbio;
 };
 
 #endif /* _BCACHE_IO_TYPES_H */
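
With orig replaced by parent, completion of a replicated write is a walk up a chain of bch_write_bios: each split child holds a count on its parent (__bio_inc_remaining), and the last completion either ends the parent bio or, at the top, drops the closure ref. A self-contained toy model of that chain (simplified counting; the real code keeps the count inside struct bio):

	#include <stddef.h>

	struct wbio {
		struct wbio	*parent;	/* NULL for the original write bio */
		int		pending;	/* own I/O plus outstanding children */
	};

	static void wbio_complete(struct wbio *w);

	static void wbio_endio(struct wbio *w)	/* one I/O (own or child) finished */
	{
		if (--w->pending == 0)
			wbio_complete(w);
	}

	static void submit_split(struct wbio *parent, struct wbio *child)
	{
		child->parent	= parent;
		child->pending	= 1;
		parent->pending++;		/* like __bio_inc_remaining() */
	}

	static void wbio_complete(struct wbio *w)
	{
		if (w->parent)
			wbio_endio(w->parent);	/* like bio_endio(&parent->bio) */
		/* else: hand the completion to the closure, as bch2_write_endio()
		 * does with closure_put() */
	}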
index b0011b43e1e0a3208ba18d5adf9d918cf8fd2a25..bf8c1528dabfe2d537b15d99e663237268179a91 100644 (file)
@@ -53,15 +53,15 @@ static inline u64 journal_pin_seq(struct journal *j,
        return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
 }
 
-static inline void bch2_journal_add_entry(struct journal_buf *buf,
-                                        const void *data, size_t u64s,
-                                        unsigned type, enum btree_id id,
-                                        unsigned level)
+static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf,
+                                unsigned type, enum btree_id id,
+                                unsigned level,
+                                const void *data, size_t u64s)
 {
        struct jset *jset = buf->data;
 
-       bch2_journal_add_entry_at(buf, data, u64s, type, id, level,
-                                le32_to_cpu(jset->u64s));
+       bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s),
+                                 type, id, level, data, u64s);
        le32_add_cpu(&jset->u64s, jset_u64s(u64s));
 }
 
@@ -97,8 +97,9 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
                                       enum btree_id id, struct bkey_i *k,
                                       unsigned level)
 {
-       bch2_journal_add_entry(buf, k, k->k.u64s,
-                             JOURNAL_ENTRY_BTREE_ROOT, id, level);
+       bch2_journal_add_entry_noreservation(buf,
+                             JOURNAL_ENTRY_BTREE_ROOT, id, level,
+                             k, k->k.u64s);
 }
 
 static void journal_seq_blacklist_flush(struct journal *j,
@@ -416,13 +417,8 @@ static void journal_entry_null_range(void *start, void *end)
 {
        struct jset_entry *entry;
 
-       for (entry = start; entry != end; entry = vstruct_next(entry)) {
-               entry->u64s     = 0;
-               entry->btree_id = 0;
-               entry->level    = 0;
-               entry->flags    = 0;
-               SET_JOURNAL_ENTRY_TYPE(entry, 0);
-       }
+       for (entry = start; entry != end; entry = vstruct_next(entry))
+               memset(entry, 0, sizeof(*entry));
 }
 
 static int journal_validate_key(struct bch_fs *c, struct jset *j,
@@ -514,7 +510,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
                        break;
                }
 
-               switch (JOURNAL_ENTRY_TYPE(entry)) {
+               switch (entry->type) {
                case JOURNAL_ENTRY_BTREE_KEYS:
                        vstruct_for_each(entry, k) {
                                ret = journal_validate_key(c, j, entry, k,
@@ -555,8 +551,8 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
 
                        break;
                default:
-                       journal_entry_err(c, "invalid journal entry type %llu",
-                                JOURNAL_ENTRY_TYPE(entry));
+                       journal_entry_err(c, "invalid journal entry type %u",
+                                         entry->type);
                        journal_entry_null_range(entry, vstruct_next(entry));
                        break;
                }
@@ -1426,9 +1422,9 @@ void bch2_journal_start(struct bch_fs *c)
         */
        list_for_each_entry(bl, &j->seq_blacklist, list)
                if (!bl->written) {
-                       bch2_journal_add_entry(journal_cur_buf(j), &bl->seq, 1,
+                       bch2_journal_add_entry_noreservation(journal_cur_buf(j),
                                        JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
-                                       0, 0);
+                                       0, 0, &bl->seq, 1);
 
                        journal_pin_add_entry(j,
                                              &fifo_peek_back(&j->pin),
@@ -2083,8 +2079,8 @@ static void journal_write_compact(struct jset *jset)
                if (prev &&
                    i->btree_id == prev->btree_id &&
                    i->level    == prev->level &&
-                   JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
-                   JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
+                   i->type     == prev->type &&
+                   i->type     == JOURNAL_ENTRY_BTREE_KEYS &&
                    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
                        memmove_u64s_down(vstruct_next(prev),
                                          i->_data,
@@ -2238,8 +2234,9 @@ static void journal_write(struct closure *cl)
                closure_return_with_destructor(cl, journal_write_done);
        }
 
-       bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
-                             BCH_DATA_JOURNAL);
+       if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
+                                 BCH_DATA_JOURNAL))
+               goto err;
 
        /*
         * XXX: we really should just disable the entire journal in nochanges
index 88a9bd12447d803a11f91656a87da543c0d15024..d785a0cbc5a51c2370bf5be83444e7ee0db82c0c 100644 (file)
@@ -125,7 +125,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
                                        struct jset_entry *entry, unsigned type)
 {
        while (entry < vstruct_last(jset)) {
-               if (JOURNAL_ENTRY_TYPE(entry) == type)
+               if (entry->type == type)
                        return entry;
 
                entry = vstruct_next(entry);
@@ -187,8 +187,12 @@ static inline void journal_state_inc(union journal_res_state *s)
        s->buf1_count += s->idx == 1;
 }
 
-static inline void bch2_journal_set_has_inode(struct journal_buf *buf, u64 inum)
+static inline void bch2_journal_set_has_inode(struct journal *j,
+                                             struct journal_res *res,
+                                             u64 inum)
 {
+       struct journal_buf *buf = &j->buf[res->idx];
+
        set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
 }
 
@@ -202,40 +206,46 @@ static inline unsigned jset_u64s(unsigned u64s)
 }
 
 static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
-                                           const void *data, size_t u64s,
+                                           unsigned offset,
                                            unsigned type, enum btree_id id,
-                                           unsigned level, unsigned offset)
+                                           unsigned level,
+                                           const void *data, size_t u64s)
 {
        struct jset_entry *entry = vstruct_idx(buf->data, offset);
 
-       entry->u64s = cpu_to_le16(u64s);
+       memset(entry, 0, sizeof(*entry));
+       entry->u64s     = cpu_to_le16(u64s);
        entry->btree_id = id;
-       entry->level = level;
-       entry->flags = 0;
-       SET_JOURNAL_ENTRY_TYPE(entry, type);
+       entry->level    = level;
+       entry->type     = type;
 
        memcpy_u64s(entry->_data, data, u64s);
 }
 
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
-                                       enum btree_id id, const struct bkey_i *k)
+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+                                         unsigned type, enum btree_id id,
+                                         unsigned level,
+                                         const void *data, unsigned u64s)
 {
        struct journal_buf *buf = &j->buf[res->idx];
-       unsigned actual = jset_u64s(k->k.u64s);
+       unsigned actual = jset_u64s(u64s);
 
        EBUG_ON(!res->ref);
        BUG_ON(actual > res->u64s);
 
-       bch2_journal_set_has_inode(buf, k->k.p.inode);
-
-       bch2_journal_add_entry_at(buf, k, k->k.u64s,
-                                JOURNAL_ENTRY_BTREE_KEYS, id,
-                                0, res->offset);
-
+       bch2_journal_add_entry_at(buf, res->offset, type,
+                                 id, level, data, u64s);
        res->offset     += actual;
        res->u64s       -= actual;
 }
 
+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
+                                       enum btree_id id, const struct bkey_i *k)
+{
+       bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS,
+                              id, 0, k, k->k.u64s);
+}
+
 void bch2_journal_buf_put_slowpath(struct journal *, bool);
 
 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
@@ -272,13 +282,10 @@ static inline void bch2_journal_res_put(struct journal *j,
 
        lock_release(&j->res_map, 0, _RET_IP_);
 
-       while (res->u64s) {
-               bch2_journal_add_entry_at(&j->buf[res->idx], NULL, 0,
-                                        JOURNAL_ENTRY_BTREE_KEYS,
-                                        0, 0, res->offset);
-               res->offset     += jset_u64s(0);
-               res->u64s       -= jset_u64s(0);
-       }
+       while (res->u64s)
+               bch2_journal_add_entry(j, res,
+                                      JOURNAL_ENTRY_BTREE_KEYS,
+                                      0, 0, NULL, 0);
 
        bch2_journal_buf_put(j, res->idx, false);
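
All of the reservation arithmetic here is in units of u64s, and every entry costs jset_u64s(u64s): the payload plus the jset_entry header. That is also why the res-put path above can burn leftover reservation with empty entries - jset_u64s(0) is still nonzero. A worked example, assuming the header is one u64 (i.e. sizeof(struct jset_entry) / sizeof(u64) == 1):

	/* Cost of an entry with a given payload, in u64s: */
	static inline unsigned jset_u64s_cost(unsigned u64s)
	{
		return u64s + 1;	/* payload + one-u64 header (assumed) */
	}

	/*
	 * Adding a 3-u64 key:  actual = 3 + 1 = 4,
	 *                      so res->offset += 4 and res->u64s -= 4.
	 * Padding leftovers:   each empty entry costs 0 + 1 = 1, which is
	 *                      what the while (res->u64s) loop drains.
	 */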
 
index ba0cc0e45c783e75ff202302bf897621987d086a..78f6d3c12d1cfe32a88c2d3711d1b3f72094237e 100644 (file)
@@ -128,9 +128,12 @@ int bch2_move_data_off_device(struct bch_dev *ca)
                        seen_key_count++;
                        continue;
 next:
-                       if (bkey_extent_is_data(k.k))
-                               bch2_check_mark_super(c, bkey_s_c_to_extent(k),
-                                                     BCH_DATA_USER);
+                       if (bkey_extent_is_data(k.k)) {
+                               ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+                                                           BCH_DATA_USER);
+                               if (ret)
+                                       break;
+                       }
                        bch2_btree_iter_advance_pos(&iter);
                        bch2_btree_iter_cond_resched(&iter);
 
@@ -386,9 +389,12 @@ int bch2_flag_data_bad(struct bch_dev *ca)
                 */
                continue;
 advance:
-               if (bkey_extent_is_data(k.k))
-                       bch2_check_mark_super(c, bkey_s_c_to_extent(k),
-                                             BCH_DATA_USER);
+               if (bkey_extent_is_data(k.k)) {
+                       ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+                                                   BCH_DATA_USER);
+                       if (ret)
+                               break;
+               }
                bch2_btree_iter_advance_pos(&iter);
        }
 
index 8c9395ded10cca7fff7b33e519fcf7077b88cc98..8ef1a0b77cfddeeef213e8a6a42bb73b3fe04929 100644 (file)
@@ -155,11 +155,8 @@ void bch2_migrate_write_init(struct bch_fs *c,
            (move_ptr && move_ptr->cached))
                flags |= BCH_WRITE_CACHED;
 
-       bch2_write_op_init(&m->op, c, &m->wbio,
-                         (struct disk_reservation) { 0 },
-                         wp,
-                         bkey_start_pos(k.k),
-                         NULL, flags);
+       bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, wp,
+                         bkey_start_pos(k.k), NULL, flags);
 
        if (m->move)
                m->op.alloc_reserve = RESERVE_MOVINGGC;
@@ -194,7 +191,7 @@ static void moving_io_destructor(struct closure *cl)
        atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
        wake_up(&ctxt->wait);
 
-       bio_for_each_segment_all(bv, &io->write.wbio.bio, i)
+       bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
                if (bv->bv_page)
                        __free_page(bv->bv_page);
 
@@ -307,9 +304,7 @@ int bch2_data_move(struct bch_fs *c,
                return -ENOMEM;
        }
 
-       migrate_bio_init(io, &io->write.wbio.bio, k.k->size);
-       bio_get(&io->write.wbio.bio);
-       io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+       migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
 
        bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
 
index 548f0f0a8db1eb79c02250a3b54c2357d81f8be6..094eac8b9542cf87986fa25369ccf9015b839092 100644 (file)
@@ -19,7 +19,6 @@ struct migrate_write {
        bool                    move;
        struct bch_extent_ptr   move_ptr;
        struct bch_write_op     op;
-       struct bch_write_bio    wbio;
 };
 
 void bch2_migrate_write_init(struct bch_fs *,
index 1eae0fcb97cb3cc81f522aeaf7dc996765c3ef93..0ddfad314a70fe32d5483ec7f6a55b7735bae0e6 100644 (file)
@@ -783,6 +783,12 @@ out:
 
 /* replica information: */
 
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+       return (void *) r->entries + r->entry_size * i;
+}
+
 static inline struct bch_replicas_entry *
 replicas_entry_next(struct bch_replicas_entry *i)
 {
@@ -794,6 +800,24 @@ replicas_entry_next(struct bch_replicas_entry *i)
             (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
             (_i) = replicas_entry_next(_i))
 
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+                                    unsigned dev)
+{
+       return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
+
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+                                   unsigned dev)
+{
+       e->devs[dev >> 3] |= 1 << (dev & 7);
+}
+
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+       return (r->entry_size -
+               offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
+
 static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
                                        unsigned *nr,
                                        unsigned *bytes,
@@ -879,6 +903,29 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
        return 0;
 }
 
+static void bkey_to_replicas(struct bkey_s_c_extent e,
+                            enum bch_data_types data_type,
+                            struct bch_replicas_cpu_entry *r,
+                            unsigned *max_dev)
+{
+       const struct bch_extent_ptr *ptr;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       memset(r, 0, sizeof(*r));
+       r->data_type = data_type;
+
+       *max_dev = 0;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached) {
+                       *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+                       replicas_set_dev(r, ptr->dev);
+               }
+}
+
 /*
  * for when gc of replica information is in progress:
  */
@@ -887,14 +934,11 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
                                   struct bkey_s_c_extent e,
                                   enum bch_data_types data_type)
 {
-       const struct bch_extent_ptr *ptr;
-       struct bch_replicas_cpu_entry *new_e;
+       struct bch_replicas_cpu_entry new_e;
        struct bch_replicas_cpu *new;
-       unsigned i, nr, entry_size, max_dev = 0;
+       unsigned i, nr, entry_size, max_dev;
 
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached)
-                       max_dev = max_t(unsigned, max_dev, ptr->dev);
+       bkey_to_replicas(e, data_type, &new_e, &max_dev);
 
        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);
@@ -914,12 +958,9 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
                       cpu_replicas_entry(gc_r, i),
                       gc_r->entry_size);
 
-       new_e = cpu_replicas_entry(new, nr - 1);
-       new_e->data_type = data_type;
-
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached)
-                       replicas_set_dev(new_e, ptr->dev);
+       memcpy(cpu_replicas_entry(new, nr - 1),
+              &new_e,
+              new->entry_size);
 
        eytzinger0_sort(new->entries,
                        new->nr,
@@ -931,8 +972,38 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
        return 0;
 }
 
-int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e,
-                                  enum bch_data_types data_type)
+static bool replicas_has_extent(struct bch_replicas_cpu *r,
+                               struct bkey_s_c_extent e,
+                               enum bch_data_types data_type)
+{
+       struct bch_replicas_cpu_entry search;
+       unsigned max_dev;
+
+       bkey_to_replicas(e, data_type, &search, &max_dev);
+
+       return max_dev < replicas_dev_slots(r) &&
+               eytzinger0_find(r->entries, r->nr,
+                               r->entry_size,
+                               memcmp, &search) < r->nr;
+}
+
+bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
+                         enum bch_data_types data_type)
+{
+       bool ret;
+
+       rcu_read_lock();
+       ret = replicas_has_extent(rcu_dereference(c->replicas),
+                                 e, data_type);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+                                         struct bkey_s_c_extent e,
+                                         enum bch_data_types data_type)
 {
        struct bch_replicas_cpu *gc_r;
        const struct bch_extent_ptr *ptr;
@@ -996,6 +1067,25 @@ err:
        return ret;
 }
 
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+                         enum bch_data_types data_type)
+{
+       struct bch_replicas_cpu *gc_r;
+       bool marked;
+
+       rcu_read_lock();
+       marked = replicas_has_extent(rcu_dereference(c->replicas),
+                                    e, data_type) &&
+               (!(gc_r = rcu_dereference(c->replicas_gc)) ||
+                replicas_has_extent(gc_r, e, data_type));
+       rcu_read_unlock();
+
+       if (marked)
+               return 0;
+
+       return bch2_check_mark_super_slowpath(c, e, data_type);
+}
+
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_dev *dev_to_offline)
 {
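
The replicas entries moved out-of-line here store their device sets as a plain bitmap: bit (dev & 7) of byte (dev >> 3) in e->devs[]. Because bkey_to_replicas() zeroes the whole entry first, equal device sets are equal byte-for-byte, which is what lets eytzinger0_find() use memcmp as its comparison. The bit addressing on its own:

	#include <stdbool.h>
	#include <stdint.h>

	/* Same encoding as replicas_set_dev()/replicas_test_dev(): */
	static void set_dev(uint8_t *devs, unsigned dev)
	{
		devs[dev >> 3] |= 1U << (dev & 7);
	}

	static bool test_dev(const uint8_t *devs, unsigned dev)
	{
		return (devs[dev >> 3] & (1U << (dev & 7))) != 0;
	}

	/* e.g. set_dev(devs, 10) sets bit 2 of devs[1] */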
index 879fddad940a2dd40b2b5cd86c4221b109e0878f..65dd9fbb5bb625fd01b51963fd9ac60fdb349560 100644 (file)
@@ -121,92 +121,10 @@ const char *bch2_read_super(struct bcache_superblock *,
                           struct bch_opts, const char *);
 void bch2_write_super(struct bch_fs *);
 
-static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
-                                    unsigned dev)
-{
-       return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
-}
-
-static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
-                                   unsigned dev)
-{
-       e->devs[dev >> 3] |= 1 << (dev & 7);
-}
-
-static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
-{
-       return (r->entry_size -
-               offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
-}
-
-static inline struct bch_replicas_cpu_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-       return (void *) r->entries + r->entry_size * i;
-}
-
-int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent,
-                                  enum bch_data_types);
-
-static inline bool replicas_has_extent(struct bch_replicas_cpu *r,
-                                      struct bkey_s_c_extent e,
-                                      enum bch_data_types data_type)
-{
-       const struct bch_extent_ptr *ptr;
-       struct bch_replicas_cpu_entry search = {
-               .data_type = data_type,
-       };
-       unsigned max_dev = 0;
-
-       BUG_ON(!data_type ||
-              data_type == BCH_DATA_SB ||
-              data_type >= BCH_DATA_NR);
-
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached) {
-                       max_dev = max_t(unsigned, max_dev, ptr->dev);
-                       replicas_set_dev(&search, ptr->dev);
-               }
-
-       return max_dev < replicas_dev_slots(r) &&
-               eytzinger0_find(r->entries, r->nr,
-                               r->entry_size,
-                               memcmp, &search) < r->nr;
-}
-
-static inline bool bch2_sb_has_replicas(struct bch_fs *c,
-                                       struct bkey_s_c_extent e,
-                                       enum bch_data_types data_type)
-{
-       bool ret;
-
-       rcu_read_lock();
-       ret = replicas_has_extent(rcu_dereference(c->replicas),
-                                 e, data_type);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static inline int bch2_check_mark_super(struct bch_fs *c,
-                                       struct bkey_s_c_extent e,
-                                       enum bch_data_types data_type)
-{
-       struct bch_replicas_cpu *gc_r;
-       bool marked;
-
-       rcu_read_lock();
-       marked = replicas_has_extent(rcu_dereference(c->replicas),
-                                    e, data_type) &&
-               (!(gc_r = rcu_dereference(c->replicas_gc)) ||
-                replicas_has_extent(gc_r, e, data_type));
-       rcu_read_unlock();
-
-       if (marked)
-               return 0;
-
-       return bch2_check_mark_super_slowpath(c, e, data_type);
-}
+bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
+                         enum bch_data_types);
+int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
+                         enum bch_data_types);
 
 struct replicas_status {
        struct {
index 692eb417dd478c2d9273f3d11744fc4cf8595ccd..c4cb0b2f62418d3dd93aa9f503536b56477da22b 100644 (file)
@@ -517,10 +517,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        mutex_init(&c->btree_interior_update_lock);
 
        mutex_init(&c->bio_bounce_pages_lock);
+       mutex_init(&c->zlib_workspace_lock);
+
        bio_list_init(&c->read_retry_list);
        spin_lock_init(&c->read_retry_lock);
        INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
-       mutex_init(&c->zlib_workspace_lock);
+
+       bio_list_init(&c->btree_write_error_list);
+       spin_lock_init(&c->btree_write_error_lock);
+       INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
 
        INIT_LIST_HEAD(&c->fsck_errors);
        mutex_init(&c->fsck_error_lock);
@@ -593,8 +598,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                                   PAGE_SECTORS, 0) ||
            !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
            lg_lock_init(&c->usage_lock) ||
-           mempool_init_page_pool(&c->btree_bounce_pool, 1,
-                                  ilog2(btree_pages(c))) ||
+           mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
            bdi_setup_and_register(&c->bdi, "bcachefs") ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
@@ -1345,11 +1349,13 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
        }
 }
 
-static bool bch2_fs_may_start(struct bch_fs *c, int flags)
+static bool bch2_fs_may_start(struct bch_fs *c)
 {
        struct replicas_status s;
        struct bch_sb_field_members *mi;
-       unsigned i;
+       unsigned i, flags = c->opts.degraded
+               ? BCH_FORCE_IF_DEGRADED
+               : 0;
 
        if (!c->opts.degraded) {
                mutex_lock(&c->sb_lock);
@@ -1773,7 +1779,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
        mutex_unlock(&c->sb_lock);
 
        err = "insufficient devices";
-       if (!bch2_fs_may_start(c, 0))
+       if (!bch2_fs_may_start(c))
                goto err;
 
        if (!c->opts.nostart) {
@@ -1844,7 +1850,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
        }
        mutex_unlock(&c->sb_lock);
 
-       if (!c->opts.nostart && bch2_fs_may_start(c, 0)) {
+       if (!c->opts.nostart && bch2_fs_may_start(c)) {
                err = __bch2_fs_start(c);
                if (err)
                        goto err;
index 906e7a6b9fedd022332f733f27ff49cef39a1dfc..9a958543aa3b1c49129460607670f8db0268f763 100644 (file)
@@ -577,3 +577,17 @@ void sort_cmp_size(void *base, size_t num, size_t size,
                }
        }
 }
+
+void mempool_free_vp(void *element, void *pool_data)
+{
+       size_t size = (size_t) pool_data;
+
+       vpfree(element, size);
+}
+
+void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+       size_t size = (size_t) pool_data;
+
+       return vpmalloc(size, gfp_mask);
+}
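
These two callbacks let a mempool hand out whole-buffer allocations through the generic alloc/free hooks, with the element size smuggled through pool_data as a cast integer (see mempool_init_vp_pool() in util.h below). Usage then looks like any other mempool; a sketch with an illustrative size (btree_bounce_pool uses btree_bytes(c)):

	static int bounce_pool_example(mempool_t *pool)
	{
		void *buf;

		/* one preallocated 1 MiB element: */
		if (mempool_init_vp_pool(pool, 1, 1 << 20))
			return -ENOMEM;

		buf = mempool_alloc(pool, GFP_NOIO);	/* -> mempool_alloc_vp() */
		/* ... bounce data through buf ... */
		mempool_free(buf, pool);		/* -> mempool_free_vp() */
		return 0;
	}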
index 68d9a861d98513a24922116e5414a124aaebaf54..a9a17d9ed385a268585d4c415b9ddddf914b2cb6 100644 (file)
@@ -79,23 +79,43 @@ do {                                                                        \
        (__builtin_types_compatible_p(typeof(_val), _type) ||           \
         __builtin_types_compatible_p(typeof(_val), const _type))
 
-static inline void kvpfree(void *p, size_t size)
+static inline void vpfree(void *p, size_t size)
 {
-       if (size < PAGE_SIZE)
-               kfree(p);
-       else if (is_vmalloc_addr(p))
+       if (is_vmalloc_addr(p))
                vfree(p);
        else
                free_pages((unsigned long) p, get_order(size));
+}
 
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+       return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+                                        get_order(size)) ?:
+               __vmalloc(size, gfp_mask, PAGE_KERNEL);
+}
+
+static inline void kvpfree(void *p, size_t size)
+{
+       if (size < PAGE_SIZE)
+               kfree(p);
+       else
+               vpfree(p, size);
 }
 
 static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
 {
-       return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
-               :  (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
-                                            get_order(size))
-               ?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
+       return size < PAGE_SIZE
+               ? kmalloc(size, gfp_mask)
+               : vpmalloc(size, gfp_mask);
+}
+
+void mempool_free_vp(void *element, void *pool_data);
+void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);
+
+static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
+{
+       return mempool_init(pool, min_nr, mempool_alloc_vp,
+                           mempool_free_vp, (void *) size);
 }
 
 #define HEAP(type)                                                     \
index 11480f3cefd452169a7bcca04f5b3f7ec5842c3f..898ccb19ec2ce470e64d4524d7456050c78ab20c 100644 (file)
@@ -1,5 +1,6 @@
 
 #include <string.h>
+#include <sys/mman.h>
 
 #include <linux/math64.h>
 #include <linux/printk.h>
@@ -163,6 +164,8 @@ static void sched_init(void)
 {
        struct task_struct *p = malloc(sizeof(*p));
 
+       mlockall(MCL_CURRENT|MCL_FUTURE);
+
        memset(p, 0, sizeof(*p));
 
        p->state        = TASK_RUNNING;