From 2ab2ab0f781ae750473763e8a042c900a982d399 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Nov 2018 20:11:29 -0400 Subject: [PATCH] Update bcachefs sources to b12d1535f3 bcachefs: fix bounds checks in bch2_bio_map() --- .bcachefs_revision | 2 +- cmd_migrate.c | 5 +- libbcachefs.c | 4 +- libbcachefs/alloc_background.c | 16 +- libbcachefs/bcachefs.h | 1 + libbcachefs/bcachefs_format.h | 30 +- libbcachefs/bkey_methods.h | 11 - libbcachefs/bset.c | 102 +++---- libbcachefs/btree_gc.c | 303 ++++++++++---------- libbcachefs/btree_gc.h | 2 - libbcachefs/btree_io.c | 31 +- libbcachefs/btree_io.h | 2 +- libbcachefs/btree_types.h | 5 - libbcachefs/btree_update_interior.c | 41 +-- libbcachefs/buckets.c | 152 ++++++---- libbcachefs/buckets.h | 5 +- libbcachefs/clock.c | 6 +- libbcachefs/debug.c | 6 +- libbcachefs/extents.c | 332 ++++++++++----------- libbcachefs/extents.h | 232 +++++++-------- libbcachefs/extents_types.h | 13 +- libbcachefs/fs-io.c | 8 +- libbcachefs/fs.c | 18 +- libbcachefs/io.c | 42 +-- libbcachefs/io.h | 4 +- libbcachefs/io_types.h | 2 +- libbcachefs/journal.h | 4 - libbcachefs/journal_io.c | 51 ++-- libbcachefs/journal_io.h | 2 + libbcachefs/migrate.c | 6 +- libbcachefs/move.c | 34 +-- libbcachefs/movinggc.c | 4 +- libbcachefs/rebalance.c | 31 +- libbcachefs/replicas.c | 429 ++++++++++++++-------------- libbcachefs/replicas.h | 16 +- libbcachefs/replicas_types.h | 11 + libbcachefs/super_types.h | 12 - libbcachefs/sysfs.c | 12 +- libbcachefs/util.c | 4 +- libbcachefs/util.h | 67 +++-- 40 files changed, 1032 insertions(+), 1026 deletions(-) create mode 100644 libbcachefs/replicas_types.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 1408574..4c8c8d1 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -d7f6da1d60ec24266301231538ff6f09716537ed +b12d1535f33661c5f11925d9a2debe28be661088 diff --git a/cmd_migrate.c b/cmd_migrate.c index 352f740..9523dbd 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -250,7 +250,6 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, } static char buf[1 << 20] __aligned(PAGE_SIZE); -static const size_t buf_pages = sizeof(buf) / PAGE_SIZE; static void write_data(struct bch_fs *c, struct bch_inode_unpacked *dst_inode, @@ -258,7 +257,7 @@ static void write_data(struct bch_fs *c, { struct { struct bch_write_op op; - struct bio_vec bv[buf_pages]; + struct bio_vec bv[sizeof(buf) / PAGE_SIZE]; } o; struct closure cl; @@ -267,7 +266,7 @@ static void write_data(struct bch_fs *c, closure_init_stack(&cl); - bio_init(&o.op.wbio.bio, o.bv, buf_pages); + bio_init(&o.op.wbio.bio, o.bv, ARRAY_SIZE(o.bv)); o.op.wbio.bio.bi_iter.bi_size = len; bch2_bio_map(&o.op.wbio.bio, buf); diff --git a/libbcachefs.c b/libbcachefs.c index da31861..3ce69d1 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -346,7 +346,7 @@ static unsigned get_dev_has_data(struct bch_sb *sb, unsigned dev) if (replicas) for_each_replicas_entry(replicas, r) - for (i = 0; i < r->nr; i++) + for (i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; @@ -502,7 +502,7 @@ static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f, printf_pad(32, " %s:", bch2_data_types[e->data_type]); putchar('['); - for (i = 0; i < e->nr; i++) { + for (i = 0; i < e->nr_devs; i++) { if (i) putchar(' '); printf("%u", e->devs[i]); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7ba20c8..c3efb43 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -582,7 
+582,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) e.nr++; } else { if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + heap_add_or_replace(&ca->alloc_heap, e, + -bucket_alloc_cmp, NULL); e = (struct alloc_heap_entry) { .bucket = b, @@ -595,14 +596,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + heap_add_or_replace(&ca->alloc_heap, e, + -bucket_alloc_cmp, NULL); for (i = 0; i < ca->alloc_heap.used; i++) nr += ca->alloc_heap.data[i].nr; while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); } up_read(&ca->bucket_lock); @@ -632,7 +634,7 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) if (bch2_can_invalidate_bucket(ca, b, m)) { struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); if (heap_full(&ca->alloc_heap)) break; } @@ -659,7 +661,7 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca if (bch2_can_invalidate_bucket(ca, b, m)) { struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); if (heap_full(&ca->alloc_heap)) break; } @@ -697,7 +699,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) break; } - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); for (i = 0; i < ca->alloc_heap.used; i++) nr += ca->alloc_heap.data[i].nr; @@ -718,7 +720,7 @@ static inline long next_alloc_bucket(struct bch_dev *ca) return b; } - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); } return -1; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 92727cc..6d5c7d6 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -312,6 +312,7 @@ enum bch_time_stats { #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" +#include "replicas_types.h" #include "super_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index f1814f4..cdf392b 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -456,15 +456,19 @@ enum bch_compression_type { BCH_COMPRESSION_NR = 5, }; +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) +#define BCH_EXTENT_ENTRY_MAX 4 + enum bch_extent_entry_type { - BCH_EXTENT_ENTRY_ptr = 0, - BCH_EXTENT_ENTRY_crc32 = 1, - BCH_EXTENT_ENTRY_crc64 = 2, - BCH_EXTENT_ENTRY_crc128 = 3, +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x }; -#define BCH_EXTENT_ENTRY_MAX 4 - /* Compressed/uncompressed size are stored biased by 1: */ struct bch_extent_crc32 { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -589,10 +593,10 @@ union bch_extent_entry { #else #error edit for your odd byteorder. 
#endif - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; - struct bch_extent_ptr ptr; + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x }; enum { @@ -1007,9 +1011,9 @@ enum bch_data_type { }; struct bch_replicas_entry { - u8 data_type; - u8 nr; - u8 devs[0]; + __u8 data_type; + __u8 nr_devs; + __u8 devs[0]; }; struct bch_sb_field_replicas { diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index c708f8c..cf7a556 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -18,17 +18,6 @@ static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) return level ? BKEY_TYPE_BTREE : (enum bkey_type) id; } -static inline bool btree_type_has_ptrs(enum bkey_type type) -{ - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - return true; - default: - return false; - } -} - struct bch_fs; struct btree; struct bkey; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index c8e16de..c631e30 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -1689,7 +1689,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; - unsigned end; + unsigned end = 0; bch2_btree_node_iter_verify(iter, b); @@ -1791,7 +1791,7 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, struct bkey_packed *l, *r, *p; struct bkey uk, up; char buf1[200], buf2[200]; - unsigned j; + unsigned j, inorder; if (!size) return 0; @@ -1799,53 +1799,57 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, if (!bset_has_ro_aux_tree(t)) goto out; - j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra); - if (j && - j < t->size && - k == tree_to_bkey(b, t, j)) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? 
bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - return scnprintf(buf, size, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - } + inorder = bkey_to_cacheline(b, t, k); + if (!inorder || inorder >= t->size) + goto out; + + j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + if (k != tree_to_bkey(b, t, j)) + goto out; + + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed unpacked at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + case BFLOAT_FAILED_PREV: + p = tree_to_prev_bkey(b, t, j); + l = is_power_of_2(j) + ? btree_bkey_first(b, t) + : tree_to_prev_bkey(b, t, j >> ffs(j)); + r = is_power_of_2(j + 1) + ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + up = bkey_unpack_key(b, p); + uk = bkey_unpack_key(b, k); + bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); + bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); + + return scnprintf(buf, size, + " failed prev at depth %u\n" + "\tkey starts at bit %u but first differing bit at %u\n" + "\t%llu:%llu\n" + "\t%llu:%llu\n" + "\t%s\n" + "\t%s\n", + ilog2(j), + bch2_bkey_greatest_differing_bit(b, l, r), + bch2_bkey_greatest_differing_bit(b, p, k), + uk.p.inode, uk.p.offset, + up.p.inode, up.p.offset, + buf1, buf2); + case BFLOAT_FAILED_OVERFLOW: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed overflow at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + } out: *buf = '\0'; return 0; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index b0f9bd7..b3c69da 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -17,6 +17,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "journal_io.h" #include "keylist.h" #include "move.h" #include "replicas.h" @@ -31,6 +32,21 @@ #include #include +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +/* range_checks - for validating min/max pos of each btree node: */ + struct range_checks { struct range_level { struct bpos min; @@ -90,6 +106,19 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, } } +/* marking of btree keys/nodes: */ + +static bool bkey_type_needs_gc(enum bkey_type type) +{ + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + return true; + default: + return false; + } +} + u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) { const struct bch_extent_ptr *ptr; @@ -112,39 +141,8 @@ u8 
bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) return max_stale; } -/* - * For runtime mark and sweep: - */ -static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, unsigned flags) -{ - struct gc_pos pos = { 0 }; - u8 ret = 0; - - switch (type) { - case BKEY_TYPE_BTREE: - bch2_mark_key(c, k, c->opts.btree_node_size, - BCH_DATA_BTREE, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - break; - case BKEY_TYPE_EXTENTS: - bch2_mark_key(c, k, k.k->size, BCH_DATA_USER, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - ret = bch2_btree_key_recalc_oldest_gen(c, k); - break; - default: - BUG(); - } - - return ret; -} - -int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +static int bch2_btree_mark_ptrs_initial(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) { enum bch_data_type data_type = type == BKEY_TYPE_BTREE ? BCH_DATA_BTREE : BCH_DATA_USER; @@ -154,10 +152,10 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, k.k->version.lo > journal_cur_seq(&c->journal)); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, type, k), c, "superblock not marked as containing replicas (type %u)", data_type)) { - ret = bch2_mark_bkey_replicas(c, data_type, k); + ret = bch2_mark_bkey_replicas(c, type, k); if (ret) return ret; } @@ -198,52 +196,87 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, } } - atomic64_set(&c->key_version, - max_t(u64, k.k->version.lo, - atomic64_read(&c->key_version))); - - bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); + if (k.k->version.lo > atomic64_read(&c->key_version)) + atomic64_set(&c->key_version, k.k->version.lo); fsck_err: return ret; } -static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) +/* + * For runtime mark and sweep: + */ +static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k, bool initial) +{ + struct gc_pos pos = { 0 }; + unsigned flags = + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD| + (initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0); + int ret = 0; + + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + if (initial) { + ret = bch2_btree_mark_ptrs_initial(c, type, k); + if (ret < 0) + return ret; + } + break; + default: + break; + } + + bch2_mark_key(c, type, k, true, k.k->size, + pos, NULL, 0, flags); + + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + ret = bch2_btree_key_recalc_oldest_gen(c, k); + break; + default: + break; + } + + return ret; +} + +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, + bool initial) { enum bkey_type type = btree_node_type(b); struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; u8 stale = 0; + int ret; - if (btree_node_has_ptrs(b)) - for_each_btree_node_key_unpack(b, k, &iter, - &unpacked) { - bch2_bkey_debugcheck(c, b, k); - stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); - } + if (!bkey_type_needs_gc(type)) + return 0; - return stale; -} + for_each_btree_node_key_unpack(b, k, &iter, + &unpacked) { + bch2_bkey_debugcheck(c, b, k); -static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - write_seqcount_begin(&c->gc_pos_lock); - c->gc_pos = new_pos; - write_seqcount_end(&c->gc_pos_lock); -} + ret = bch2_gc_mark_key(c, type, k, initial); + if (ret < 0) + return ret; -static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); - __gc_pos_set(c, new_pos); + stale = max_t(u8, stale, ret); + } + + return stale; } -static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + bool initial) { struct btree_iter iter; struct btree *b; struct range_checks r; - unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1; + unsigned depth = bkey_type_needs_gc(btree_id) ? 
0 : 1; unsigned max_stale; int ret = 0; @@ -254,8 +287,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) /* * if expensive_debug_checks is on, run range_checks on all leaf nodes: + * + * and on startup, we have to read every btree node (XXX: only if it was + * an unclean shutdown) */ - if (expensive_debug_checks(c)) + if (initial || expensive_debug_checks(c)) depth = 0; btree_node_range_checks_init(&r, depth); @@ -266,22 +302,24 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) bch2_verify_btree_nr_keys(b); - max_stale = btree_gc_mark_node(c, b); + max_stale = btree_gc_mark_node(c, b, initial); gc_pos_set(c, gc_pos_btree_node(b)); - if (max_stale > 64) - bch2_btree_node_rewrite(c, &iter, - b->data->keys.seq, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) - bch2_btree_node_rewrite(c, &iter, - b->data->keys.seq, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); + if (!initial) { + if (max_stale > 64) + bch2_btree_node_rewrite(c, &iter, + b->data->keys.seq, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!btree_gc_rewrite_disabled(c) && + (btree_gc_always_rewrite(c) || max_stale > 16)) + bch2_btree_node_rewrite(c, &iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + } bch2_btree_iter_cond_resched(&iter); } @@ -293,13 +331,47 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key), initial); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); return 0; } +static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, + bool initial) +{ + unsigned i; + + for (i = 0; i < BTREE_ID_NR; i++) { + enum bkey_type type = bkey_type(0, i); + + int ret = bch2_gc_btree(c, i, initial); + if (ret) + return ret; + + if (journal && bkey_type_needs_gc(type)) { + struct bkey_i *k, *n; + struct jset_entry *j; + struct journal_replay *r; + int ret; + + list_for_each_entry(r, journal, list) + for_each_jset_key(k, n, j, &r->j) { + if (type == bkey_type(j->level, j->btree_id)) { + ret = bch2_gc_mark_key(c, type, + bkey_i_to_s_c(k), initial); + if (ret < 0) + return ret; + } + } + } + } + + return 0; +} + static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, u64 start, u64 end, enum bch_data_type type, @@ -395,10 +467,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, - BCH_DATA_BTREE, pos, - &stats, 0, + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&d->key), + true, 0, + pos, &stats, 0, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); /* @@ -522,6 +594,7 @@ void bch2_gc(struct bch_fs *c) struct bch_dev *ca; u64 start_time = local_clock(); unsigned i; + int ret; /* * Walk _all_ references to buckets, and recompute them: @@ -557,14 +630,11 @@ void bch2_gc(struct bch_fs *c) bch2_mark_superblocks(c); - /* Walk btree: */ - for (i = 0; i < BTREE_ID_NR; i++) { - int ret = bch2_gc_btree(c, i); - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); - goto out; - } + ret = bch2_gc_btrees(c, NULL, false); + if 
(ret) { + bch_err(c, "btree gc failed: %d", ret); + set_bit(BCH_FS_GC_FAILURE, &c->flags); + goto out; } bch2_mark_pending_btree_node_frees(c); @@ -1006,58 +1076,9 @@ int bch2_gc_thread_start(struct bch_fs *c) /* Initial GC computes bucket marks during startup */ -static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) -{ - struct btree_iter iter; - struct btree *b; - struct range_checks r; - int ret = 0; - - btree_node_range_checks_init(&r, 0); - - gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0)); - - if (!c->btree_roots[id].b) - return 0; - - b = c->btree_roots[id].b; - if (!btree_node_fake(b)) - ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); - if (ret) - return ret; - - /* - * We have to hit every btree node before starting journal replay, in - * order for the journal seq blacklist machinery to work: - */ - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - btree_node_range_checks(c, b, &r); - - if (btree_node_has_ptrs(b)) { - struct btree_node_iter node_iter; - struct bkey unpacked; - struct bkey_s_c k; - - for_each_btree_node_key_unpack(b, k, &node_iter, - &unpacked) { - ret = bch2_btree_mark_key_initial(c, - btree_node_type(b), k); - if (ret) - goto err; - } - } - - bch2_btree_iter_cond_resched(&iter); - } -err: - return bch2_btree_iter_unlock(&iter) ?: ret; -} - int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { unsigned iter = 0; - enum btree_id id; int ret = 0; down_write(&c->gc_lock); @@ -1066,13 +1087,7 @@ again: bch2_mark_superblocks(c); - for (id = 0; id < BTREE_ID_NR; id++) { - ret = bch2_initial_gc_btree(c, id); - if (ret) - goto err; - } - - ret = bch2_journal_mark(c, journal); + ret = bch2_gc_btrees(c, journal, true); if (ret) goto err; diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 214a3fe..f9225af 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -11,8 +11,6 @@ void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); int bch2_initial_gc(struct bch_fs *, struct list_head *); u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); -int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, - struct bkey_s_c); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index a4f184f..beab463 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -35,7 +35,7 @@ void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, __btree_node_key_to_offset(b, end) }); - __heap_add(iter, n, btree_node_iter_cmp_heap); + __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); } } @@ -48,9 +48,9 @@ void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, EBUG_ON(iter->data->k > iter->data->end); if (iter->data->k == iter->data->end) - heap_del(iter, 0, btree_node_iter_cmp_heap); + heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); else - heap_sift_down(iter, 0, btree_node_iter_cmp_heap); + heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); } static void verify_no_dups(struct btree *b, @@ -1345,11 +1345,9 @@ static void btree_node_read_work(struct work_struct *work) struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; bool can_retry; - memset(&avoid, 0, sizeof(avoid)); - goto start; while (1) { bch_info(c, "retrying read"); @@ -1372,8 +1370,9 @@ 
start: percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - __set_bit(rb->pick.ptr.dev, avoid.d); - can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; + bch2_mark_io_failure(&failed, &rb->pick); + + can_retry = bch2_btree_pick_ptr(c, b, &failed, &rb->pick) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, b, can_retry)) @@ -1408,7 +1407,7 @@ static void btree_node_read_endio(struct bio *bio) void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bool sync) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; @@ -1425,7 +1424,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); + bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, + btree_bytes(c)), + &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); @@ -1568,9 +1569,9 @@ retry: new_key = bkey_i_to_extent(&tmp.k); e = extent_i_to_s(new_key); - extent_for_each_ptr_backwards(e, ptr) - if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)) - bch2_extent_drop_ptr(e, ptr); + + bch2_extent_drop_ptrs(e, ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_extent_nr_ptrs(e.c)) goto err; @@ -1880,7 +1881,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, trace_btree_write(b, bytes_to_write, sectors_to_write); - wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio), + wbio = container_of(bio_alloc_bioset(GFP_NOIO, + buf_pages(data, sectors_to_write << 9), + &c->btree_bio), struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); wbio->data = data; diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index ccd4732..48833a9 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -14,7 +14,7 @@ struct btree_read_bio { struct bch_fs *c; u64 start_time; unsigned have_ioref:1; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; }; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index e20dd7a..4434915 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -414,11 +414,6 @@ static inline const struct bkey_ops *btree_node_ops(struct btree *b) return &bch2_bkey_ops[btree_node_type(b)]; } -static inline bool btree_node_has_ptrs(struct btree *b) -{ - return btree_type_has_ptrs(btree_node_type(b)); -} - static inline bool btree_node_is_extents(struct btree *b) { return btree_node_type(b) == BKEY_TYPE_EXTENTS; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 6d3fab8..0a9d691 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -210,11 +210,12 @@ found: if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { struct bch_fs_usage tmp = { 0 }; - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - -c->opts.btree_node_size, BCH_DATA_BTREE, b - ? gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id), - &tmp, 0, 0); + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&d->key), + false, 0, b + ? 
gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id), + &tmp, 0, 0); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -289,10 +290,11 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - -c->opts.btree_node_size, BCH_DATA_BTREE, - gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0, 0); + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&pending->key), + false, 0, + gc_phase(GC_PHASE_PENDING_DELETE), + &stats, 0, 0); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -550,7 +552,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); if (ret) goto err_free; @@ -1091,8 +1093,9 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); - bch2_mark_key(c, bkey_i_to_s_c(&b->key), - c->opts.btree_node_size, BCH_DATA_BTREE, + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key), + true, 0, gc_pos_btree_root(b->btree_id), &stats, 0, 0); @@ -1179,9 +1182,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); if (bkey_extent_is_data(&insert->k)) - bch2_mark_key(c, bkey_i_to_s_c(insert), - c->opts.btree_node_size, BCH_DATA_BTREE, - gc_pos_btree_node(b), &stats, 0, 0); + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(insert), + true, 0, + gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1966,8 +1970,9 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); - bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), - c->opts.btree_node_size, BCH_DATA_BTREE, + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&new_key->k_i), + true, 0, gc_pos_btree_root(b->btree_id), &stats, 0, 0); bch2_btree_node_free_index(as, NULL, @@ -2062,7 +2067,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, extent_i_to_s_c(new_key).s_c); if (ret) goto err_free_update; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 052e8af..271c02f 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -533,27 +533,12 @@ static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) crc.uncompressed_size)); } -/* - * Checking against gc's position has to be done here, inside the cmpxchg() - * loop, to avoid racing with the start of gc clearing all the marks - GC does - * that with the gc pos seqlock held. 
- */ -static void bch2_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, - s64 sectors, enum bch_data_type data_type, - unsigned replicas, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) +static s64 ptr_disk_sectors(struct bkey_s_c_extent e, + struct extent_ptr_decoded p, + s64 sectors) { - struct bucket_mark old, new; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); - s64 uncompressed_sectors = sectors; - u64 v; - if (crc.compression_type) { + if (p.crc.compression_type) { unsigned old_sectors, new_sectors; if (sectors > 0) { @@ -564,23 +549,29 @@ static void bch2_mark_pointer(struct bch_fs *c, new_sectors = e.k->size + sectors; } - sectors = -__disk_sectors(crc, old_sectors) - +__disk_sectors(crc, new_sectors); + sectors = -__disk_sectors(p.crc, old_sectors) + +__disk_sectors(p.crc, new_sectors); } - /* - * fs level usage (which determines free space) is in uncompressed - * sectors, until copygc + compression is sorted out: - * - * note also that we always update @fs_usage, even when we otherwise - * wouldn't do anything because gc is running - this is because the - * caller still needs to account w.r.t. its disk reservation. It is - * caller's responsibility to not apply @fs_usage if gc is in progress. - */ - fs_usage->replicas - [!ptr->cached && replicas ? replicas - 1 : 0].data - [!ptr->cached ? data_type : BCH_DATA_CACHED] += - uncompressed_sectors; + return sectors; +} + +/* + * Checking against gc's position has to be done here, inside the cmpxchg() + * loop, to avoid racing with the start of gc clearing all the marks - GC does + * that with the gc pos seqlock held. + */ +static void bch2_mark_pointer(struct bch_fs *c, + struct bkey_s_c_extent e, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr); + u64 v; if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) @@ -601,14 +592,14 @@ static void bch2_mark_pointer(struct bch_fs *c, * the allocator invalidating a bucket after we've already * checked the gen */ - if (gen_after(new.gen, ptr->gen)) { + if (gen_after(new.gen, p.ptr.gen)) { BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); - EBUG_ON(!ptr->cached && + EBUG_ON(!p.ptr.cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); return; } - if (!ptr->cached) + if (!p.ptr.cached) checked_add(new.dirty_sectors, sectors); else checked_add(new.cached_sectors, sectors); @@ -639,16 +630,64 @@ static void bch2_mark_pointer(struct bch_fs *c, bucket_became_unavailable(c, old, new)); } -void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, - struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { unsigned replicas = bch2_extent_nr_dirty_ptrs(k); BUG_ON(replicas && replicas - 1 > ARRAY_SIZE(stats->replicas)); + BUG_ON(!sectors); + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + 
extent_for_each_ptr_decode(e, p, entry) { + s64 disk_sectors = ptr_disk_sectors(e, p, sectors); + + /* + * fs level usage (which determines free space) is in + * uncompressed sectors, until copygc + compression is + * sorted out: + * + * note also that we always update @fs_usage, even when + * we otherwise wouldn't do anything because gc is + * running - this is because the caller still needs to + * account w.r.t. its disk reservation. It is caller's + * responsibility to not apply @fs_usage if gc is in + * progress. + */ + stats->replicas + [!p.ptr.cached && replicas ? replicas - 1 : 0].data + [!p.ptr.cached ? data_type : BCH_DATA_CACHED] += + sectors; + + bch2_mark_pointer(c, e, p, disk_sectors, data_type, + stats, journal_seq, flags); + } + break; + } + case BCH_RESERVATION: + if (replicas) + stats->replicas[replicas - 1].persistent_reserved += + sectors * replicas; + break; + } +} +void bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ /* * synchronization w.r.t. GC: * @@ -685,24 +724,19 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, if (!stats) stats = this_cpu_ptr(c->usage_percpu); - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - - BUG_ON(!sectors); - - extent_for_each_ptr_crc(e, ptr, crc) - bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, - replicas, stats, journal_seq, flags); + switch (type) { + case BKEY_TYPE_BTREE: + bch2_mark_extent(c, k, inserting + ? c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, + pos, stats, journal_seq, flags); break; - } - case BCH_RESERVATION: - if (replicas) - stats->replicas[replicas - 1].persistent_reserved += - sectors * replicas; + case BKEY_TYPE_EXTENTS: + bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + pos, stats, journal_seq, flags); + break; + default: break; } percpu_up_read_preempt_enable(&c->usage_lock); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index ff86d23..d9fe938 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -203,8 +203,9 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) #define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, enum bch_data_type, - struct gc_pos, struct bch_fs_usage *, u64, unsigned); +void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, + bool, s64, struct gc_pos, + struct bch_fs_usage *, u64, unsigned); void bch2_recalc_sectors_available(struct bch_fs *); diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index c67376f..90b10ce 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -21,7 +21,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) if (clock->timers.data[i] == timer) goto out; - BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); out: spin_unlock(&clock->timer_lock); } @@ -34,7 +34,7 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) { - heap_del(&clock->timers, i, io_timer_cmp); + heap_del(&clock->timers, i, io_timer_cmp, NULL); break; } @@ -127,7 +127,7 @@ static struct io_timer *get_expired_timer(struct io_clock 
*clock, if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) - heap_pop(&clock->timers, ret, io_timer_cmp); + heap_pop(&clock->timers, ret, io_timer_cmp, NULL); spin_unlock(&clock->timer_lock); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 71f649b..f69d76e 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -35,7 +35,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct btree *v = c->verify_data; struct btree_node *n_ondisk, *n_sorted, *n_inmemory; struct bset *sorted, *inmemory; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_dev *ca; struct bio *bio; @@ -62,7 +62,9 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) if (!bch2_dev_get_ioref(ca, READ)) return; - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); + bio = bio_alloc_bioset(GFP_NOIO, + buf_pages(n_sorted, btree_bytes(c)), + &c->btree_bio); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index a4d7e52..6eaa89c 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -88,7 +88,7 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, memset(&nr, 0, sizeof(nr)); - heap_resort(iter, key_sort_cmp); + heap_resort(iter, key_sort_cmp, NULL); while (!bch2_btree_node_iter_large_end(iter)) { if (!should_drop_next_key(iter, b)) { @@ -101,7 +101,7 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, } sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp); + heap_sift_down(iter, 0, key_sort_cmp, NULL); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -122,20 +122,11 @@ bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) return NULL; } -bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +void bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) { struct bch_extent_ptr *ptr; - bool dropped = false; - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } - - if (dropped) - bch2_extent_drop_redundant_crcs(e); - return dropped; + bch2_extent_drop_ptrs(e, ptr, ptr->dev == dev); } const struct bch_extent_ptr * @@ -231,21 +222,21 @@ unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) unsigned bch2_extent_is_compressed(struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; unsigned ret = 0; switch (k.k->type) { case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc.compression_type != BCH_COMPRESSION_NONE && - crc.compressed_size < crc.live_size) - ret = max_t(unsigned, ret, crc.compressed_size); + extent_for_each_ptr_decode(e, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE && + p.crc.compressed_size < p.crc.live_size) + ret = max_t(unsigned, ret, p.crc.compressed_size); + } } return ret; @@ -254,34 +245,50 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, struct bch_extent_ptr m, u64 offset) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; 
+ struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) - if (ptr->dev == m.dev && - ptr->gen == m.gen && - (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + extent_for_each_ptr_decode(e, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == (s64) m.offset - offset) - return ptr; + return true; - return NULL; + return false; } -/* Doesn't cleanup redundant crcs */ -void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, + struct bch_extent_ptr *ptr) { + union bch_extent_entry *dst; + union bch_extent_entry *src; + EBUG_ON(ptr < &e.v->start->ptr || ptr >= &extent_entry_last(e)->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); -} -void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) -{ - __bch2_extent_drop_ptr(e, ptr); - bch2_extent_drop_redundant_crcs(e); + src = to_entry(ptr + 1); + + if (src != extent_entry_last(e) && + extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) { + dst = to_entry(ptr); + } else { + extent_for_each_entry(e, dst) { + if (dst == to_entry(ptr)) + break; + + if (extent_entry_next(dst) == to_entry(ptr) && + extent_entry_is_crc(dst)) + break; + } + } + + memmove_u64s_down(dst, src, + (u64 *) extent_entry_last(e) - (u64 *) src); + e.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -323,38 +330,38 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, struct bch_extent_crc_unpacked n) { struct bch_extent_crc_unpacked u; - struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; union bch_extent_entry *i; + bool ret = false; /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) + if (!n.csum_type) { extent_for_each_crc(extent_i_to_s(e), u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { n = u; - break; + goto found; } - - if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) return false; - + } +found: BUG_ON(n.compression_type); BUG_ON(n.offset); BUG_ON(n.live_size != e->k.size); - bch2_extent_crc_append(e, n); restart_narrow_pointers: - extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) - if (can_narrow_crc(u, n)) { - ptr->offset += u.offset; - extent_ptr_append(e, *ptr); - __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + if (can_narrow_crc(p.crc, n)) { + bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(e, &p); + ret = true; goto restart_narrow_pointers; } - bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); - return true; + return ret; } /* returns true if not equal */ @@ -371,87 +378,13 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, bch2_crc_cmp(l.csum, r.csum)); } -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) -{ - union bch_extent_entry *entry = e.v->start; - union bch_extent_crc *crc, *prev = NULL; - struct bch_extent_crc_unpacked u, prev_u = { 0 }; - - while (entry != extent_entry_last(e)) { - union bch_extent_entry *next = extent_entry_next(entry); - size_t crc_u64s = extent_entry_u64s(entry); - - if (!extent_entry_is_crc(entry)) - goto next; - - crc = entry_to_crc(entry); - u = bch2_extent_crc_unpack(e.k, 
crc); - - if (next == extent_entry_last(e)) { - /* crc entry with no pointers after it: */ - goto drop; - } - - if (extent_entry_is_crc(next)) { - /* no pointers before next crc entry: */ - goto drop; - } - - if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) { - /* identical to previous crc entry: */ - goto drop; - } - - if (!prev && - !u.csum_type && - !u.compression_type) { - /* null crc entry: */ - union bch_extent_entry *e2; - - extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { - if (!extent_entry_is_ptr(e2)) - break; - - e2->ptr.offset += u.offset; - } - goto drop; - } - - prev = crc; - prev_u = u; -next: - entry = next; - continue; -drop: - memmove_u64s_down(crc, next, - (u64 *) extent_entry_last(e) - (u64 *) next); - e.k->u64s -= crc_u64s; - } - - EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); -} - -static bool should_drop_ptr(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) -{ - return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr); -} - static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) { - struct bch_extent_ptr *ptr = &e.v->start->ptr; - bool dropped = false; - - while ((ptr = extent_ptr_next(e, ptr))) - if (should_drop_ptr(c, e.c, ptr)) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } else - ptr++; + struct bch_extent_ptr *ptr; - if (dropped) - bch2_extent_drop_redundant_crcs(e); + bch2_extent_drop_ptrs(e, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); } bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) @@ -475,6 +408,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); entry = extent_entry_next(entry)) { switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; case BCH_EXTENT_ENTRY_crc32: entry->crc32.csum = swab32(entry->crc32.csum); break; @@ -488,8 +423,6 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry->crc128.csum.lo = (__force __le64) swab64((__force u64) entry->crc128.csum.lo); break; - case BCH_EXTENT_ENTRY_ptr: - break; } } break; @@ -586,12 +519,45 @@ out: return out - buf; } -static inline bool dev_latency_better(struct bch_fs *c, - const struct bch_extent_ptr *ptr1, - const struct bch_extent_ptr *ptr2) +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) +{ + struct bch_dev_io_failures *i; + + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; + + return NULL; +} + +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) +{ + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } +} + +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); u64 l2 = atomic64_read(&dev2->cur_latency[READ]); @@ -602,31 +568,29 @@ static inline bool dev_latency_better(struct 
bch_fs *c, static int extent_pick_read_device(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; struct bch_dev *ca; int ret = 0; - extent_for_each_ptr_crc(e, ptr, crc) { - ca = bch_dev_bkey_exists(c, ptr->dev); + extent_for_each_ptr_decode(e, p, entry) { + ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ptr->cached && ptr_stale(ca, ptr)) + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) continue; - if (avoid && test_bit(ptr->dev, avoid->d)) + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f && f->nr_failed >= f->nr_retries) continue; - if (ret && !dev_latency_better(c, ptr, &pick->ptr)) + if (ret && !ptr_better(c, p, *pick)) continue; - *pick = (struct extent_pick_ptr) { - .ptr = *ptr, - .crc = crc, - }; - + *pick = p; ret = 1; } @@ -715,7 +679,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, goto err; } - if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) { + if (!bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, @@ -752,11 +716,11 @@ int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, } int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) { return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, pick); + failed, pick); } /* Extents */ @@ -908,7 +872,7 @@ static bool extent_i_save(struct btree *b, struct bkey_packed *dst, static inline void extent_sort_sift(struct btree_node_iter_large *iter, struct btree *b, size_t i) { - heap_sift_down(iter, i, extent_sort_cmp); + heap_sift_down(iter, i, extent_sort_cmp, NULL); } static inline void extent_sort_next(struct btree_node_iter_large *iter, @@ -916,7 +880,7 @@ static inline void extent_sort_next(struct btree_node_iter_large *iter, struct btree_node_iter_set *i) { sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp); + heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); } static void extent_sort_append(struct bch_fs *c, @@ -964,7 +928,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, memset(&nr, 0, sizeof(nr)); - heap_resort(iter, extent_sort_cmp); + heap_resort(iter, extent_sort_cmp, NULL); while (!bch2_btree_node_iter_large_end(iter)) { lk = __btree_node_offset_to_key(b, _l->k); @@ -1076,8 +1040,9 @@ static void bch2_add_sectors(struct extent_insert_state *s, if (!sectors) return; - bch2_mark_key(c, k, sectors, BCH_DATA_USER, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq, 0); + bch2_mark_key(c, BKEY_TYPE_EXTENTS, k, sectors > 0, sectors, + gc_pos_btree_node(b), &s->stats, + s->trans->journal_res.seq, 0); } static void bch2_subtract_sectors(struct extent_insert_state *s, @@ -1748,8 +1713,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; } - if (!bkey_extent_is_cached(e.k) && - !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) { + if (!bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, @@ -1853,25 +1817,25 @@ static void 
bch2_extent_crc_init(union bch_extent_crc *crc, void bch2_extent_crc_append(struct bkey_i_extent *e, struct bch_extent_crc_unpacked new) { - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - BUG_ON(new.compressed_size > new.uncompressed_size); - BUG_ON(new.live_size != e->k.size); - BUG_ON(!new.compressed_size || !new.uncompressed_size); + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); + __extent_entry_push(e); +} - /* - * Look up the last crc entry, so we can check if we need to add - * another: - */ - extent_for_each_crc(extent_i_to_s(e), crc, i) - ; +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, + struct extent_ptr_decoded *p) +{ + struct bch_extent_crc_unpacked crc; + union bch_extent_entry *pos; - if (!bch2_crc_unpacked_cmp(crc, new)) - return; + extent_for_each_crc(extent_i_to_s(e), crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) + goto found; - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + bch2_extent_crc_append(e, p->crc); + pos = extent_entry_last(extent_i_to_s(e)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ptr)); } /* @@ -1957,8 +1921,8 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. */ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) { int ret; @@ -1969,7 +1933,7 @@ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT: case BCH_EXTENT_CACHED: ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), - avoid, pick); + failed, pick); if (!ret && !bkey_extent_is_cached(k.k)) ret = -EIO; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 66a02f1..e04cb5a 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -52,13 +52,14 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct btree *, struct btree_node_iter_large *); +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *); - + struct bch_io_failures *, + struct extent_ptr_decoded *); int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_devs_mask *, - struct extent_pick_ptr *); + struct bch_io_failures *, + struct extent_ptr_decoded *); void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); @@ -83,7 +84,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -bool bch2_extent_drop_device(struct bkey_s_extent, unsigned); +void bch2_extent_drop_device(struct bkey_s_extent, unsigned); const struct bch_extent_ptr * bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); const struct bch_extent_ptr * @@ -161,14 +162,11 @@ extent_entry_type(const union bch_extent_entry *e) static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) { switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - return sizeof(struct bch_extent_crc32); - case BCH_EXTENT_ENTRY_crc64: - return sizeof(struct bch_extent_crc64); - case BCH_EXTENT_ENTRY_crc128: - return sizeof(struct bch_extent_crc128); - case BCH_EXTENT_ENTRY_ptr: - return sizeof(struct bch_extent_ptr); 
+#define x(f, n) \ + case BCH_EXTENT_ENTRY_##f: \ + return sizeof(struct bch_extent_##f); + BCH_EXTENT_ENTRY_TYPES() +#undef x default: BUG(); } @@ -181,12 +179,24 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_ptr: + return true; + default: + return false; + } } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) { - return !extent_entry_is_ptr(e); + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + return true; + default: + return false; + } } union bch_extent_crc { @@ -200,11 +210,13 @@ union bch_extent_crc { #define to_entry(_entry) \ ({ \ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *)); \ + !type_is(_entry, struct bch_extent_ptr *) && \ + !type_is(_entry, struct bch_extent_stripe_ptr *)); \ \ __builtin_choose_expr( \ (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *)), \ + type_is_exact(_entry, const struct bch_extent_ptr *) ||\ + type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ (const union bch_extent_entry *) (_entry), \ (union bch_extent_entry *) (_entry)); \ }) @@ -234,44 +246,6 @@ union bch_extent_crc { /* checksum entries: */ -enum bch_extent_crc_type { - BCH_EXTENT_CRC_NONE, - BCH_EXTENT_CRC32, - BCH_EXTENT_CRC64, - BCH_EXTENT_CRC128, -}; - -static inline enum bch_extent_crc_type -__extent_crc_type(const union bch_extent_crc *crc) -{ - if (!crc) - return BCH_EXTENT_CRC_NONE; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: - return BCH_EXTENT_CRC32; - case BCH_EXTENT_ENTRY_crc64: - return BCH_EXTENT_CRC64; - case BCH_EXTENT_ENTRY_crc128: - return BCH_EXTENT_CRC128; - default: - BUG(); - } -} - -#define extent_crc_type(_crc) \ -({ \ - BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ - !type_is(_crc, struct bch_extent_crc64 *) && \ - !type_is(_crc, struct bch_extent_crc128 *) && \ - !type_is(_crc, union bch_extent_crc *)); \ - \ - type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ - : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ - : type_is(_crc, struct bch_extent_crc128 *) ? 
BCH_EXTENT_CRC128 \ - : __extent_crc_type((union bch_extent_crc *) _crc); \ -}) - static inline struct bch_extent_crc_unpacked bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) { @@ -283,14 +257,15 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .offset = _crc.offset, \ .live_size = k->size - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: + if (!crc) return (struct bch_extent_crc_unpacked) { .compressed_size = k->size, .uncompressed_size = k->size, .live_size = k->size, }; - case BCH_EXTENT_CRC32: { + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc32), }; @@ -302,7 +277,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) return ret; } - case BCH_EXTENT_CRC64: { + case BCH_EXTENT_ENTRY_crc64: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc64), .nonce = crc->crc64.nonce, @@ -313,7 +288,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) return ret; } - case BCH_EXTENT_CRC128: { + case BCH_EXTENT_ENTRY_crc128: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc128), .nonce = crc->crc128.nonce, @@ -346,23 +321,25 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) #define extent_for_each_entry(_e, _entry) \ extent_for_each_entry_from(_e, _entry, (_e).v->start) -/* Iterate over crcs only: */ +/* Iterate over pointers only: */ -#define __extent_crc_next(_e, _p) \ +#define extent_ptr_next(_e, _ptr) \ ({ \ - typeof(&(_e).v->start[0]) _entry = _p; \ + typeof(&(_e).v->start[0]) _entry; \ \ - while ((_entry) < extent_entry_last(_e) && \ - !extent_entry_is_crc(_entry)) \ - (_entry) = extent_entry_next(_entry); \ + extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ + if (extent_entry_is_ptr(_entry)) \ + break; \ \ - entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \ + _entry < extent_entry_last(_e) ? 
entry_to_ptr(_entry) : NULL; \ }) -#define __extent_for_each_crc(_e, _crc) \ - for ((_crc) = __extent_crc_next(_e, (_e).v->start); \ - (_crc); \ - (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) +#define extent_for_each_ptr(_e, _ptr) \ + for ((_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_next(_e, _ptr)); \ + (_ptr)++) + +/* Iterate over crcs only: */ #define extent_crc_next(_e, _crc, _iter) \ ({ \ @@ -383,69 +360,61 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) /* Iterate over pointers, with crcs: */ -#define extent_ptr_crc_next(_e, _ptr, _crc) \ +static inline struct extent_ptr_decoded +__extent_ptr_decoded_init(const struct bkey *k) +{ + return (struct extent_ptr_decoded) { + .crc = bch2_extent_crc_unpack(k, NULL), + }; +} + +#define EXTENT_ITERATE_EC (1 << 0) + +#define __extent_ptr_next_decode(_e, _ptr, _entry) \ ({ \ __label__ out; \ - typeof(&(_e).v->start[0]) _entry; \ \ - extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ - if (extent_entry_is_crc(_entry)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\ - } else { \ - _ptr = entry_to_ptr(_entry); \ + extent_for_each_entry_from(_e, _entry, _entry) \ + switch (extent_entry_type(_entry)) { \ + case BCH_EXTENT_ENTRY_ptr: \ + (_ptr).ptr = _entry->ptr; \ goto out; \ + case BCH_EXTENT_ENTRY_crc32: \ + case BCH_EXTENT_ENTRY_crc64: \ + case BCH_EXTENT_ENTRY_crc128: \ + (_ptr).crc = bch2_extent_crc_unpack((_e).k, \ + entry_to_crc(_entry)); \ + break; \ } \ \ - _ptr = NULL; \ out: \ - _ptr; \ -}) - -#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \ - (_ptr)++) - -/* Iterate over pointers only, and from a given position: */ - -#define extent_ptr_next(_e, _ptr) \ -({ \ - struct bch_extent_crc_unpacked _crc; \ - \ - extent_ptr_crc_next(_e, _ptr, _crc); \ + _entry < extent_entry_last(_e); \ }) -#define extent_for_each_ptr(_e, _ptr) \ - for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next(_e, _ptr)); \ - (_ptr)++) - -#define extent_ptr_prev(_e, _ptr) \ -({ \ - typeof(&(_e).v->start->ptr) _p; \ - typeof(&(_e).v->start->ptr) _prev = NULL; \ - \ - extent_for_each_ptr(_e, _p) { \ - if (_p == (_ptr)) \ - break; \ - _prev = _p; \ - } \ - \ - _prev; \ -}) +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + for ((_ptr) = __extent_ptr_decoded_init((_e).k), \ + (_entry) = (_e).v->start; \ + __extent_ptr_next_decode(_e, _ptr, _entry); \ + (_entry) = extent_entry_next(_entry)) -/* - * Use this when you'll be dropping pointers as you iterate. 
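/*
 * extent_for_each_ptr_decode() above walks a mixed sequence of crc and ptr
 * entries and hands back each pointer together with the most recently seen
 * crc.  A minimal model of that carry-forward decoding, with simplified
 * stand-in types rather than the real bch_extent entries:
 */
#include <stdio.h>

enum entry_type { ENTRY_CRC, ENTRY_PTR };

struct entry {
        enum entry_type type;
        unsigned        val;            /* crc id, or device number */
};

struct ptr_decoded {
        unsigned dev;
        unsigned crc;                   /* 0 == no checksum/compression */
};

int main(void)
{
        const struct entry entries[] = {
                { ENTRY_PTR, 0 },       /* dev 0, no crc */
                { ENTRY_CRC, 7 },
                { ENTRY_PTR, 1 },       /* dev 1, crc 7 */
                { ENTRY_PTR, 2 },       /* dev 2, crc 7 carried forward */
        };
        struct ptr_decoded p = { 0, 0 };
        unsigned i;

        for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
                switch (entries[i].type) {
                case ENTRY_CRC:
                        p.crc = entries[i].val; /* remember for later ptrs */
                        break;
                case ENTRY_PTR:
                        p.dev = entries[i].val;
                        printf("dev %u crc %u\n", p.dev, p.crc);
                        break;
                }
        }
        return 0;
}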
Quadratic, - * unfortunately: - */ -#define extent_for_each_ptr_backwards(_e, _ptr) \ - for ((_ptr) = extent_ptr_prev(_e, NULL); \ - (_ptr); \ - (_ptr) = extent_ptr_prev(_e, _ptr)) +/* Iterate over pointers backwards: */ void bch2_extent_crc_append(struct bkey_i_extent *, struct bch_extent_crc_unpacked); +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, + struct extent_ptr_decoded *); + +static inline void __extent_entry_insert(struct bkey_i_extent *e, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); + + memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + e->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} static inline void __extent_entry_push(struct bkey_i_extent *e) { @@ -536,10 +505,23 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, struct bch_extent_crc_unpacked); bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); -void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); -void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); +union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent , + struct bch_extent_ptr *); + +#define bch2_extent_drop_ptrs(_e, _ptr, _cond) \ +do { \ + _ptr = &(_e).v->start->ptr; \ + \ + while ((_ptr = extent_ptr_next(e, _ptr))) { \ + if (_cond) { \ + _ptr = (void *) bch2_extent_drop_ptr(_e, _ptr); \ + continue; \ + } \ + \ + (_ptr)++; \ + } \ +} while (0) bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index 76139f9..02c6256 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -18,9 +18,18 @@ struct bch_extent_crc_unpacked { struct bch_csum csum; }; -struct extent_pick_ptr { - struct bch_extent_ptr ptr; +struct extent_ptr_decoded { struct bch_extent_crc_unpacked crc; + struct bch_extent_ptr ptr; +}; + +struct bch_io_failures { + u8 nr; + struct bch_dev_io_failures { + u8 dev; + u8 nr_failed; + u8 nr_retries; + } devs[BCH_REPLICAS_MAX]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 250dd55..986bb7d 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -963,12 +963,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; + struct extent_ptr_decoded p; - extent_for_each_crc(e, crc, i) - want_full_extent |= ((crc.csum_type != 0) | - (crc.compression_type != 0)); + extent_for_each_ptr_decode(e, p, i) + want_full_extent |= ((p.crc.csum_type != 0) | + (p.crc.compression_type != 0)); } readpage_bio_extend(readpages_iter, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index ae87587..1cf7291 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -973,27 +973,27 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, { if (bkey_extent_is_data(&k->k)) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; int ret; - extent_for_each_ptr_crc(e, ptr, crc) { + 
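/*
 * __extent_entry_insert() above opens a gap with memmove_u64s_up() and then
 * copies the new entry into the hole.  The same idea on a plain u64 array
 * (the real code works on variable-sized entries measured in u64s):
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void insert_at(uint64_t *arr, unsigned *nr, unsigned pos, uint64_t v)
{
        /* shift arr[pos..nr) up one slot to make room */
        memmove(&arr[pos + 1], &arr[pos], (*nr - pos) * sizeof(arr[0]));
        arr[pos] = v;
        (*nr)++;
}

int main(void)
{
        uint64_t arr[8] = { 10, 20, 40 };
        unsigned nr = 3, i;

        insert_at(arr, &nr, 2, 30);

        for (i = 0; i < nr; i++)
                printf("%llu ", (unsigned long long) arr[i]);
        printf("\n");                   /* 10 20 30 40 */
        return 0;
}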
extent_for_each_ptr_decode(e, p, entry) { int flags2 = 0; - u64 offset = ptr->offset; + u64 offset = p.ptr.offset; - if (crc.compression_type) + if (p.crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else - offset += crc.offset; + offset += p.crc.offset; if ((offset & (PAGE_SECTORS - 1)) || (e.k->size & (PAGE_SECTORS - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, - offset << 9, - e.k->size << 9, flags|flags2); + bkey_start_offset(e.k) << 9, + offset << 9, + e.k->size << 9, flags|flags2); if (ret) return ret; } diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 021a80d..eceb486 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -310,9 +310,9 @@ static void __bch2_write_index(struct bch_write_op *op) bkey_copy(dst, src); e = bkey_i_to_s_extent(dst); - extent_for_each_ptr_backwards(e, ptr) - if (test_bit(ptr->dev, op->failed.d)) - bch2_extent_drop_ptr(e, ptr); + + bch2_extent_drop_ptrs(e, ptr, + test_bit(ptr->dev, op->failed.d)); if (!bch2_extent_nr_ptrs(e.c)) { ret = -EIO; @@ -320,7 +320,8 @@ static void __bch2_write_index(struct bch_write_op *op) } if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c); + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, + e.s_c); if (ret) goto err; } @@ -1008,7 +1009,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, struct bpos pos, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned rbio_sectors, struct bch_read_bio **rbio) @@ -1089,7 +1090,7 @@ err: static inline struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned flags, struct bch_read_bio **rbio, @@ -1183,7 +1184,8 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, + unsigned flags) { struct btree_iter iter; BKEY_PADDED(k) tmp; @@ -1217,7 +1219,7 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1231,7 +1233,7 @@ out: static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { struct btree_iter iter; struct bkey_s_c k; @@ -1254,7 +1256,7 @@ retry: (k.k->p.offset - bvec_iter.bi_sector) << 9); swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1290,14 +1292,12 @@ static void bch2_rbio_retry(struct work_struct *work) struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; u64 inode = rbio->pos.inode; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); - memset(&avoid, 0, sizeof(avoid)); - if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ptr.dev, avoid.d); + bch2_mark_io_failure(&failed, &rbio->pick); rbio->bio.bi_status = 0; @@ -1307,9 +1307,9 @@ static 
void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); else - bch2_read_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &failed, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1396,7 +1396,7 @@ out: } static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && @@ -1549,9 +1549,9 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; struct promote_op *promote = NULL; @@ -1559,7 +1559,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + pick_ret = bch2_extent_pick_ptr(c, k, failed, &pick); /* hole or reservation - just zero fill: */ if (!pick_ret) @@ -1723,7 +1723,7 @@ noclone: rbio = bch2_rbio_free(rbio); if (ret == READ_RETRY_AVOID) { - __set_bit(pick.ptr.dev, avoid->d); + bch2_mark_io_failure(failed, &pick); ret = READ_RETRY; } diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 1724232..5bd5f84 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -94,10 +94,10 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) struct bch_devs_mask; struct cache_promote_op; -struct extent_pick_ptr; +struct extent_ptr_decoded; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c, struct bch_devs_mask *, unsigned); + struct bkey_s_c, struct bch_io_failures *, unsigned); void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); enum bch_read_flags { diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index fe5779b..8ec846c 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -54,7 +54,7 @@ struct bch_read_bio { struct bch_devs_list devs_have; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; /* start pos of data we read (may not be pos of data we want) */ struct bpos pos; struct bversion version; diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 6759810..5870392 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -352,10 +352,6 @@ static inline bool journal_flushes_device(struct bch_dev *ca) return true; } -int bch2_journal_mark(struct bch_fs *, struct list_head *); -void bch2_journal_entries_free(struct list_head *); -int bch2_journal_replay(struct bch_fs *, struct list_head *); - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 2f88e24..0cb1bc3 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -429,7 +429,6 @@ static int journal_read_bucket(struct bch_dev *ca, { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; struct jset *j = NULL; unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), @@ -441,10 +440,14 @@ static int 
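/*
 * The retry path above now tracks failures in a small per-device table
 * (struct bch_io_failures) instead of a device bitmask, so repeated failures
 * on the same device can be counted.  A standalone sketch of the
 * find-or-add bookkeeping, with simplified fields:
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_REPLICAS 4

struct io_failures {
        uint8_t nr;
        struct {
                uint8_t dev;
                uint8_t nr_failed;
        } devs[MAX_REPLICAS];
};

static void mark_io_failure(struct io_failures *f, uint8_t dev)
{
        unsigned i;

        for (i = 0; i < f->nr; i++)
                if (f->devs[i].dev == dev) {
                        f->devs[i].nr_failed++;         /* seen before */
                        return;
                }

        if (f->nr < MAX_REPLICAS) {                     /* first failure */
                f->devs[f->nr].dev = dev;
                f->devs[f->nr].nr_failed = 1;
                f->nr++;
        }
}

int main(void)
{
        struct io_failures failed = { .nr = 0 };

        mark_io_failure(&failed, 2);
        mark_io_failure(&failed, 2);
        mark_io_failure(&failed, 5);

        printf("devices with failures: %u\n", failed.nr);      /* 2 */
        return 0;
}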
journal_read_bucket(struct bch_dev *ca, while (offset < end) { if (!sectors_read) { -reread: sectors_read = min_t(unsigned, + struct bio *bio; +reread: + sectors_read = min_t(unsigned, end - offset, buf->size >> 9); - bio_reset(bio); + bio = bio_kmalloc(GFP_KERNEL, + buf_pages(buf->data, + sectors_read << 9)); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = offset; bio->bi_iter.bi_size = sectors_read << 9; @@ -452,6 +455,7 @@ reread: sectors_read = min_t(unsigned, bch2_bio_map(bio, buf->data); ret = submit_bio_wait(bio); + bio_put(bio); if (bch2_dev_io_err_on(ret, ca, "journal read from sector %llu", @@ -849,28 +853,6 @@ fsck_err: /* journal replay: */ -int bch2_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - int ret; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if (btree_type_has_ptrs(type)) { - ret = bch2_btree_mark_key_initial(c, type, k_s_c); - if (ret) - return ret; - } - } - - return 0; -} - int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; @@ -1064,14 +1046,19 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, * entry - that's why we drop pointers to devices <= current free space, * i.e. whichever device was limiting the current journal entry size. */ - extent_for_each_ptr_backwards(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + bch2_extent_drop_ptrs(e, ptr, ({ + ca = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch2_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; + ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors; + })); + + extent_for_each_ptr(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors); + ca->journal.sectors_free -= sectors; } replicas = bch2_extent_nr_ptrs(e.c); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index e303df9..d0a652c 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -36,6 +36,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); +void bch2_journal_entries_free(struct list_head *); +int bch2_journal_replay(struct bch_fs *, struct list_head *); int bch2_journal_entry_sectors(struct journal *); void bch2_journal_write(struct closure *); diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index f5cbf44..c0dfe1c 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -50,7 +50,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k) || !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k); if (ret) break; bch2_btree_iter_next(&iter); @@ -71,7 +71,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) */ bch2_extent_normalize(c, e.s); - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, bkey_i_to_s_c(&tmp.key)); if (ret) break; @@ -134,7 +134,7 @@ retry: */ 
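/*
 * journal_write_alloc() above passes a GNU statement expression ({ ... }) as
 * the condition to bch2_extent_drop_ptrs(), letting a multi-statement
 * predicate drive an in-place filter.  A simplified sketch of the same
 * pattern on an int array (GCC/clang extension; this toy removes by swapping
 * with the last element rather than shifting, unlike the extent code):
 */
#include <stdio.h>

#define array_drop_if(_arr, _nr, _i, _cond)                     \
do {                                                            \
        (_i) = 0;                                               \
        while ((_i) < (_nr)) {                                  \
                if (_cond) {                                    \
                        (_arr)[_i] = (_arr)[--(_nr)];           \
                        continue;                               \
                }                                               \
                (_i)++;                                         \
        }                                                       \
} while (0)

int main(void)
{
        int vals[] = { 1, 5, 2, 8, 3 };
        unsigned nr = 5, i;

        array_drop_if(vals, nr, i, ({
                int v = vals[i];        /* arbitrary statements allowed here */
                v > 4;
        }));

        for (i = 0; i < nr; i++)
                printf("%d ", vals[i]);
        printf("\n");                   /* 1 3 2 */
        return 0;
}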
bch2_btree_iter_downgrade(&iter); - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); if (ret) goto err; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index e75e6e7..c9495ab 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -67,8 +67,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_i_extent *insert, *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; - struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; bool did_work = false; int nr; @@ -98,15 +98,12 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); - if (m->data_cmd == DATA_REWRITE) { - ptr = (struct bch_extent_ptr *) - bch2_extent_has_device(extent_i_to_s_c(insert), - m->data_opts.rewrite_dev); - bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); - } + if (m->data_cmd == DATA_REWRITE) + bch2_extent_drop_device(extent_i_to_s(insert), + m->data_opts.rewrite_dev); - extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { + if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -115,8 +112,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_crc_append(insert, crc); - extent_ptr_append(insert, *ptr); + bch2_extent_ptr_decoded_append(insert, &p); did_work = true; } @@ -153,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, extent_i_to_s_c(insert).s_c); if (ret) break; @@ -379,8 +375,8 @@ static int bch2_move_extent(struct bch_fs *c, struct data_opts data_opts) { struct moving_io *io; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned sectors = e.k->size, pages; int ret = -ENOMEM; @@ -393,8 +389,8 @@ static int bch2_move_extent(struct bch_fs *c, SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ - extent_for_each_ptr_crc(e, ptr, crc) - sectors = max_t(unsigned, sectors, crc.uncompressed_size); + extent_for_each_ptr_decode(e, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); io = kzalloc(sizeof(struct moving_io) + @@ -605,7 +601,7 @@ static int bch2_gc_data_replicas(struct bch_fs *c) for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k); if (ret) break; } @@ -629,7 +625,7 @@ static int bch2_gc_btree_replicas(struct bch_fs *c) for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); bch2_btree_iter_cond_resched(&iter); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 4688656..70318f2 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -160,7 +160,7 @@ static void bch2_copygc(struct 
bch_fs *c, struct bch_dev *ca) .sectors = bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), }; - heap_add_or_replace(h, e, -sectors_used_cmp); + heap_add_or_replace(h, e, -sectors_used_cmp, NULL); } up_read(&ca->bucket_lock); up_read(&c->gc_lock); @@ -169,7 +169,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) sectors_to_move += i->sectors; while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); + BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); sectors_to_move -= e.sectors; } diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 3fbe7b1..85ea4c6 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -17,17 +17,16 @@ #include static inline bool rebalance_ptr_pred(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, + struct extent_ptr_decoded p, struct bch_io_opts *io_opts) { if (io_opts->background_target && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && - !ptr->cached) + !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && + !p.ptr.cached) return true; if (io_opts->background_compression && - crc.compression_type != + p.crc.compression_type != bch2_compression_opt_to_type[io_opts->background_compression]) return true; @@ -38,8 +37,8 @@ void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; struct bkey_s_c_extent e; if (!bkey_extent_is_data(k.k)) @@ -51,13 +50,13 @@ void bch2_rebalance_add_key(struct bch_fs *c, e = bkey_s_c_to_extent(k); - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (atomic64_add_return(crc.compressed_size, + if (atomic64_add_return(p.crc.compressed_size, &ca->rebalance_work) == - crc.compressed_size) + p.crc.compressed_size) rebalance_wakeup(c); } } @@ -75,16 +74,16 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; /* Make sure we have room to add a new pointer: */ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > BKEY_EXTENT_VAL_U64s_MAX) return DATA_SKIP; - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) goto found; return DATA_SKIP; diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 1e94d35..b0cef99 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -3,17 +3,32 @@ #include "replicas.h" #include "super-io.h" +struct bch_replicas_entry_padded { + struct bch_replicas_entry e; + u8 pad[BCH_SB_MEMBERS_MAX]; +}; + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Replicas tracking - in memory: */ +static inline int u8_cmp(u8 l, u8 r) +{ + return (l > r) - (l < r); +} + +static void replicas_entry_sort(struct bch_replicas_entry *e) +{ + bubble_sort(e->devs, e->nr_devs, u8_cmp); +} + #define for_each_cpu_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) 
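/*
 * replicas_entry_sort() above keeps each entry's device list sorted
 * (bubble_sort with u8_cmp) so that two entries naming the same devices are
 * byte-for-byte identical and can be compared with memcmp().  A standalone
 * sketch of that canonicalization:
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void sort_devs(uint8_t *devs, unsigned nr)
{
        unsigned i, j;

        /* bubble sort: entries hold at most a handful of devices */
        for (i = 0; i + 1 < nr; i++)
                for (j = 0; j + 1 < nr - i; j++)
                        if (devs[j] > devs[j + 1]) {
                                uint8_t tmp = devs[j];

                                devs[j]     = devs[j + 1];
                                devs[j + 1] = tmp;
                        }
}

int main(void)
{
        uint8_t a[] = { 3, 0, 2 };
        uint8_t b[] = { 0, 2, 3 };

        sort_devs(a, 3);

        printf("equal after sort: %d\n", !memcmp(a, b, 3));     /* 1 */
        return 0;
}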
< (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ _i = (void *) (_i) + (_r)->entry_size) -static inline struct bch_replicas_cpu_entry * +static inline struct bch_replicas_entry * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; @@ -24,84 +39,79 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) +static int replicas_entry_to_text(struct bch_replicas_entry *e, + char *buf, size_t size) { - return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; -} + char *out = buf, *end = out + size; + unsigned i; -static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) -{ - e->devs[dev >> 3] |= 1 << (dev & 7); -} + out += scnprintf(out, end - out, "%u: [", e->data_type); -static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) -{ - return (r->entry_size - - offsetof(struct bch_replicas_cpu_entry, devs)) * 8; + for (i = 0; i < e->nr_devs; i++) + out += scnprintf(out, end - out, + i ? " %u" : "%u", e->devs[i]); + out += scnprintf(out, end - out, "]"); + + return out - buf; } int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, char *buf, size_t size) { char *out = buf, *end = out + size; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; bool first = true; - unsigned i; for_each_cpu_replicas_entry(r, e) { - bool first_e = true; - if (!first) out += scnprintf(out, end - out, " "); first = false; - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) { - if (!first_e) - out += scnprintf(out, end - out, " "); - first_e = false; - out += scnprintf(out, end - out, "%u", i); - } - out += scnprintf(out, end - out, "]"); + out += replicas_entry_to_text(e, out, end - out); } return out - buf; } -static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e, - enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) +static void extent_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) { - const struct bch_extent_ptr *ptr; - unsigned nr = 0; - - BUG_ON(!data_type || - data_type == BCH_DATA_SB || - data_type >= BCH_DATA_NR); - - memset(r, 0, sizeof(*r)); - r->data_type = data_type; + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + if (!p.ptr.cached) + r->devs[r->nr_devs++] = p.ptr.dev; + } +} - *max_dev = 0; +static void bkey_to_replicas(enum bkey_type type, + struct bkey_s_c k, + struct bch_replicas_entry *e) +{ + e->nr_devs = 0; + + switch (type) { + case BKEY_TYPE_BTREE: + e->data_type = BCH_DATA_BTREE; + extent_to_replicas(k, e); + break; + case BKEY_TYPE_EXTENTS: + e->data_type = BCH_DATA_USER; + extent_to_replicas(k, e); + break; + default: + break; + } - extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - *max_dev = max_t(unsigned, *max_dev, ptr->dev); - replicas_set_dev(r, ptr->dev); - nr++; - } - return nr; + replicas_entry_sort(e); } static inline void devlist_to_replicas(struct bch_devs_list devs, enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) + struct bch_replicas_entry *e) { unsigned i; @@ -109,28 +119,24 @@ static inline void devlist_to_replicas(struct bch_devs_list devs, data_type == BCH_DATA_SB 
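/*
 * replicas_entry_to_text() above builds its output with scnprintf(), always
 * passing the space remaining in the buffer.  A userspace sketch of the same
 * bounded-append pattern; plain snprintf() returns the would-be length, so
 * the helper clamps it the way the kernel's scnprintf() does internally:
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdint.h>

static int append(char *out, char *end, const char *fmt, ...)
{
        va_list args;
        int n;

        if (out >= end)
                return 0;

        va_start(args, fmt);
        n = vsnprintf(out, end - out, fmt, args);
        va_end(args);

        /* clamp to what actually fit (excluding the terminating NUL) */
        return n < end - out ? n : (int) (end - out) - 1;
}

int main(void)
{
        char buf[64];
        char *out = buf, *end = buf + sizeof(buf);
        uint8_t devs[] = { 0, 2, 3 };
        unsigned i, data_type = 1;

        out += append(out, end, "%u: [", data_type);
        for (i = 0; i < 3; i++)
                out += append(out, end, i ? " %u" : "%u", devs[i]);
        out += append(out, end, "]");

        printf("%s\n", buf);            /* 1: [0 2 3] */
        return 0;
}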
|| data_type >= BCH_DATA_NR); - memset(r, 0, sizeof(*r)); - r->data_type = data_type; + e->data_type = data_type; + e->nr_devs = 0; - *max_dev = 0; + for (i = 0; i < devs.nr; i++) + e->devs[e->nr_devs++] = devs.devs[i]; - for (i = 0; i < devs.nr; i++) { - *max_dev = max_t(unsigned, *max_dev, devs.devs[i]); - replicas_set_dev(r, devs.devs[i]); - } + replicas_entry_sort(e); } static struct bch_replicas_cpu * cpu_replicas_add_entry(struct bch_replicas_cpu *old, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) + struct bch_replicas_entry *new_entry) { struct bch_replicas_cpu *new; unsigned i, nr, entry_size; - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); - entry_size = max(entry_size, old->entry_size); + entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)); nr = old->nr + 1; new = kzalloc(sizeof(struct bch_replicas_cpu) + @@ -144,30 +150,28 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, for (i = 0; i < old->nr; i++) memcpy(cpu_replicas_entry(new, i), cpu_replicas_entry(old, i), - min(new->entry_size, old->entry_size)); + old->entry_size); memcpy(cpu_replicas_entry(new, old->nr), - &new_entry, - new->entry_size); + new_entry, + replicas_entry_bytes(new_entry)); bch2_cpu_replicas_sort(new); return new; } static bool replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_cpu_entry search, - unsigned max_dev) + struct bch_replicas_entry *search) { - return max_dev < replicas_dev_slots(r) && + return replicas_entry_bytes(search) <= r->entry_size && eytzinger0_find(r->entries, r->nr, r->entry_size, - memcmp, &search) < r->nr; + memcmp, search) < r->nr; } noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) + struct bch_replicas_entry *new_entry) { struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; int ret = -ENOMEM; @@ -176,16 +180,16 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, old_gc = rcu_dereference_protected(c->replicas_gc, lockdep_is_held(&c->sb_lock)); - if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) { - new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev); + if (old_gc && !replicas_has_entry(old_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(old_gc, new_entry); if (!new_gc) goto err; } old_r = rcu_dereference_protected(c->replicas, lockdep_is_held(&c->sb_lock)); - if (!replicas_has_entry(old_r, new_entry, max_dev)) { - new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev); + if (!replicas_has_entry(old_r, new_entry)) { + new_r = cpu_replicas_add_entry(old_r, new_entry); if (!new_r) goto err; @@ -220,47 +224,63 @@ err: return ret; } +static int __bch2_mark_replicas(struct bch_fs *c, + struct bch_replicas_entry *devs) +{ + struct bch_replicas_cpu *r, *gc_r; + bool marked; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + gc_r = rcu_dereference(c->replicas_gc); + marked = replicas_has_entry(r, devs) && + (!likely(gc_r) || replicas_has_entry(gc_r, devs)); + rcu_read_unlock(); + + return likely(marked) ? 
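/*
 * replicas_has_entry() above relies on the in-memory table storing entries
 * padded to one fixed entry_size and kept sorted, so membership is a pure
 * memcmp() search (eytzinger0_find() in the real code).  A sketch of the
 * same idea using plain qsort()/bsearch() on zero-padded records:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ENTRY_SIZE 8            /* assumed fixed stride, zero padded */

static int entry_cmp(const void *l, const void *r)
{
        return memcmp(l, r, ENTRY_SIZE);
}

int main(void)
{
        /* each row: data_type, nr_devs, devs..., implicit zero padding */
        unsigned char table[3][ENTRY_SIZE] = {
                { 1, 2, 0, 3 },         /* user data on devs 0,3 */
                { 0, 1, 2 },            /* btree data on dev 2 */
                { 1, 1, 4 },            /* user data on dev 4 */
        };
        unsigned char search[ENTRY_SIZE] = { 1, 1, 4 };

        qsort(table, 3, ENTRY_SIZE, entry_cmp);

        printf("found: %d\n",
               bsearch(search, table, 3, ENTRY_SIZE, entry_cmp) != NULL);
        return 0;
}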
0 + : bch2_mark_replicas_slowpath(c, devs); +} + int bch2_mark_replicas(struct bch_fs *c, enum bch_data_type data_type, struct bch_devs_list devs) { - struct bch_replicas_cpu_entry search; - struct bch_replicas_cpu *r, *gc_r; - unsigned max_dev; - bool marked; + struct bch_replicas_entry_padded search; if (!devs.nr) return 0; - BUG_ON(devs.nr >= BCH_REPLICAS_MAX); + memset(&search, 0, sizeof(search)); - devlist_to_replicas(devs, data_type, &search, &max_dev); + BUG_ON(devs.nr >= BCH_REPLICAS_MAX); - rcu_read_lock(); - r = rcu_dereference(c->replicas); - gc_r = rcu_dereference(c->replicas_gc); - marked = replicas_has_entry(r, search, max_dev) && - (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev)); - rcu_read_unlock(); + devlist_to_replicas(devs, data_type, &search.e); - return likely(marked) ? 0 - : bch2_mark_replicas_slowpath(c, search, max_dev); + return __bch2_mark_replicas(c, &search.e); } int bch2_mark_bkey_replicas(struct bch_fs *c, - enum bch_data_type data_type, + enum bkey_type type, struct bkey_s_c k) { - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; + struct bch_replicas_entry_padded search; int ret; - for (i = 0; i < cached.nr; i++) - if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i])))) - return ret; + if (type == BKEY_TYPE_EXTENTS) { + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + + for (i = 0; i < cached.nr; i++) + if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i])))) + return ret; + } + + bkey_to_replicas(type, k, &search.e); - return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); + return search.e.nr_devs + ? __bch2_mark_replicas(c, &search.e) + : 0; } int bch2_replicas_gc_end(struct bch_fs *c, int ret) @@ -303,7 +323,7 @@ err: int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { struct bch_replicas_cpu *dst, *src; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; lockdep_assert_held(&c->replicas_gc_lock); @@ -338,40 +358,19 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) /* Replicas tracking - superblock: */ -static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, - unsigned *nr, - unsigned *bytes, - unsigned *max_dev) -{ - struct bch_replicas_entry *i; - unsigned j; - - *nr = 0; - *bytes = sizeof(*r); - *max_dev = 0; - - if (!r) - return; - - for_each_replicas_entry(r, i) { - for (j = 0; j < i->nr; j++) - *max_dev = max_t(unsigned, *max_dev, i->devs[j]); - (*nr)++; - } - - *bytes = (void *) i - (void *) r; -} - static struct bch_replicas_cpu * __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) { + struct bch_replicas_entry *e, *dst; struct bch_replicas_cpu *cpu_r; - unsigned i, nr, bytes, max_dev, entry_size; - - bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + unsigned nr = 0, entry_size = 0; - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); + if (sb_r) + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + nr * entry_size, GFP_NOIO); @@ -381,20 +380,14 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) cpu_r->nr = nr; cpu_r->entry_size = entry_size; - if (nr) { - struct bch_replicas_cpu_entry *dst = - cpu_replicas_entry(cpu_r, 0); - struct bch_replicas_entry *src = sb_r->entries; - - while (dst < cpu_replicas_entry(cpu_r, nr)) { - dst->data_type = 
src->data_type; - for (i = 0; i < src->nr; i++) - replicas_set_dev(dst, src->devs[i]); + nr = 0; - src = replicas_entry_next(src); - dst = (void *) dst + entry_size; + if (sb_r) + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, nr++); + memcpy(dst, e, replicas_entry_bytes(e)); + replicas_entry_sort(dst); } - } bch2_cpu_replicas_sort(cpu_r); return cpu_r; @@ -422,20 +415,16 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *sb_e; - struct bch_replicas_cpu_entry *e; - size_t i, bytes; + struct bch_replicas_entry *dst, *src; + size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); - for_each_cpu_replicas_entry(r, e) { - bytes += sizeof(struct bch_replicas_entry); - for (i = 0; i < r->entry_size - 1; i++) - bytes += hweight8(e->devs[i]); - } + for_each_cpu_replicas_entry(r, src) + bytes += replicas_entry_bytes(src); sb_r = bch2_sb_resize_replicas(&c->disk_sb, - DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); + DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -ENOSPC; @@ -443,22 +432,42 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, vstruct_end(&sb_r->field) - (void *) &sb_r->entries); - sb_e = sb_r->entries; - for_each_cpu_replicas_entry(r, e) { - sb_e->data_type = e->data_type; + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + memcpy(dst, src, replicas_entry_bytes(src)); - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) - sb_e->devs[sb_e->nr++] = i; + dst = replicas_entry_next(dst); - sb_e = replicas_entry_next(sb_e); - - BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); } return 0; } +static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) +{ + unsigned i; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i + 1 < cpu_r->nr; i++) { + struct bch_replicas_entry *l = + cpu_replicas_entry(cpu_r, i); + struct bch_replicas_entry *r = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + + if (!memcmp(l, r, cpu_r->entry_size)) + return "duplicate replicas entry"; + } + + return NULL; +} + static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); @@ -474,15 +483,15 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi goto err; err = "invalid replicas entry: no devices"; - if (!e->nr) + if (!e->nr_devs) goto err; err = "invalid replicas entry: too many devices"; - if (e->nr >= BCH_REPLICAS_MAX) + if (e->nr_devs >= BCH_REPLICAS_MAX) goto err; err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr; i++) + for (i = 0; i < e->nr_devs; i++) if (!bch2_dev_exists(sb, mi, e->devs[i])) goto err; } @@ -492,25 +501,7 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi if (!cpu_r) goto err; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - memcmp, NULL); - - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_cpu_entry *l = - cpu_replicas_entry(cpu_r, i); - struct bch_replicas_cpu_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - - err = "duplicate replicas entry"; - if (!memcmp(l, r, cpu_r->entry_size)) - goto err; - } - - err = NULL; + err = check_dup_replicas_entries(cpu_r); err: kfree(cpu_r); return err; @@ 
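/*
 * __bch2_sb_replicas_to_cpu_replicas() above now makes two passes over the
 * variable-length on-disk entries: one to count them and find the largest
 * (which becomes the fixed in-memory stride), one to copy each entry into
 * its zero-padded slot.  A simplified byte-array sketch of that conversion:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* toy on-disk format: nr_devs followed by that many device ids */
static const unsigned char disk[] = {
        2, 0, 3,                /* entry 0: two devices   */
        1, 2,                   /* entry 1: one device    */
        3, 1, 4, 5,             /* entry 2: three devices */
};

static size_t entry_bytes(const unsigned char *e)
{
        return 1 + e[0];
}

int main(void)
{
        const unsigned char *e, *end = disk + sizeof(disk);
        size_t nr = 0, entry_size = 0, i = 0;
        unsigned char *table;

        /* pass 1: count entries, find the largest */
        for (e = disk; e < end; e += entry_bytes(e)) {
                if (entry_bytes(e) > entry_size)
                        entry_size = entry_bytes(e);
                nr++;
        }

        table = calloc(nr, entry_size);
        if (!table)
                return 1;

        /* pass 2: copy each entry into its fixed-stride, zero-padded slot */
        for (e = disk; e < end; e += entry_bytes(e))
                memcpy(table + i++ * entry_size, e, entry_bytes(e));

        printf("%zu entries, stride %zu bytes\n", nr, entry_size);
        free(table);
        return 0;
}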
-525,7 +516,6 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t char *out = buf, *end = out + size; struct bch_replicas_entry *e; bool first = true; - unsigned i; if (!r) { out += scnprintf(out, end - out, "(no replicas section found)"); @@ -537,12 +527,7 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t out += scnprintf(out, end - out, " "); first = false; - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < e->nr; i++) - out += scnprintf(out, end - out, - i ? " %u" : "%u", e->devs[i]); - out += scnprintf(out, end - out, "]"); + out += replicas_entry_to_text(e, out, end - out); } return out - buf; @@ -554,45 +539,59 @@ bool bch2_replicas_marked(struct bch_fs *c, enum bch_data_type data_type, struct bch_devs_list devs) { - struct bch_replicas_cpu_entry search; - unsigned max_dev; + struct bch_replicas_entry_padded search; bool ret; if (!devs.nr) return true; - devlist_to_replicas(devs, data_type, &search, &max_dev); + memset(&search, 0, sizeof(search)); + + devlist_to_replicas(devs, data_type, &search.e); rcu_read_lock(); - ret = replicas_has_entry(rcu_dereference(c->replicas), - search, max_dev); + ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e); rcu_read_unlock(); return ret; } bool bch2_bkey_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, + enum bkey_type type, struct bkey_s_c k) { - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; + struct bch_replicas_entry_padded search; + bool ret; - for (i = 0; i < cached.nr; i++) - if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]))) - return false; + if (type == BKEY_TYPE_EXTENTS) { + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; - return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k)); + for (i = 0; i < cached.nr; i++) + if (!bch2_replicas_marked(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i]))) + return false; + } + + bkey_to_replicas(type, k, &search.e); + + if (!search.e.nr_devs) + return true; + + rcu_read_lock(); + ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e); + rcu_read_unlock(); + + return ret; } struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_devs_mask online_devs) { struct bch_sb_field_members *mi; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; struct bch_replicas_cpu *r; - unsigned i, dev, dev_slots, nr_online, nr_offline; + unsigned i, nr_online, nr_offline; struct replicas_status ret; memset(&ret, 0, sizeof(ret)); @@ -602,9 +601,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, mi = bch2_sb_get_members(c->disk_sb.sb); rcu_read_lock(); - r = rcu_dereference(c->replicas); - dev_slots = replicas_dev_slots(r); for_each_cpu_replicas_entry(r, e) { if (e->data_type >= ARRAY_SIZE(ret.replicas)) @@ -612,13 +609,11 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, nr_online = nr_offline = 0; - for (dev = 0; dev < dev_slots; dev++) { - if (!replicas_test_dev(e, dev)) - continue; - - BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev)); + for (i = 0; i < e->nr_devs; i++) { + BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, + e->devs[i])); - if (test_bit(dev, online_devs.d)) + if (test_bit(e->devs[i], online_devs.d)) nr_online++; else nr_offline++; @@ -677,20 +672,18 @@ unsigned bch2_replicas_online(struct bch_fs *c, bool meta) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - struct bch_replicas_cpu_entry 
*e; + struct bch_replicas_entry *e; struct bch_replicas_cpu *r; - unsigned ret = 0; + unsigned i, ret = 0; rcu_read_lock(); r = rcu_dereference(c->replicas); - if (ca->dev_idx >= replicas_dev_slots(r)) - goto out; - for_each_cpu_replicas_entry(r, e) - if (replicas_test_dev(e, ca->dev_idx)) - ret |= 1 << e->data_type; -out: + for (i = 0; i < e->nr_devs; i++) + if (e->devs[i] == ca->dev_idx) + ret |= 1 << e->data_type; + rcu_read_unlock(); return ret; diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 49f114b..640fe5b 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -1,13 +1,15 @@ #ifndef _BCACHEFS_REPLICAS_H #define _BCACHEFS_REPLICAS_H +#include "replicas_types.h" + bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, struct bch_devs_list); -bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, +bool bch2_bkey_replicas_marked(struct bch_fs *, enum bkey_type, struct bkey_s_c); int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, struct bch_devs_list); -int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type, +int bch2_mark_bkey_replicas(struct bch_fs *, enum bkey_type, struct bkey_s_c); int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); @@ -33,11 +35,11 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); /* iterate over superblock replicas - used by userspace tools: */ -static inline struct bch_replicas_entry * -replicas_entry_next(struct bch_replicas_entry *i) -{ - return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; -} +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#define replicas_entry_next(_i) \ + ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) #define for_each_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ diff --git a/libbcachefs/replicas_types.h b/libbcachefs/replicas_types.h new file mode 100644 index 0000000..3061840 --- /dev/null +++ b/libbcachefs/replicas_types.h @@ -0,0 +1,11 @@ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + +struct bch_replicas_cpu { + struct rcu_head rcu; + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry entries[]; +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index ab83ade..ebb238a 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -34,18 +34,6 @@ struct bch_member_cpu { u8 valid; }; -struct bch_replicas_cpu_entry { - u8 data_type; - u8 devs[BCH_SB_MEMBERS_MAX / 8]; -}; - -struct bch_replicas_cpu { - struct rcu_head rcu; - unsigned nr; - unsigned entry_size; - struct bch_replicas_cpu_entry entries[]; -}; - struct bch_disk_group_cpu { bool deleted; u16 parent; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 3038b45..4812692 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -282,19 +282,19 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) if (k.k->type == BCH_EXTENT) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) { - if (crc.compression_type == BCH_COMPRESSION_NONE) { + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_NONE) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { nr_compressed_extents++; 
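/*
 * replicas_entry_bytes() above sizes a variable-length entry as the header
 * up to the flexible devs[] array plus one byte per device.  A standalone
 * sketch of that offsetof() arithmetic with a simplified entry struct:
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>

struct replicas_entry {
        uint8_t data_type;
        uint8_t nr_devs;
        uint8_t devs[];                 /* flexible array member */
};

#define replicas_entry_bytes(_i) \
        (offsetof(struct replicas_entry, devs) + (_i)->nr_devs)

int main(void)
{
        unsigned nr_devs = 3;
        struct replicas_entry *e;

        e = malloc(offsetof(struct replicas_entry, devs) + nr_devs);
        if (!e)
                return 1;

        e->data_type = 1;
        e->nr_devs   = nr_devs;

        printf("%zu bytes\n", (size_t) replicas_entry_bytes(e));        /* 5 */
        free(e);
        return 0;
}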
compressed_sectors_compressed += - crc.compressed_size; + p.crc.compressed_size; compressed_sectors_uncompressed += - crc.uncompressed_size; + p.crc.uncompressed_size; } /* only looking at the first ptr */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 5cfaed5..4df96ef 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -526,15 +526,17 @@ void bch2_bio_map(struct bio *bio, void *base) BUG_ON(!bio->bi_iter.bi_size); BUG_ON(bio->bi_vcnt); + BUG_ON(!bio->bi_max_vecs); bv->bv_offset = base ? offset_in_page(base) : 0; goto start; for (; size; bio->bi_vcnt++, bv++) { + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + bv->bv_offset = 0; start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, size); - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); if (base) { bv->bv_page = is_vmalloc_addr(base) ? vmalloc_to_page(base) diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 178bf98..433ba9c 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -83,6 +83,14 @@ struct closure; (__builtin_types_compatible_p(typeof(_val), _type) || \ __builtin_types_compatible_p(typeof(_val), const _type)) +/* Userspace doesn't align allocations as nicely as the kernel allocators: */ +static inline size_t buf_pages(void *p, size_t len) +{ + return DIV_ROUND_UP(len + + ((unsigned long) p & (PAGE_SIZE - 1)), + PAGE_SIZE); +} + static inline void vpfree(void *p, size_t size) { if (is_vmalloc_addr(p)) @@ -137,7 +145,19 @@ do { \ (heap)->data = NULL; \ } while (0) -#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) +#define heap_set_backpointer(h, i, _fn) \ +do { \ + void (*fn)(typeof(h), size_t) = _fn; \ + if (fn) \ + fn(h, i); \ +} while (0) + +#define heap_swap(h, i, j, set_backpointer) \ +do { \ + swap((h)->data[i], (h)->data[j]); \ + heap_set_backpointer(h, i, set_backpointer); \ + heap_set_backpointer(h, j, set_backpointer); \ +} while (0) #define heap_peek(h) \ ({ \ @@ -147,7 +167,7 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) -#define heap_sift_down(h, i, cmp) \ +#define heap_sift_down(h, i, cmp, set_backpointer) \ do { \ size_t _c, _j = i; \ \ @@ -159,72 +179,75 @@ do { \ \ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ break; \ - heap_swap(h, _c, _j); \ + heap_swap(h, _c, _j, set_backpointer); \ } \ } while (0) -#define heap_sift_up(h, i, cmp) \ +#define heap_sift_up(h, i, cmp, set_backpointer) \ do { \ while (i) { \ size_t p = (i - 1) / 2; \ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ break; \ - heap_swap(h, i, p); \ + heap_swap(h, i, p, set_backpointer); \ i = p; \ } \ } while (0) -#define __heap_add(h, d, cmp) \ -do { \ +#define __heap_add(h, d, cmp, set_backpointer) \ +({ \ size_t _i = (h)->used++; \ (h)->data[_i] = d; \ + heap_set_backpointer(h, _i, set_backpointer); \ \ - heap_sift_up(h, _i, cmp); \ -} while (0) + heap_sift_up(h, _i, cmp, set_backpointer); \ + _i; \ +}) -#define heap_add(h, d, cmp) \ +#define heap_add(h, d, cmp, set_backpointer) \ ({ \ bool _r = !heap_full(h); \ if (_r) \ - __heap_add(h, d, cmp); \ + __heap_add(h, d, cmp, set_backpointer); \ _r; \ }) -#define heap_add_or_replace(h, new, cmp) \ +#define heap_add_or_replace(h, new, cmp, set_backpointer) \ do { \ - if (!heap_add(h, new, cmp) && \ + if (!heap_add(h, new, cmp, set_backpointer) && \ cmp(h, new, heap_peek(h)) >= 0) { \ (h)->data[0] = new; \ - heap_sift_down(h, 0, cmp); \ + heap_set_backpointer(h, 0, set_backpointer); \ + heap_sift_down(h, 0, cmp, set_backpointer); \ } \ } while (0) -#define heap_del(h, i, cmp) \ +#define heap_del(h, i, cmp, set_backpointer) \ do { \ size_t _i = (i); 
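/*
 * buf_pages() above has to account for userspace buffers that are not page
 * aligned: a buffer starting part-way into a page can span one more page
 * than len / PAGE_SIZE suggests, which is exactly the case the
 * bch2_bio_map() bounds checks guard against.  A standalone sketch
 * (PAGE_SIZE assumed 4096; the addresses are only used for alignment
 * arithmetic, never dereferenced):
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

static size_t buf_pages(void *p, size_t len)
{
        return (len + ((unsigned long) p & (PAGE_SIZE - 1)) + PAGE_SIZE - 1)
                / PAGE_SIZE;
}

int main(void)
{
        void *aligned   = (void *) 0x10000;
        void *unaligned = (void *) 0x10064;

        printf("%zu\n", buf_pages(aligned,   4096));    /* 1 page  */
        printf("%zu\n", buf_pages(unaligned, 4096));    /* 2 pages */
        return 0;
}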
\ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used); \ - heap_sift_up(h, _i, cmp); \ - heap_sift_down(h, _i, cmp); \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ } while (0) -#define heap_pop(h, d, cmp) \ +#define heap_pop(h, d, cmp, set_backpointer) \ ({ \ bool _r = (h)->used; \ if (_r) { \ (d) = (h)->data[0]; \ - heap_del(h, 0, cmp); \ + heap_del(h, 0, cmp, set_backpointer); \ } \ _r; \ }) -#define heap_resort(heap, cmp) \ +#define heap_resort(heap, cmp, set_backpointer) \ do { \ ssize_t _i; \ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift_down(heap, _i, cmp); \ + heap_sift_down(heap, _i, cmp, set_backpointer); \ } while (0) #define ANYSINT_MAX(t) \ -- 2.39.2