X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fextents.c;h=4cc2a4b1319928ab46474150f3e3bd75c5898368;hb=6016d33b801a5fe13e86e5be3abf68ed166c0796;hp=369b100a0a587fa317727e08099703b6eac9addc;hpb=a4eb187a6f0af8041ae2128e6ee82ab7a43cb87c;p=bcachefs-tools-debian diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 369b100..4cc2a4b 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet * @@ -45,7 +46,8 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) switch (k.k->type) { case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -65,7 +67,7 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) static unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded p) { - unsigned i, durability = 0; + unsigned durability = 0; struct bch_dev *ca; if (p.ptr.cached) @@ -76,16 +78,16 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (ca->mi.state != BCH_MEMBER_STATE_FAILED) durability = max_t(unsigned, durability, ca->mi.durability); - for (i = 0; i < p.ec_nr; i++) { + if (p.has_ec) { struct stripe *s = - genradix_ptr(&c->stripes[0], p.idx); + genradix_ptr(&c->stripes[0], p.ec.idx); if (WARN_ON(!s)) - continue; + goto out; durability = max_t(unsigned, durability, s->nr_redundant); } - +out: return durability; } @@ -204,10 +206,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.idx++; if (force_reconstruct_read(c) && - !p.idx && p.ec_nr) + !p.idx && p.has_ec) p.idx++; - if (p.idx >= p.ec_nr + 1) + if (p.idx >= (unsigned) p.has_ec + 1) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -249,6 +251,33 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + /* extent specific utility code */ const struct bch_extent_ptr * @@ -279,50 +308,32 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group return NULL; } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return ptr; - - return NULL; -} - unsigned bch2_extent_is_compressed(struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned ret = 0; - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached && - 
p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; - } - } + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; return ret; } -bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_extent_ptr m, u64 offset) +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev == m.dev && p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == (s64) m.offset - offset) return true; @@ -389,16 +400,17 @@ static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, bch2_csum_type_is_encryption(n.csum_type); } -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; - extent_for_each_crc(e, crc, i) + bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; @@ -414,9 +426,9 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; @@ -424,7 +436,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { - extent_for_each_crc(extent_i_to_s(e), u, i) + bkey_for_each_crc(&k->k, ptrs, u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { @@ -436,15 +448,17 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, found: BUG_ON(n.compression_type); BUG_ON(n.offset); - BUG_ON(n.live_size != e->k.size); + BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: - extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; - bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(k, &p); ret = true; goto restart_narrow_pointers; } @@ -500,43 +514,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; - - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) - return "pointer to invalid device"; - - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == 
ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; - - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - return NULL; -} - -static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -590,39 +569,110 @@ static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -/* Btree ptrs */ +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + struct bch_dev *ca; -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (!bch2_dev_exists2(c, ptr->dev)) + return "pointer to invalid device"; + + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!ca) + return "pointer to invalid device"; + + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) + return "multiple pointers to same device"; + + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) + return "offset past end of device"; + + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) + return "offset before first bucket"; + + if (bucket_remainder(ca, ptr->offset) + + size_ondisk > ca->mi.bucket_size) + return "spans multiple buckets"; + + return NULL; +} + +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; const char *reason; + unsigned nonce = UINT_MAX; - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (!extent_entry_is_ptr(entry)) + if (k.k->type == KEY_TYPE_btree_ptr && + !extent_entry_is_ptr(entry)) return "has non ptr field"; - } - bkey_for_each_ptr(ptrs, ptr) { - reason = extent_ptr_invalid(c, k, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + reason = extent_ptr_invalid(c, k, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + if (crc.offset + crc.live_size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; + + size_ondisk = crc.compressed_size; + + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; + + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } + break; 
+ case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } return NULL; } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +/* Btree ptrs */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -665,55 +715,53 @@ err: void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const char *invalid; - - bkey_ptrs_to_text(out, c, k); - - invalid = bch2_btree_ptr_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); + bch2_bkey_ptrs_to_text(out, c, k); } /* Extents */ -bool __bch2_cut_front(struct bpos where, struct bkey_s k) +void __bch2_cut_front(struct bpos where, struct bkey_s k) { - u64 len = 0; + u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; + return; EBUG_ON(bkey_cmp(where, k.k->p) > 0); - len = k.k->p.offset - where.offset; + sub = where.offset - bkey_start_offset(k.k); - BUG_ON(len > k.k->size); + k.k->size -= sub; - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) + if (!k.k->size) k.k->type = KEY_TYPE_deleted; - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); + + switch (k.k->type) { + case KEY_TYPE_deleted: + case KEY_TYPE_discard: + case KEY_TYPE_error: + case KEY_TYPE_cookie: + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; - extent_for_each_entry(e, entry) { + bkey_extent_entry_for_each(ptrs, entry) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: if (!seen_crc) - entry->ptr.offset += e.k->size - len; + entry->ptr.offset += sub; break; case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += e.k->size - len; + entry->crc32.offset += sub; break; case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += e.k->size - len; + entry->crc64.offset += sub; break; case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += e.k->size - len; + entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: break; @@ -722,11 +770,20 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k) if (extent_entry_is_crc(entry)) seen_crc = true; } - } - k.k->size = len; + break; + } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - return true; + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_reservation: + break; + default: + BUG(); + } } bool bch2_cut_back(struct bpos where, struct bkey *k) @@ -740,8 +797,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) len = where.offset - bkey_start_offset(k); - BUG_ON(len > k->size); - k->p = where; k->size = len; @@ -751,196 +806,160 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) return true; } -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -void bch2_key_resize(struct bkey *k, - unsigned new_size) +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; -static bool 
extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - struct bkey_packed tmp; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = src->k; - else if (bch2_bkey_pack_key(&tmp, &src->k, f)) - memcpy_u64s(dst, &tmp, f->key_u64s); - else - return false; + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } - memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); - return true; + return ret; } -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters, + bool overwrite) +{ + int ret = 0; - /* for deleting: */ - struct bkey_i whiteout; - bool update_journal; - bool update_btree; - bool deleting; -}; + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } -static void verify_extent_nonoverlapping(struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); + *end = bpos_min(*end, pos); + ret = 1; + break; + } + } - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); + bch2_trans_iter_put(trans, iter); + break; + } } -#endif -#endif + return ret; } -static void verify_modified_extent(struct btree_iter *iter, - struct bkey_packed *k) -{ - bch2_btree_iter_verify(iter, iter->l[0].b); - bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); -} +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) +int bch2_extent_atomic_end(struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) { - struct btree_iter_level *l = &iter->l[0]; - 
struct btree_node_iter node_iter; - struct bkey_packed *k; - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(l->b, &l->iter, insert); - - node_iter = l->iter; - k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) - return; + struct btree_trans *trans = iter->trans; + struct btree *b; + struct btree_node_iter node_iter; + struct bkey_packed *_k; + unsigned nr_iters = 0; + int ret; - node_iter = l->iter; - k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) - return; - - k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); - bch2_btree_iter_verify(iter, l->b); -} + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; -static void extent_insert_committed(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = s->insert->k; - BKEY_PADDED(k) split; + b = iter->l[0].b; + node_iter = iter->l[0].iter; - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - bkey_copy(&split.k, insert); - if (s->deleting) - split.k.k.type = KEY_TYPE_discard; + *end = bpos_min(insert->k.p, b->key.k.p); - bch2_cut_back(s->committed, &split.k.k); + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2, false); + if (ret < 0) + return ret; - if (!bkey_cmp(s->committed, iter->pos)) - return; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) + break; - if (s->update_btree) { - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, - bkey_i_to_s_c(&split.k)); + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); - EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX, true); + if (ret) + break; - extent_bset_insert(c, iter, &split.k); + bch2_btree_node_iter_advance(&node_iter, b); } - if (s->update_journal) { - bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout); - if (s->deleting) - split.k.k.type = KEY_TYPE_discard; - - bch2_cut_back(s->committed, &split.k.k); - - EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + return ret < 0 ? 
ret : 0; +} - bch2_btree_journal_key(s->trans, iter, &split.k); - } +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; - bch2_cut_front(s->committed, insert); + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; - insert->k.needs_whiteout = false; + bch2_cut_back(end, &k->k); + return 0; } -void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) { - struct btree *b = iter->l[0].b; - - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + struct bpos end; + int ret; - bch2_cut_back(b->key.k.p, &k->k); + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; - BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); + return !bkey_cmp(end, k->k.p); } enum btree_insert_ret -bch2_extent_can_insert(struct btree_insert *trans, +bch2_extent_can_insert(struct btree_trans *trans, struct btree_insert_entry *insert, unsigned *u64s) { @@ -952,9 +971,6 @@ bch2_extent_can_insert(struct btree_insert *trans, struct bkey_s_c k; int sectors; - BUG_ON(trans->flags & BTREE_INSERT_ATOMIC && - !bch2_extent_is_atomic(&insert->k->k, insert->iter)); - /* * We avoid creating whiteouts whenever possible when deleting, but * those optimizations mean we may potentially insert two whiteouts @@ -997,28 +1013,92 @@ bch2_extent_can_insert(struct btree_insert *trans, return BTREE_INSERT_OK; } +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + if (!expensive_debug_checks(c)) + return; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +} + static void -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, +extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert, struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start 
of k: */ __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_back(bkey_start_pos(&insert->k), k.k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); /* @@ -1029,7 +1109,6 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_bset_fix_invalidated_key(l->b, _k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { @@ -1046,12 +1125,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_bset_delete(l->b, _k, _k->u64s); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - bch2_btree_iter_verify(iter, l->b); } else { extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); } break; @@ -1081,7 +1157,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); extent_bset_insert(c, iter, &split.k); break; @@ -1089,34 +1165,82 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, } } -static void __bch2_insert_fixup_extent(struct extent_insert_state *s) +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. 
+ */ +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert_entry) { - struct btree_iter *iter = s->insert->iter; + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert_entry->iter; + struct bkey_i *insert = insert_entry->k; struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + bool deleting = bkey_whiteout(&insert->k); + bool update_journal = !deleting; + bool update_btree = !deleting; + struct bkey_i whiteout = *insert; struct bkey_packed *_k; struct bkey unpacked; - struct bkey_i *insert = s->insert->k; + BKEY_PADDED(k) tmp; - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + EBUG_ON(iter->level); + EBUG_ON(!insert->k.size); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, KEY_TYPE_discard))) { struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); - - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - s->committed = bpos_min(s->insert->k->k.p, k.k->p); - if (!bkey_whiteout(k.k)) - s->update_journal = true; + update_journal = true; - if (!s->update_journal) { - bch2_cut_front(s->committed, insert); - bch2_cut_front(s->committed, &s->whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + if (!update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); goto next; } @@ -1125,24 +1249,26 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) * of the key we're deleting, instead of creating and inserting * a new whiteout: */ - if (s->deleting && - !s->update_btree && + if (deleting && + !update_btree && !bkey_cmp(insert->k.p, k.k->p) && !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { if (!bkey_whiteout(k.k)) { btree_account_key_drop(l->b, _k); _k->type = KEY_TYPE_discard; reserve_whiteout(l->b, _k); + bch2_btree_iter_fix_key_modified(iter, + l->b, _k); } break; } if (k.k->needs_whiteout || bkey_written(l->b, _k)) { insert->k.needs_whiteout = true; - s->update_btree = true; + update_btree = true; } - if (s->update_btree && + if (update_btree && overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { @@ -1150,176 +1276,52 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) _k->needs_whiteout = false; } - extent_squash(s, insert, _k, k, overlap); + extent_squash(c, iter, insert, _k, k, overlap); - if (!s->update_btree) - bch2_cut_front(s->committed, insert); + if (!update_btree) + bch2_cut_front(cur_end, insert); next: + node_iter = l->iter; + if (overlap == BCH_EXTENT_OVERLAP_FRONT || overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - if (bkey_cmp(s->committed, insert->k.p) < 0) - s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p); + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - { - struct btree_node_iter node_iter = l->iter; + if (update_btree) { + bkey_copy(&tmp.k, insert); - while ((_k = 
bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, _k, &s->committed) > 0) - l->iter = node_iter; - } -} + if (deleting) + tmp.k.k.type = KEY_TYPE_discard; -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct btree_iter *iter = insert->iter; - struct btree *b = iter->l[0].b; - struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = iter->pos, - - .whiteout = *insert->k, - .update_journal = !bkey_whiteout(&insert->k->k), - .update_btree = !bkey_whiteout(&insert->k->k), - .deleting = bkey_whiteout(&insert->k->k), - }; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k->k.size); + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: - */ - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + extent_bset_insert(c, iter, &tmp.k); + } - __bch2_insert_fixup_extent(&s); + if (update_journal) { + bkey_copy(&tmp.k, !deleting ? 
insert : &whiteout); - extent_insert_committed(&s); + if (deleting) + tmp.k.k.type = KEY_TYPE_discard; - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - if (insert->k->k.size) { - /* got to the end of this leaf node */ - BUG_ON(bkey_cmp(iter->pos, b->key.k.p)); - return BTREE_INSERT_NEED_TRAVERSE; + bch2_btree_journal_key(trans, iter, &tmp.k); } - return BTREE_INSERT_OK; + bch2_cut_front(insert->k.p, insert); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e.s_c, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } - } - - return NULL; + return bch2_bkey_ptrs_invalid(c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -1335,11 +1337,13 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } /* * If journal replay hasn't finished, we might be seeing keys * that will be overwritten by the time journal replay is done: @@ -1376,105 +1380,115 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const char *invalid; + bch2_bkey_ptrs_to_text(out, c, k); +} - bkey_ptrs_to_text(out, c, k); +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + 
[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - invalid = bch2_extent_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src) +{ +#define set_common_fields(_dst, _src) \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (extent_entry_type(to_entry(dst))) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } -static void bch2_extent_crc_init(union bch_extent_crc *crc, - struct bch_extent_crc_unpacked new) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - ._compressed_size = _crc.compressed_size - 1, \ - ._uncompressed_size = _crc.uncompressed_size - 1, \ - .offset = _crc.offset + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - common_fields(new), - .csum = *((__le32 *) &new.csum.lo), - }; - return; - } + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc128; + else + BUG(); - if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - common_fields(new), - .nonce = new.nonce, - .csum_lo = new.csum.lo, - .csum_hi = *((__le16 *) &new.csum.hi), - }; - return; - } + bch2_extent_crc_pack(crc, new); - if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - common_fields(new), - .nonce = new.nonce, - .csum = new.csum, - }; - return; - } -#undef common_fields - BUG(); -} + k->k.u64s += extent_entry_u64s(ptrs.end); -void bch2_extent_crc_append(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked new) -{ - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -static inline void __extent_entry_insert(struct bkey_i_extent *e, +static inline void __extent_entry_insert(struct bkey_i *k, union bch_extent_entry *dst, union 
bch_extent_entry *new) { - union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - e->k.u64s += extent_entry_u64s(new); + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); memcpy(dst, new, extent_entry_bytes(new)); } -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, +void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { - struct bch_extent_crc_unpacked crc; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; - unsigned i; - extent_for_each_crc(extent_i_to_s(e), crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } + + bkey_for_each_crc(&k->k, ptrs, crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = extent_entry_next(pos); goto found; } - bch2_extent_crc_append(e, p->crc); - pos = extent_entry_last(extent_i_to_s(e)); + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ptr)); + __extent_entry_insert(k, pos, to_entry(&p->ptr)); - for (i = 0; i < p->ec_nr; i++) { - p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); } } @@ -1496,22 +1510,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) /* will only happen if all pointers were cached: */ if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_deleted; + k.k->type = KEY_TYPE_discard; - return false; + return bkey_whiteout(k.k); } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && @@ -1522,7 +1536,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { @@ -1533,145 +1547,128 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) + struct bkey_s _l, struct bkey_s _r) { - struct bkey_s_extent el = bkey_i_to_s_extent(l); - struct bkey_s_extent er = bkey_i_to_s_extent(r); - union bch_extent_entry *en_l, *en_r; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) + if (bkey_val_u64s(l.k) != 
bkey_val_u64s(r.k)) return BCH_MERGE_NOMERGE; - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; + crc_l = bch2_extent_crc_unpack(l.k, NULL); - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - !extent_entry_is_ptr(en_l)) + if (extent_entry_type(en_l) != extent_entry_type(en_r)) return BCH_MERGE_NOMERGE; - lp = &en_l->ptr; - rp = &en_r->ptr; - - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - } + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); - l->k.needs_whiteout |= r->k.needs_whiteout; + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; - } + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - bch2_key_resize(&l->k, l->k.size + r->k.size); + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; - return BCH_MERGE_MERGE; -} + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. - */ -static bool bch2_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->l[0].b; - struct btree_node_iter *node_iter = &iter->l[0].iter; - BKEY_PADDED(k) li, ri; - struct bkey_packed *m = back_merge ? l : r; - struct bkey_i *mi = back_merge ? 
&li.k : &ri.k; - struct bset_tree *t = bch2_bkey_to_bset(b, m); - enum merge_result ret; + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; - EBUG_ON(bkey_written(b, m)); + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bch2_bkey_unpack(b, &li.k, l); - bch2_bkey_unpack(b, &ri.k, r); + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; - ret = bch2_bkey_merge(c, &li.k, &ri.k); - if (ret == BCH_MERGE_NOMERGE) - return false; + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; - /* - * check if we overlap with deleted extents - would break the sort - * order: - */ - if (back_merge) { - struct bkey_packed *n = bkey_next(m); - - if (n != btree_bkey_last(b, t) && - bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && - bkey_deleted(n)) - return false; - } else if (ret == BCH_MERGE_MERGE) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - if (prev && - bkey_cmp_left_packed_byval(b, prev, - bkey_start_pos(&li.k.k)) > 0) - return false; + break; + default: + return BCH_MERGE_NOMERGE; + } } - if (ret == BCH_MERGE_PARTIAL) { - if (!extent_i_save(b, m, mi)) - return false; + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - } else { - if (!extent_i_save(b, m, &li.k)) - return false; + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + + if (!extent_entry_is_crc(en_l)) + continue; + + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); } - bch2_bset_fix_invalidated_key(b, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - m, m->u64s, m->u64s); - verify_modified_extent(iter, m); + bch2_key_resize(l.k, l.k->size + r.k->size); - return ret == BCH_MERGE_MERGE; + return BCH_MERGE_MERGE; } bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bpos end = pos; struct bkey_s_c k; bool ret = true; + int err; end.offset += size; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -1680,7 +1677,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, break; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } @@ -1734,27 +1731,22 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, } enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) + struct bkey_s _l, struct bkey_s _r) { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + struct bkey_s_reservation l = 
bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) return BCH_MERGE_NOMERGE; - l->k.needs_whiteout |= r->k.needs_whiteout; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, r.s); return BCH_MERGE_PARTIAL; } - bch2_key_resize(&l->k, l->k.size + r->k.size); + bch2_key_resize(l.k, l.k->size + r.k->size); return BCH_MERGE_MERGE; }
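
A minimal standalone sketch of the durability rule that the reworked bch2_extent_ptr_durability() above follows once per-pointer erasure-coding state is carried as a single has_ec/ec pair: a cached pointer contributes nothing, a pointer on a non-failed device contributes that device's durability, and an erasure-coded pointer contributes at least its stripe's redundancy. The model_ptr type and helper below are illustrative stand-ins, not the bcachefs structures.

/* Hypothetical, simplified model -- not the bcachefs types. */
#include <stdbool.h>
#include <stdio.h>

struct model_ptr {
        bool            cached;
        bool            dev_failed;
        unsigned        dev_durability;         /* from the device's member info */
        bool            has_ec;
        unsigned        stripe_redundancy;      /* nr_redundant of the stripe */
};

static unsigned model_ptr_durability(const struct model_ptr *p)
{
        unsigned durability = 0;

        if (p->cached)
                return 0;

        if (!p->dev_failed)
                durability = p->dev_durability;

        if (p->has_ec && p->stripe_redundancy > durability)
                durability = p->stripe_redundancy;

        return durability;
}

int main(void)
{
        struct model_ptr replicated = { .dev_durability = 1 };
        struct model_ptr ec_ptr     = { .dev_durability = 1, .has_ec = true,
                                        .stripe_redundancy = 2 };

        /* prints "1 2": plain replica vs. pointer into a 2-redundant stripe */
        printf("%u %u\n", model_ptr_durability(&replicated),
                          model_ptr_durability(&ec_ptr));
        return 0;
}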
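
A sketch of the four overlap cases that extent_squash() handles when an insert meets an existing extent, written over plain half-open sector ranges. classify_overlap() is a hypothetical helper for illustration only; it is not the bch2_extent_overlap() implementation, though the FRONT/BACK/ALL/MIDDLE meanings match the switch above.

#include <stdint.h>
#include <stdio.h>

enum overlap { OVERLAP_FRONT, OVERLAP_BACK, OVERLAP_ALL, OVERLAP_MIDDLE };

/*
 * Classify how an insert [a, b) overlaps an existing extent [c, d),
 * assuming the two half-open ranges already intersect (a < d && c < b).
 */
static enum overlap classify_overlap(uint64_t a, uint64_t b,
                                     uint64_t c, uint64_t d)
{
        if (a <= c && b >= d)
                return OVERLAP_ALL;     /* insert swallows the old extent */
        if (a <= c)
                return OVERLAP_FRONT;   /* old extent keeps only its tail */
        if (b >= d)
                return OVERLAP_BACK;    /* old extent keeps only its head */
        return OVERLAP_MIDDLE;          /* old extent must be split in two */
}

int main(void)
{
        /* insert [10, 20) vs. existing [5, 30): strictly inside -> MIDDLE (3) */
        printf("%d\n", classify_overlap(10, 20, 5, 30));
        return 0;
}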
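
A sketch of the "smallest checksum entry that fits" choice made by bch2_extent_crc_append() above: the on-disk entry format is picked by checksum width, by whether uncompressed_size - 1 fits the format's size field, and by the nonce limit. pick_crc_entry() and the numeric limits in its table are placeholders for illustration; the real maxima come from the bch_extent_crc32/64/128 bitfield widths.

#include <stdio.h>

enum crc_entry { ENTRY_crc32, ENTRY_crc64, ENTRY_crc128, ENTRY_none };

struct crc_limits {
        unsigned csum_bytes_max;        /* widest checksum the format can hold */
        unsigned size_max;              /* max value of (uncompressed_size - 1) */
        unsigned nonce_max;
};

/* Placeholder limits, not the real on-disk constants: */
static const struct crc_limits limits[] = {
        [ENTRY_crc32]  = { .csum_bytes_max =  4, .size_max =  127, .nonce_max =    0 },
        [ENTRY_crc64]  = { .csum_bytes_max = 10, .size_max =  511, .nonce_max = 4095 },
        [ENTRY_crc128] = { .csum_bytes_max = 16, .size_max = 8191, .nonce_max = 8191 },
};

/* Assumes uncompressed_size >= 1, as it is for a live extent. */
static enum crc_entry pick_crc_entry(unsigned csum_bytes,
                                     unsigned uncompressed_size,
                                     unsigned nonce)
{
        for (int t = ENTRY_crc32; t <= ENTRY_crc128; t++)
                if (csum_bytes <= limits[t].csum_bytes_max &&
                    uncompressed_size - 1 <= limits[t].size_max &&
                    nonce <= limits[t].nonce_max)
                        return (enum crc_entry) t;

        return ENTRY_none;      /* the real code treats this as a bug */
}

int main(void)
{
        /* 4-byte checksum over a 128-sector extent, no nonce -> crc32 entry (0) */
        printf("%d\n", pick_crc_entry(4, 128, 0));
        return 0;
}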