X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fextents.c;h=e2c09ea4a3e013bd9bb55c2518da017e59dd69c7;hb=fa358537725c8065b058b558125cf15359936f94;hp=dc3fbfb6aa7e97ee00c0bb2417ca25700c5266c3;hpb=0c7db4eca3e6519043c10288cb41f8a0ee634a0b;p=bcachefs-tools-debian diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index dc3fbfb..e2c09ea 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet * @@ -8,12 +9,11 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "debug.h" -#include "dirent.h" #include "disk_groups.h" #include "error.h" #include "extents.h" @@ -23,84 +23,18 @@ #include "super.h" #include "super-io.h" #include "util.h" -#include "xattr.h" #include -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; - - bkey_for_each_ptr(p, ptr) - nr_ptrs++; - - return nr_ptrs; -} - -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) -{ - unsigned nr_ptrs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: { - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - nr_ptrs += !ptr->cached; - BUG_ON(!nr_ptrs); - break; - } - case KEY_TYPE_reservation: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return nr_ptrs; -} - -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) -{ - unsigned i, durability = 0; - struct bch_dev *ca; - - if (p.ptr.cached) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); - - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) - durability = max_t(unsigned, durability, ca->mi.durability); - - for (i = 0; i < p.ec_nr; i++) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.idx); - - if (WARN_ON(!s)) - continue; - - durability = max_t(unsigned, durability, s->nr_redundant); - } - - return durability; -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - return durability; -} +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, unsigned dev) @@ -155,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c, return bch2_rand_range(l1 + l2) > l1; } - if (force_reconstruct_read(c)) + if (bch2_force_reconstruct_read) return p1.idx > p2.idx; return p1.idx < p2.idx; @@ -181,6 +115,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return -EIO; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; + ca = bch_dev_bkey_exists(c, p.ptr.dev); /* @@ -203,11 +144,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, !bch2_dev_is_readable(ca)) p.idx++; - if (force_reconstruct_read(c) && - !p.idx && p.ec_nr) + if (bch2_force_reconstruct_read && + !p.idx && p.has_ec) p.idx++; - if (p.idx >= p.ec_nr + 1) + if (p.idx >= (unsigned) p.has_ec + 1) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -220,186 +161,311 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + return -BCH_ERR_invalid_bkey; + } - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} + +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); +} - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { + prt_printf(err, "value too small (%zu <= %zu)", + bkey_val_bytes(k.k), sizeof(*bp.v)); + return -BCH_ERR_invalid_bkey; + } + + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + prt_printf(err, "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + return -BCH_ERR_invalid_bkey; } + + if (c->sb.version < bcachefs_metadata_version_snapshot && + bp.v->min_key.snapshot) { + prt_printf(err, "invalid min_key.snapshot (%u != 0)", + bp.v->min_key.snapshot); + return -BCH_ERR_invalid_bkey; + } + + return bch2_bkey_ptrs_invalid(c, k, flags, err); } -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bch_extent_ptr *ptr; + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -} + prt_printf(out, "seq %llx written %u min_key %s", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); -/* extent specific utility code */ + bch2_bpos_to_text(out, bp.v->min_key); + prt_printf(out, " "); + bch2_bkey_ptrs_to_text(out, c, k); +} -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) { - const struct bch_extent_ptr *ptr; + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); - return NULL; + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); } -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - const struct bch_extent_ptr *ptr; + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return false; - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } - return NULL; -} + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) -{ - const struct bch_extent_ptr *ptr; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || + lp.has_ec != rp.has_ec) + return false; - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return ptr; + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; - return NULL; -} + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; -unsigned bch2_extent_is_compressed(struct bkey_s_c k) -{ - unsigned ret = 0; + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + } - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE && - p.crc.compressed_size < p.crc.live_size) - ret += p.crc.compressed_size; - } + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } - return ret; -} + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_extent_ptr m, u64 offset) -{ - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + if (crc_l.uncompressed_size + crc_r.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return false; + } - extent_for_each_ptr_decode(e, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == - (s64) m.offset - offset) - return true; + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - return false; + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; } -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) { - union bch_extent_entry *i = ptrs.start; + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (i == entry) - return NULL; + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(*r.v)); + return -BCH_ERR_invalid_bkey; + } - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { + prt_printf(err, "invalid nr_replicas (%u)", + r.v->nr_replicas); + return -BCH_ERR_invalid_bkey; + } + + return 0; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; - bool drop_crc = true; - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; + prt_printf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) - break; +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; - break; - } + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return false; - dst = prev; - } + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; +/* Extent checksum entries: */ - return dst; +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, struct bch_extent_crc_unpacked n) { - return !u.compression_type && + return !crc_is_compressed(u) && u.csum_type && u.uncompressed_size > u.live_size && bch2_csum_type_is_encryption(u.csum_type) == bch2_csum_type_is_encryption(n.csum_type); } -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; - extent_for_each_crc(e, crc, i) + bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; @@ -415,9 +481,9 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; @@ -425,8 +491,8 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { - extent_for_each_crc(extent_i_to_s(e), u, i) - if (!u.compression_type && + bkey_for_each_crc(&k->k, ptrs, u, i) + if (!crc_is_compressed(u) && u.csum_type && u.live_size == u.uncompressed_size) { n = u; @@ -435,17 +501,19 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, return false; } found: - BUG_ON(n.compression_type); + BUG_ON(crc_is_compressed(n)); BUG_ON(n.offset); - BUG_ON(n.live_size != e->k.size); + BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: - extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; - bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(k, &p); ret = true; goto restart_narrow_pointers; } @@ -453,1069 +521,442 @@ restart_narrow_pointers: return ret; } -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } - } + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; + return bch2_bkey_devs(k).nr; +} - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) - return "pointer to invalid device"; +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation + ? bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; +} - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && !crc_is_compressed(p.crc); + } - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; + return ret; +} - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && crc_is_compressed(p.crc)) + ret += p.crc.compressed_size; - return NULL; + return ret; } -static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +bool bch2_bkey_is_incompressible(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - const struct bch_extent_stripe_ptr *ec; - struct bch_dev *ca; - bool first = true; - bkey_extent_entry_for_each(ptrs, entry) { - if (!first) - pr_buf(out, " "); + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + return true; + return false; +} - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - ec = &entry->stripe_ptr; - - pr_buf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); - break; - default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; - } - - first = false; - } -} - -/* Btree ptrs */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; - const char *reason; - - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; - - bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (!extent_entry_is_ptr(entry)) - return "has non ptr field"; - } - - bkey_for_each_ptr(ptrs, ptr) { - reason = extent_ptr_invalid(c, k, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; - } - - return NULL; -} - -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - unsigned seq; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; + struct extent_ptr_decoded p = { 0 }; unsigned replicas = 0; - bool bad; - - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) continue; - err = "stale"; - if (ptr_stale(ca, ptr)) - goto err; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - err = "inconsistent"; - if (bad) - goto err; - } - - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, - "btree key bad (replicas not marked in superblock):\n%s", - buf); - return; - } - - return; -err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); -} - -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const char *invalid; - - bkey_ptrs_to_text(out, c, k); - - invalid = bch2_btree_ptr_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); -} - -/* Extents */ - -bool __bch2_cut_front(struct bpos where, struct bkey_s k) -{ - u64 len = 0; - - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; - - EBUG_ON(bkey_cmp(where, k.k->p) > 0); - - len = k.k->p.offset - where.offset; - - BUG_ON(len > k.k->size); - - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) - k.k->type = KEY_TYPE_deleted; - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); - union bch_extent_entry *entry; - bool seen_crc = false; + if (p.has_ec) + replicas += p.ec.redundancy; - extent_for_each_entry(e, entry) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - if (!seen_crc) - entry->ptr.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } + replicas++; - if (extent_entry_is_crc(entry)) - seen_crc = true; - } } - k.k->size = len; - - return true; + return replicas; } -bool bch2_cut_back(struct bpos where, struct bkey *k) +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - u64 len = 0; - - if (bkey_cmp(where, k->p) >= 0) - return false; - - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); - - len = where.offset - bkey_start_offset(k); - - BUG_ON(len > k->size); - - k->p = where; - k->size = len; + unsigned durability = 0; + struct bch_dev *ca; - if (!len) - k->type = KEY_TYPE_deleted; + if (p->ptr.cached) + return 0; - return true; -} + ca = bch_dev_bkey_exists(c, p->ptr.dev); -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -void bch2_key_resize(struct bkey *k, - unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} + if (ca->mi.state != BCH_MEMBER_STATE_failed) + durability = max_t(unsigned, durability, ca->mi.durability); -static bool extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - struct bkey_packed tmp; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = src->k; - else if (bch2_bkey_pack_key(&tmp, &src->k, f)) - memcpy_u64s(dst, &tmp, f->key_u64s); - else - return false; + if (p->has_ec) + durability += p->ec.redundancy; - memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); - return true; + return durability; } -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - - /* for deleting: */ - struct bkey_i whiteout; - bool update_journal; - bool update_btree; - bool deleting; -}; - -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -static void verify_extent_nonoverlapping(struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; -#endif -} + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c,& p); -static void verify_modified_extent(struct btree_iter *iter, - struct bkey_packed *k) -{ - bch2_btree_iter_verify(iter, iter->l[0].b); - bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); + return durability; } -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; - struct bkey_packed *k; - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(l->b, &l->iter, insert); - - node_iter = l->iter; - k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) - return; - - node_iter = l->iter; - k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) - return; + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); - k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); - bch2_btree_iter_verify(iter, l->b); + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); } -static void extent_insert_committed(struct extent_insert_state *s) +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = s->insert->k; - BKEY_PADDED(k) split; - - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - - bkey_copy(&split.k, insert); - if (s->deleting) - split.k.k.type = KEY_TYPE_discard; - - bch2_cut_back(s->committed, &split.k.k); - - if (!bkey_cmp(s->committed, iter->pos)) - return; - - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - - if (s->update_btree) { - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, - bkey_i_to_s_c(&split.k)); - - EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; - extent_bset_insert(c, iter, &split.k); + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; } - if (s->update_journal) { - bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout); - if (s->deleting) - split.k.k.type = KEY_TYPE_discard; - - bch2_cut_back(s->committed, &split.k.k); + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } - EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); - bch2_btree_journal_key(s->trans, iter, &split.k); + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); } - - bch2_cut_front(s->committed, insert); - - insert->k.needs_whiteout = false; } -void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) { - struct btree *b = iter->l[0].b; - - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + union bch_extent_entry *i = ptrs.start; - bch2_cut_back(b->key.k.p, &k->k); + if (i == entry) + return NULL; - BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; } -enum btree_insert_ret -bch2_extent_can_insert(struct btree_insert *trans, - struct btree_insert_entry *insert, - unsigned *u64s) +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) { - struct btree_iter_level *l = &insert->iter->l[0]; - struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_s_c k; - int sectors; - - BUG_ON(trans->flags & BTREE_INSERT_ATOMIC && - !bch2_extent_is_atomic(&insert->k->k, insert->iter)); - - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; + union bch_extent_entry *next = extent_entry_next(entry); - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); - if (trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; } -static void -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) +/* + * Returns pointer to the next entry after the one being dropped: + */ +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); - break; - - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - bch2_cut_back(bkey_start_pos(&insert->k), k.k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. - */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); - break; - - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; + bool drop_crc = true; - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); - bch2_btree_iter_verify(iter, l->b); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { + break; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; + break; } - - break; } - case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bkey_written(l->b, _k); - - bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); - - __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); - - extent_bset_insert(c, iter, &split.k); - break; - } - } -} - -static void __bch2_insert_fixup_extent(struct extent_insert_state *s) -{ - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); + extent_entry_drop(k, entry); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) break; - s->committed = bpos_min(s->insert->k->k.p, k.k->p); - - if (!bkey_whiteout(k.k)) - s->update_journal = true; - - if (!s->update_journal) { - bch2_cut_front(s->committed, insert); - bch2_cut_front(s->committed, &s->whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (s->deleting && - !s->update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - } - break; + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - s->update_btree = true; - } - - if (s->update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; - } - - extent_squash(s, insert, _k, k, overlap); - - if (!s->update_btree) - bch2_cut_front(s->committed, insert); -next: - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; } - if (bkey_cmp(s->committed, insert->k.p) < 0) - s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p); - - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - { - struct btree_node_iter node_iter = l->iter; - - while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, _k, &s->committed) > 0) - l->iter = node_iter; - } + return ret; } -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { - struct btree_iter *iter = insert->iter; - struct btree *b = iter->l[0].b; - struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = iter->pos, - - .whiteout = *insert->k, - .update_journal = !bkey_whiteout(&insert->k->k), - .update_btree = !bkey_whiteout(&insert->k->k), - .deleting = bkey_whiteout(&insert->k->k), - }; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k->k.size); + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + bch2_bkey_drop_ptr_noerror(k, ptr); /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: */ - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - - __bch2_insert_fixup_extent(&s); - - extent_insert_committed(&s); - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); - - if (insert->k->k.size) { - /* got to the end of this leaf node */ - BUG_ON(bkey_cmp(iter->pos, b->key.k.p)); - return BTREE_INSERT_NEED_TRAVERSE; - } - - return BTREE_INSERT_OK; -} - -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e.s_c, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; } - return NULL; + return ret; } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket_mark mark; - unsigned seq, stale; - char buf[160]; - bool bad; - unsigned replicas = 0; - - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; - - stale = 0; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - /* between mark and bucket gen */ - smp_rmb(); - - stale = ptr_stale(ca, ptr); + struct bch_extent_ptr *ptr; - bch2_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} - bch2_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); - if (stale) - break; + if (ptr) + bch2_bkey_drop_ptr_noerror(k, ptr); +} - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_USER || - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors)); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - if (bad) - goto bad_ptr; - } + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; - if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); - return; - } + return NULL; +} - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, e.s_c, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, - "extent key bad (replicas not marked in superblock):\n%s", - buf); - return; - } +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - return; + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; -bad_ptr: - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i type %u", buf, - PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); + return false; } -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) { - const char *invalid; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - bkey_ptrs_to_text(out, c, k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; - invalid = bch2_extent_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); + return false; } -static void bch2_extent_crc_init(union bch_extent_crc *crc, - struct bch_extent_crc_unpacked new) +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) { -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - ._compressed_size = _crc.compressed_size - 1, \ - ._uncompressed_size = _crc.uncompressed_size - 1, \ - .offset = _crc.offset + if (k1.k->type != k2.k->type) + return false; - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - common_fields(new), - .csum = *((__le32 *) &new.csum.lo), - }; - return; - } + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; - if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - common_fields(new), - .nonce = new.nonce, - .csum_lo = new.csum.lo, - .csum_hi = *((__le16 *) &new.csum.hi), - }; - return; - } + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; - if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - common_fields(new), - .nonce = new.nonce, - .csum = new.csum, - }; - return; + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; + } else { + /* KEY_TYPE_deleted, etc. */ + return true; } -#undef common_fields - BUG(); } -void bch2_extent_crc_append(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked new) +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) { - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return &entry2->ptr; + + return NULL; } -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, - struct extent_ptr_decoded *p) +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bch_extent_crc_unpacked crc; - union bch_extent_entry *pos; - unsigned i; - - extent_for_each_crc(extent_i_to_s(e), crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) - goto found; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; - bch2_extent_crc_append(e, p->crc); - pos = extent_entry_last(extent_i_to_s(e)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ptr)); + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } - for (i = 0; i < p->ec_nr; i++) { - p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; } + + BUG(); } /* @@ -1534,243 +975,404 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - /* will only happen if all pointers were cached: */ - if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_deleted; - - return false; + return bkey_deleted(k.k); } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; + struct bch_dev *ca; + bool first = true; - if (target && extra > 0) - extent_for_each_ptr_decode(e, p, entry) { - int n = bch2_extent_ptr_durability(c, p); + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability(c, k)); - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + prt_printf(out, " "); - if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { - int n = bch2_extent_ptr_durability(c, p); + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); } + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; + + prt_printf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + default: + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; } + + first = false; + } } -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) { - struct bkey_s_extent el = bkey_i_to_s_extent(l); - struct bkey_s_extent er = bkey_i_to_s_extent(r); - union bch_extent_entry *en_l, *en_r; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; + struct bch_dev *ca; - if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) - return BCH_MERGE_NOMERGE; + if (!bch2_dev_exists2(c, ptr->dev)) { + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; + ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) { + prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - !extent_entry_is_ptr(en_l)) - return BCH_MERGE_NOMERGE; + if (bucket >= ca->mi.nbuckets) { + prt_printf(err, "pointer past last bucket (%llu > %llu)", + bucket, ca->mi.nbuckets); + return -BCH_ERR_invalid_bkey; + } - lp = &en_l->ptr; - rp = &en_r->ptr; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { + prt_printf(err, "pointer before first bucket (%llu < %u)", + bucket, ca->mi.first_bucket); + return -BCH_ERR_invalid_bkey; + } - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + if (bucket_offset + size_ondisk > ca->mi.bucket_size) { + prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); + return -BCH_ERR_invalid_bkey; + } - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + return 0; +} - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - } +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; + unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; + bool unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret; - l->k.needs_whiteout |= r->k.needs_whiteout; + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; - } + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { + prt_printf(err, "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + return -BCH_ERR_invalid_bkey; + } - bch2_key_resize(&l->k, l->k.size + r->k.size); + if (bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry)) { + prt_printf(err, "has non ptr field"); + return -BCH_ERR_invalid_bkey; + } - return BCH_MERGE_MERGE; -} + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, + false, err); + if (ret) + return ret; + + if (nr_ptrs && unwritten != entry->ptr.unwritten) { + prt_printf(err, "extent with unwritten and written ptrs"); + return -BCH_ERR_invalid_bkey; + } -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. - */ -static bool bch2_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->l[0].b; - struct btree_node_iter *node_iter = &iter->l[0].iter; - BKEY_PADDED(k) li, ri; - struct bkey_packed *m = back_merge ? l : r; - struct bkey_i *mi = back_merge ? &li.k : &ri.k; - struct bset_tree *t = bch2_bkey_to_bset(b, m); - enum merge_result ret; + if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { + prt_printf(err, "has unwritten ptrs"); + return -BCH_ERR_invalid_bkey; + } - EBUG_ON(bkey_written(b, m)); + if (entry->ptr.cached && have_ec) { + prt_printf(err, "cached, erasure coded ptr"); + return -BCH_ERR_invalid_bkey; + } - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bch2_bkey_unpack(b, &li.k, l); - bch2_bkey_unpack(b, &ri.k, r); + unwritten = entry->ptr.unwritten; + have_ec = false; + crc_since_last_ptr = false; + nr_ptrs++; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - ret = bch2_bkey_merge(c, &li.k, &ri.k); - if (ret == BCH_MERGE_NOMERGE) - return false; + if (crc.offset + crc.live_size > + crc.uncompressed_size) { + prt_printf(err, "checksum offset + key size > uncompressed size"); + return -BCH_ERR_invalid_bkey; + } - /* - * check if we overlap with deleted extents - would break the sort - * order: - */ - if (back_merge) { - struct bkey_packed *n = bkey_next(m); + size_ondisk = crc.compressed_size; - if (n != btree_bkey_last(b, t) && - bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && - bkey_deleted(n)) - return false; - } else if (ret == BCH_MERGE_MERGE) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + if (!bch2_checksum_type_valid(c, crc.csum_type)) { + prt_printf(err, "invalid checksum type"); + return -BCH_ERR_invalid_bkey; + } - if (prev && - bkey_cmp_left_packed_byval(b, prev, - bkey_start_pos(&li.k.k)) > 0) - return false; + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { + prt_printf(err, "invalid compression type"); + return -BCH_ERR_invalid_bkey; + } + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) { + prt_printf(err, "incorrect nonce"); + return -BCH_ERR_invalid_bkey; + } + } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + crc_since_last_ptr = true; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + have_ec = true; + break; + } } - if (ret == BCH_MERGE_PARTIAL) { - if (!extent_i_save(b, m, mi)) - return false; + if (!nr_ptrs) { + prt_str(err, "no ptrs"); + return -BCH_ERR_invalid_bkey; + } - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - } else { - if (!extent_i_save(b, m, &li.k)) - return false; + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { + prt_str(err, "too many ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; } - bch2_bset_fix_invalidated_key(b, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - m, m->u64s, m->u64s); - verify_modified_extent(iter, m); + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } - return ret == BCH_MERGE_MERGE; + return 0; } -int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) +void bch2_ptr_swab(struct bkey_s k) { - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - int ret = 0; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + u64 *d; - end.offset += size; + for (d = (u64 *) ptrs.start; + d != (u64 *) ptrs.end; + d++) + *d = swab64(*d); - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + for (entry = ptrs.start; + entry < ptrs.end; + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: break; - - if (!bch2_extent_is_fully_allocated(k)) { - ret = -ENOSPC; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: break; } } - bch2_btree_iter_unlock(&iter); - - return ret; } -/* KEY_TYPE_reservation: */ +/* Generic extent code: */ -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 sub; - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; + if (bkey_le(where, bkey_start_pos(k.k))) + return 0; - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; + EBUG_ON(bkey_gt(where, k.k->p)); - return NULL; -} + sub = where.offset - bkey_start_offset(k.k); -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + k.k->size -= sub; - pr_buf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); + if (!k.k->size) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + bool seen_crc = false; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += sub; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } + + if (extent_entry_is_crc(entry)) + seen_crc = true; + } + + break; + } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); + + sub = min_t(u64, sub << 9, bytes); + + memmove(p, p + sub, bytes - sub); + + new_val_u64s -= sub >> 3; + break; + } + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) +int bch2_cut_back_s(struct bpos where, struct bkey_s k) { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) - return BCH_MERGE_NOMERGE; + if (bkey_ge(where, k.k->p)) + return 0; - l->k.needs_whiteout |= r->k.needs_whiteout; + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; + len = where.offset - bkey_start_offset(k.k); + + k.k->p.offset = where.offset; + k.k->size = len; + + if (!len) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + switch (k.k->type) { + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; + break; } - bch2_key_resize(&l->k, l->k.size + r->k.size); + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - return BCH_MERGE_MERGE; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; }