X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fextents.c;h=4fc581be7aaf545ab8ebd8b5d1f018bb768f62c0;hb=1ee7dc7a55273d34358a0ee525a9e823c999ffe6;hp=37470f86e588f15d20410a66645109cff1560352;hpb=90ef8b9f57c9114e82c41aef43db80776bbfaf82;p=bcachefs-tools-debian diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 37470f8..4fc581b 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet * @@ -8,287 +9,465 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "debug.h" -#include "dirent.h" +#include "disk_groups.h" #include "error.h" #include "extents.h" #include "inode.h" #include "journal.h" +#include "replicas.h" #include "super.h" #include "super-io.h" #include "util.h" -#include "xattr.h" #include -static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, - struct bkey_i *, struct bkey_i *); +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); -static void sort_key_next(struct btree_node_iter *iter, - struct btree *b, - struct btree_node_iter_set *i) +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; + +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); + +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) +{ + struct bch_dev_io_failures *i; + + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; + + return NULL; +} + +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) { - i->k += __btree_node_offset_to_key(b, i->k)->u64s; + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - if (i->k == i->end) - *i = iter->data[--iter->used]; + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } } /* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. 
+ * returns true if p1 is better than p2: */ -#define key_sort_cmp(h, l, r) \ -({ \ - bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)) \ - \ - ?: (l).k - (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter *iter, - struct btree *b) +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) { - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); - if (bkey_whiteout(k)) - return true; + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - if (iter->used < 2) - return false; + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } - if (iter->used > 2 && - key_sort_cmp(iter, r[0], r[1]) >= 0) - r++; + if (bch2_force_reconstruct_read) + return p1.idx > p2.idx; - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. - */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); + return p1.idx < p2.idx; } -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter *iter) +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. + */ +int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) { - struct bkey_packed *out = dst->start; - struct btree_nr_keys nr; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; + struct bch_dev *ca; + int ret = 0; - memset(&nr, 0, sizeof(nr)); + if (k.k->type == KEY_TYPE_error) + return -EIO; - heap_resort(iter, key_sort_cmp); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; - while (!bch2_btree_node_iter_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - bkey_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } + /* + * If there are any dirty pointers it's an error if we can't + * read: + */ + if (!ret && !p.ptr.cached) + ret = -EIO; - sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp); - } + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + continue; - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f) + p.idx = f->nr_failed < f->nr_retries + ? 
f->idx + : f->idx + 1; -/* Common among btree and extent ptrs */ + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) -{ - const struct bch_extent_ptr *ptr; + if (bch2_force_reconstruct_read && + !p.idx && p.has_ec) + p.idx++; - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + if (p.idx >= (unsigned) p.has_ec + 1) + continue; - return NULL; + if (ret > 0 && !ptr_better(c, p, *pick)) + continue; + + *pick = p; + ret = 1; + } + + return ret; } -bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) { - struct bch_extent_ptr *ptr; - bool dropped = false; + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + return -BCH_ERR_invalid_bkey; + } - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} - if (dropped) - bch2_extent_drop_redundant_crcs(e); - return dropped; +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); } -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) { - const struct bch_extent_ptr *ptr; + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { + prt_printf(err, "value too small (%zu <= %zu)", + bkey_val_bytes(k.k), sizeof(*bp.v)); + return -BCH_ERR_invalid_bkey; + } - if (ca->mi.group && - ca->mi.group == group) - return ptr; + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + prt_printf(err, "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + return -BCH_ERR_invalid_bkey; } - return NULL; + if (c->sb.version < bcachefs_metadata_version_snapshot && + bp.v->min_key.snapshot) { + prt_printf(err, "invalid min_key.snapshot (%u != 0)", + bp.v->min_key.snapshot); + return -BCH_ERR_invalid_bkey; + } + + return bch2_bkey_ptrs_invalid(c, k, flags, err); } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - const struct bch_extent_ptr *ptr; + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - extent_for_each_ptr(e, ptr) - if (dev_in_target(c->devs[ptr->dev], target)) - return ptr; + prt_printf(out, "seq %llx written %u min_key %s", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? 
"R " : ""); - return NULL; + bch2_bpos_to_text(out, bp.v->min_key); + prt_printf(out, " "); + bch2_bkey_ptrs_to_text(out, c, k); } -unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) +void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) { - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); - extent_for_each_ptr(e, ptr) - nr_ptrs++; + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); - return nr_ptrs; + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); } -unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return false; - extent_for_each_ptr(e, ptr) - nr_ptrs += !ptr->cached; - break; + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - case BCH_RESERVATION: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || + lp.has_ec != rp.has_ec) + return false; + + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; + + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; + + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; + + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + } + + en_l = extent_entry_next(en_l); + en_r 
= extent_entry_next(en_r); } - return nr_ptrs; -} + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e) -{ - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + if (crc_l.uncompressed_size + crc_r.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } - extent_for_each_ptr(e, ptr) - nr_ptrs += (!ptr->cached && - bch_dev_bkey_exists(c, ptr->dev)->mi.state != - BCH_MEMBER_STATE_FAILED); + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - return nr_ptrs; + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; } -unsigned bch2_extent_is_compressed(struct bkey_s_c k) +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - unsigned ret = 0; + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc.compression_type != BCH_COMPRESSION_NONE && - crc.compressed_size < crc.live_size) - ret = max_t(unsigned, ret, crc.compressed_size); + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(*r.v)); + return -BCH_ERR_invalid_bkey; } - return ret; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { + prt_printf(err, "invalid nr_replicas (%u)", + r.v->nr_replicas); + return -BCH_ERR_invalid_bkey; + } + + return 0; } -bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_extent_ptr m, u64 offset) +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - - extent_for_each_ptr_crc(e, ptr, crc) - if (ptr->dev == m.dev && - ptr->gen 
== m.gen && - (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == - (s64) m.offset - offset) - return ptr; + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - return NULL; + prt_printf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); } -/* Doesn't cleanup redundant crcs */ -void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { - EBUG_ON(ptr < &e.v->start->ptr || - ptr >= &extent_entry_last(e)->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; } -void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) { - __bch2_extent_drop_ptr(e, ptr); - bch2_extent_drop_redundant_crcs(e); + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, struct bch_extent_crc_unpacked n) { - return !u.compression_type && + return !crc_is_compressed(u) && u.csum_type && u.uncompressed_size > u.live_size && bch2_csum_type_is_encryption(u.csum_type) == bch2_csum_type_is_encryption(n.csum_type); } -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; - extent_for_each_crc(e, crc, i) + bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; @@ -304,1655 +483,483 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; - struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; union bch_extent_entry *i; + bool ret = false; /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) - extent_for_each_crc(extent_i_to_s(e), u, i) - if (!u.compression_type && + if (!n.csum_type) { + bkey_for_each_crc(&k->k, ptrs, u, i) + if (!crc_is_compressed(u) && u.csum_type && u.live_size == u.uncompressed_size) { n = u; - break; + goto found; } - - if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) return false; - - BUG_ON(n.compression_type); + } +found: + BUG_ON(crc_is_compressed(n)); BUG_ON(n.offset); - BUG_ON(n.live_size != e->k.size); + BUG_ON(n.live_size != k->k.size); - bch2_extent_crc_append(e, n); 
restart_narrow_pointers: - extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) - if (can_narrow_crc(u, n)) { - ptr->offset += u.offset; - extent_ptr_append(e, *ptr); - __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) + if (can_narrow_crc(p.crc, n)) { + __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(k, &p); + ret = true; goto restart_narrow_pointers; } - bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); - return true; + return ret; } -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { - union bch_extent_entry *entry = e.v->start; - union bch_extent_crc *crc, *prev = NULL; - struct bch_extent_crc_unpacked u, prev_u; - - while (entry != extent_entry_last(e)) { - union bch_extent_entry *next = extent_entry_next(entry); - size_t crc_u64s = extent_entry_u64s(entry); - - if (!extent_entry_is_crc(entry)) - goto next; - - crc = entry_to_crc(entry); - u = bch2_extent_crc_unpack(e.k, crc); - - if (next == extent_entry_last(e)) { - /* crc entry with no pointers after it: */ - goto drop; - } +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset - if (extent_entry_is_crc(next)) { - /* no pointers before next crc entry: */ - goto drop; - } - - if (prev && !memcmp(&u, &prev_u, sizeof(u))) { - /* identical to previous crc entry: */ - goto drop; - } + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields +} - if (!prev && - !u.csum_type && - !u.compression_type) { - /* null crc entry: */ - union bch_extent_entry *e2; +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; - extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { - if (!extent_entry_is_ptr(e2)) - break; + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); - e2->ptr.offset += u.offset; - } - goto drop; - } + bch2_extent_crc_pack(crc, new, type); - prev = crc; - prev_u = u; -next: - entry = next; - continue; -drop: - memmove_u64s_down(crc, next, - (u64 *) extent_entry_last(e) - (u64 *) next); - e.k->u64s -= crc_u64s; - } + 
k->k.u64s += extent_entry_u64s(ptrs.end); - EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -static bool should_drop_ptr(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { - return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr); + return bch2_bkey_devs(k).nr; } -static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) { - struct bch_extent_ptr *ptr = &e.v->start->ptr; - bool dropped = false; - - while ((ptr = extent_ptr_next(e, ptr))) - if (should_drop_ptr(c, e.c, ptr)) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } else - ptr++; - - if (dropped) - bch2_extent_drop_redundant_crcs(e); + return k.k->type == KEY_TYPE_reservation + ? bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; } -static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk, - struct bkey_s k) +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) { - return bch2_extent_normalize(c, k); + unsigned ret = 0; + + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && !crc_is_compressed(p.crc); + } + + return ret; } -static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) { - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && crc_is_compressed(p.crc)) + ret += p.crc.compressed_size; - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_ptr: - break; - } - } - break; - } - } + return ret; } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +bool bch2_bkey_is_incompressible(struct bkey_s_c k) { - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) - return "pointer to invalid device"; + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + 
return true; + return false; +} - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; - extent_for_each_ptr(e, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; + if (p.has_ec) + replicas += p.ec.redundancy; - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; + replicas++; - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; + } - return NULL; + return replicas; } -static size_t extent_print_ptrs(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c_extent e) +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - char *out = buf, *end = buf + size; - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; + unsigned durability = 0; struct bch_dev *ca; - bool first = true; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + if (p->ptr.cached) + return 0; - extent_for_each_entry(e, entry) { - if (!first) - p(" "); + ca = bch_dev_bkey_exists(c, p->ptr.dev); - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); - break; - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; + if (ca->mi.state != BCH_MEMBER_STATE_failed) + durability = max_t(unsigned, durability, ca->mi.durability); - p("ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ca && ptr_stale(ca, ptr) - ? 
" stale" : ""); - break; - default: - p("(invalid extent entry %.16llx)", *((u64 *) entry)); - goto out; - } + if (p->has_ec) + durability += p->ec.redundancy; - first = false; - } -out: - if (bkey_extent_is_cached(e.k)) - p(" cached"); -#undef p - return out - buf; + return durability; } -static inline bool dev_latency_better(struct bch_dev *dev1, - struct bch_dev *dev2) +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) { - unsigned l1 = atomic_read(&dev1->latency[READ]); - unsigned l2 = atomic_read(&dev2->latency[READ]); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; - /* Pick at random, biased in favor of the faster device: */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c,& p); - return bch2_rand_range(l1 + l2) > l1; + return durability; } -static void extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - - extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); - if (ptr->cached && ptr_stale(ca, ptr)) - continue; + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); +} - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - continue; +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; - if (avoid) { - if (test_bit(ca->dev_idx, avoid->d)) - continue; + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } - if (pick->ca && - test_bit(pick->ca->dev_idx, avoid->d)) - goto use; + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; } - if (pick->ca && !dev_latency_better(ca, pick->ca)) - continue; -use: - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (pick->ca) - percpu_ref_put(&pick->ca->io_ref); + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); - *pick = (struct extent_pick_ptr) { - .ptr = *ptr, - .crc = crc, - .ca = ca, - }; + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); } } -/* Btree ptrs */ +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} -static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) { - if (bkey_extent_is_cached(k.k)) - return "cached"; + union bch_extent_entry *next = extent_entry_next(entry); - if (k.k->size) - return "nonzero key size"; + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == 
KEY_TYPE_stripe); - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} - switch (k.k->type) { - case BCH_EXTENT: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; - const char *reason; +/* + * Returns pointer to the next entry after the one being dropped: + */ +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; + bool drop_crc = true; - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - if (extent_entry_is_crc(entry)) - return "has crc field"; + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { + break; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; + break; } + } - extent_for_each_ptr(e, ptr) { - reason = extent_ptr_invalid(c, e, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; - } + extent_entry_drop(k, entry); - return NULL; - } + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; - default: - return "invalid value type"; + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } } + + return ret; } -static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - unsigned seq; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - unsigned replicas = 0; - bool bad; - - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - - if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) - continue; - - err = "stale"; - if (ptr_stale(ca, ptr)) - goto err; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - err = "inconsistent"; - if (bad) - goto err; - } - - if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); - bch2_fs_bug(c, - "btree key bad (replicas not marked in superblock):\n%s", - buf); - return; - } - - return; -err: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " - "gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.counter); -} - -static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; - const char *invalid; - -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - - invalid = bch2_btree_ptr_invalid(c, k); - if (invalid) - p(" invalid: %s", invalid); -#undef p -} - -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid) -{ - struct extent_pick_ptr pick = { .ca = NULL }; - - extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, &pick); - - return pick; -} - -const struct bkey_ops bch2_bkey_btree_ops = { - .key_invalid = bch2_btree_ptr_invalid, - .key_debugcheck = btree_ptr_debugcheck, - .val_to_text = bch2_btree_ptr_to_text, - .swab = bch2_ptr_swab, -}; - -/* Extents */ - -static bool __bch2_cut_front(struct bpos where, struct bkey_s k) -{ - u64 len = 0; - - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; - - EBUG_ON(bkey_cmp(where, k.k->p) > 0); - - len = k.k->p.offset - where.offset; - - BUG_ON(len > k.k->size); - - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) - __set_bkey_deleted(k.k); - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); - union bch_extent_entry *entry; - bool seen_crc = false; - - extent_for_each_entry(e, entry) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - if (!seen_crc) - entry->ptr.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += e.k->size - len; - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += e.k->size - len; - break; - } - - if (extent_entry_is_crc(entry)) - seen_crc = true; - } - } - - k.k->size = len; - - return true; -} - -bool bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - return __bch2_cut_front(where, bkey_i_to_s(k)); -} - -bool bch2_cut_back(struct bpos where, struct bkey *k) -{ - u64 len = 0; - - if (bkey_cmp(where, k->p) >= 0) - return false; - - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); - - len = where.offset - bkey_start_offset(k); - - BUG_ON(len > k->size); - - k->p = where; - k->size = len; - - if (!len) - __set_bkey_deleted(k); - - return true; -} - -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -void bch2_key_resize(struct bkey *k, - unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. 
- */ -static bool __extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - bool ret; - - if ((dst_unpacked = packed_to_bkey(dst))) { - dst_unpacked->k = *src; - ret = true; - } else { - ret = bch2_bkey_pack_key(dst, src, f); - } - - if (ret && iter) - bch2_verify_key_order(b, iter, dst); - - return ret; -} - -static void extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) -{ - BUG_ON(!__extent_save(b, iter, dst, src)); -} - -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. - */ -#define extent_sort_cmp(h, l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - bkey_cmp(bkey_start_pos(&_ul), \ - bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter *iter, - struct btree *b, size_t i) -{ - heap_sift_down(iter, i, extent_sort_cmp); -} - -static inline void extent_sort_next(struct btree_node_iter *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp); -} - -static void extent_sort_append(struct bch_fs *c, - struct btree *b, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, - struct bkey_packed *k) -{ - struct bkey_format *f = &b->format; - BKEY_PADDED(k) tmp; - - if (bkey_whiteout(k)) - return; - - bch2_bkey_unpack(b, &tmp.k, k); - - if (*prev && - bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) - return; - - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; - } - - bkey_copy(*prev, &tmp.k); -} - -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter *iter) -{ - struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *out, *lk, *rk; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, extent_sort_cmp); - - while (!bch2_btree_node_iter_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - - if (iter->used == 1) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(iter, _r[0], _r[1]) >= 0) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - - l = __bkey_disassemble(b, lk, &l_unpacked); - r = __bkey_disassemble(b, rk, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_sort_next(iter, b, _r); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. 
- */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); - } else { - __bch2_cut_front(l.k->p, r); - extent_save(b, NULL, rk, r.k); - } - - extent_sort_sift(iter, b, _r - iter->data); - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); - - __bch2_cut_front(r.k->p, l); - extent_save(b, NULL, lk, l.k); - - extent_sort_sift(iter, b, 0); - - extent_sort_append(c, b, &nr, dst->start, &prev, - bkey_to_packed(&tmp.k)); - } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, NULL, lk, l.k); - } - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = dst->start; - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - struct bch_fs_usage stats; - - /* for deleting: */ - struct bkey_i whiteout; - bool do_journal; - bool deleting; -}; - -static void bch2_add_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - struct bch_fs *c = s->trans->c; - struct btree *b = s->insert->iter->l[0].b; - - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); - - if (!sectors) - return; - - bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq, 0); -} - -static void bch2_subtract_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - bch2_add_sectors(s, k, offset, -sectors); -} - -/* These wrappers subtract exactly the sectors that we're removing from @k */ -static void bch2_cut_subtract_back(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch2_subtract_sectors(s, k.s_c, where.offset, - k.k->p.offset - where.offset); - bch2_cut_back(where, k.k); -} - -static void bch2_cut_subtract_front(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), - where.offset - bkey_start_offset(k.k)); - __bch2_cut_front(where, k); -} - -static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) -{ - if (k.k->size) - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - k.k->size = 0; - __set_bkey_deleted(k.k); -} - -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) - -static enum btree_insert_ret -extent_insert_should_stop(struct extent_insert_state *s) -{ - struct btree *b = s->insert->iter->l[0].b; - - /* - * Check if we have sufficient space in both the btree node and the - * journal reservation: - * - * Each insert checks for room in the journal entry, but we check for - * room in the btree node up-front. In the worst case, bkey_cmpxchg() - * will insert two keys, and one iteration of this room will insert one - * key, so we need room for three keys. 
- */ - if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; - else if (!journal_res_insert_fits(s->trans, s->insert)) - return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ - else - return BTREE_INSERT_OK; -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bset_tree *t = bset_tree_last(l->b); - struct bkey_packed *where = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); - struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where); - struct bkey_packed *next_live_key = where; - unsigned clobber_u64s; - - if (prev) - where = bkey_next(prev); - - while (next_live_key != btree_bkey_last(l->b, t) && - bkey_deleted(next_live_key)) - next_live_key = bkey_next(next_live_key); - - /* - * Everything between where and next_live_key is now deleted keys, and - * is overwritten: - */ - clobber_u64s = (u64 *) next_live_key - (u64 *) where; - - if (prev && - bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) - goto drop_deleted_keys; - - if (next_live_key != btree_bkey_last(l->b, t) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), - next_live_key, false)) - goto drop_deleted_keys; - - bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, - clobber_u64s, where->u64s); - return; -drop_deleted_keys: - bch2_bset_delete(l->b, where, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - where, clobber_u64s, 0); -} - -static void extent_insert_committed(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = !s->deleting - ? s->insert->k - : &s->whiteout; - BKEY_PADDED(k) split; - - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - - if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) - return; - - if (s->deleting && !s->do_journal) { - bch2_cut_front(s->committed, insert); - goto done; - } - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - bkey_copy(&split.k, insert); - - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - bkey_cmp(s->committed, insert->k.p) && - bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { - /* XXX: possibly need to increase our reservation? 
*/ - bch2_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - bch2_cut_front(s->committed, insert); - bch2_add_sectors(s, bkey_i_to_s_c(insert), - bkey_start_offset(&insert->k), - insert->k.size); - } else { - bch2_cut_back(s->committed, &split.k.k); - bch2_cut_front(s->committed, insert); - } - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); - - bch2_btree_journal_key(s->trans, iter, &split.k); - - if (!s->deleting) - extent_bset_insert(c, iter, &split.k); -done: - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - - insert->k.needs_whiteout = false; - s->do_journal = false; - s->trans->did_work = true; -} - -static enum btree_insert_ret -__extent_insert_advance_pos(struct extent_insert_state *s, - struct bpos next_pos, - struct bkey_s_c k) -{ - struct extent_insert_hook *hook = s->trans->hook; - enum btree_insert_ret ret; - - if (hook) - ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); - else - ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); - - if (ret == BTREE_INSERT_OK) - s->committed = next_pos; - - return ret; -} - -/* - * Update iter->pos, marking how much of @insert we've processed, and call hook - * fn: - */ -static enum btree_insert_ret -extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) -{ - struct btree *b = s->insert->iter->l[0].b; - struct bpos next_pos = bpos_min(s->insert->k->k.p, - k.k ? k.k->p : b->key.k.p); - enum btree_insert_ret ret; - - if (race_fault()) - return BTREE_INSERT_NEED_TRAVERSE; - - /* hole? */ - if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null); - if (ret != BTREE_INSERT_OK) - return ret; - } - - /* avoid redundant calls to hook fn: */ - if (!bkey_cmp(s->committed, next_pos)) - return BTREE_INSERT_OK; - - return __extent_insert_advance_pos(s, next_pos, k); -} - -static enum btree_insert_ret -extent_insert_check_split_compressed(struct extent_insert_state *s, - struct bkey_s_c k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - unsigned sectors; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - - if (s->trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; - - switch (bch2_disk_reservation_add(c, - s->trans->disk_res, - sectors * bch2_extent_nr_dirty_ptrs(k), - flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} - -static enum btree_insert_ret -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - enum btree_insert_ret ret; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - bch2_cut_subtract_front(s, insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); - break; - - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); - BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); - - /* 
- * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. - */ - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); - break; - - case BCH_EXTENT_OVERLAP_ALL: { - struct bpos orig_pos = k.k->p; - - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - - bch2_drop_subtract(s, k); - k.k->p = bkey_start_pos(&insert->k); - if (!__extent_save(b, node_iter, _k, k.k)) { - /* - * Couldn't repack: we aren't necessarily able - * to repack if the new key is outside the range - * of the old extent, so we have to split - * @insert: - */ - k.k->p = orig_pos; - extent_save(b, node_iter, _k, k.k); - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret != BTREE_INSERT_OK) - return ret; - - extent_insert_committed(s); - /* - * We split and inserted upto at k.k->p - that - * has to coincide with iter->pos, so that we - * don't have anything more we have to insert - * until we recheck our journal reservation: - */ - EBUG_ON(bkey_cmp(s->committed, k.k->p)); - } else { - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); - } - - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); - - bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); - - bch2_cut_subtract_front(s, insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); - - bch2_add_sectors(s, bkey_i_to_s_c(&split.k), - bkey_start_offset(&split.k.k), - split.k.k.size); - extent_bset_insert(c, iter, &split.k); - break; - } - } - - return BTREE_INSERT_OK; -} - -static enum btree_insert_ret -bch2_delete_fixup_extent(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - s->whiteout = *insert; - s->do_journal = false; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (bkey_whiteout(k.k)) { - s->committed = bpos_min(insert->k.p, k.k->p); - goto next; - } - - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if 
(ret != BTREE_INSERT_OK) - goto stop; - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) - goto stop; - - s->do_journal = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, t, _k); - } else if (k.k->needs_whiteout || - bset_written(b, bset(b, t))) { - struct bkey_i discard = *insert; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch2_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch2_cut_back(k.k->p, &discard.k); - break; - default: - break; - } - - discard.k.needs_whiteout = true; - - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - - extent_bset_insert(c, iter, &discard); - } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - } -next: - bch2_cut_front(s->committed, insert); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - } - - if (ret == BTREE_INSERT_OK && - bkey_cmp(s->committed, insert->k.p) < 0) - ret = extent_insert_advance_pos(s, bkey_s_c_null); -stop: - extent_insert_committed(s); - - bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res, - gc_pos_btree_node(b)); - - EBUG_ON(bkey_cmp(iter->pos, s->committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); - - bch2_cut_front(iter->pos, insert); - - if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; - - EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK); - - return ret; -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. 
- */ -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = insert->iter->pos, - .deleting = bkey_whiteout(&insert->k->k), - }; - - EBUG_ON(iter->level); - EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); - - if (s.deleting) - return bch2_delete_fixup_extent(&s); - - /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: - */ - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - - while (bkey_cmp(s.committed, insert->k->k.p) < 0 && - (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - ret = extent_insert_check_split_compressed(&s, k.s_c, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - - if (!k.k->size) - goto squash; - - /* - * Only call advance pos & call hook for nonzero size extents: - */ - ret = extent_insert_advance_pos(&s, k.s_c); - if (ret != BTREE_INSERT_OK) - goto stop; - - if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, t)))) - insert->k->k.needs_whiteout = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); - _k->needs_whiteout = false; - } -squash: - ret = extent_squash(&s, insert->k, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - } - - if (ret == BTREE_INSERT_OK && - bkey_cmp(s.committed, insert->k->k.p) < 0) - ret = extent_insert_advance_pos(&s, bkey_s_c_null); -stop: - extent_insert_committed(&s); - /* - * Subtract any remaining sectors from @insert, if we bailed out early - * and didn't fully insert @insert: - */ - if (insert->k->k.size && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - - bch2_fs_usage_apply(c, &s.stats, trans->disk_res, - gc_pos_btree_node(b)); - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); - - if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; - - EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK); - - return ret; -} - -static const char *bch2_extent_invalid(const struct bch_fs *c, - struct bkey_s_c k) -{ - if (bkey_val_u64s(k.k) > 
BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - if (!k.k->size) - return "zero key size"; - - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (extent_entry_is_crc(entry)) { - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - } else { - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - } - } - - return NULL; - } - - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; - } - - default: - return "invalid value type"; - } -} - -static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, - struct bkey_s_c_extent e) -{ - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket_mark mark; - unsigned seq, stale; - char buf[160]; - bool bad; - unsigned replicas = 0; + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + __bch2_bkey_drop_ptr(k, ptr); /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; + return ret; +} - stale = 0; +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} - /* between mark and bucket gen */ - smp_rmb(); +void 
bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); - stale = ptr_stale(ca, ptr); + if (ptr) + __bch2_bkey_drop_ptr(k, ptr); +} - bch2_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - bch2_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; - if (stale) - break; + return NULL; +} - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_USER || - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors)); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - if (bad) - goto bad_ptr; - } + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; - if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch2_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); - return; - } + return false; +} - if (!bkey_extent_is_cached(e.k) && - !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), e.s_c); - bch2_fs_bug(c, - "extent key bad (replicas not marked in superblock):\n%s", - buf); - return; - } +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - return; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; -bad_ptr: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i type %u", buf, - PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); - return; + return false; } -static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); - break; - case BCH_RESERVATION: - break; - default: - BUG(); - } -} + if (k1.k->type != k2.k->type) + return false; -static void bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; - const char *invalid; + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; - invalid = bch2_extent_invalid(c, k); - if (invalid) - p(" invalid: %s", invalid); -#undef p + return false; + } else { + /* KEY_TYPE_deleted, etc. */ + return true; + } } -static void bch2_extent_crc_init(union bch_extent_crc *crc, - struct bch_extent_crc_unpacked new) +bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, + struct bkey_s_c k2) { -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - ._compressed_size = _crc.compressed_size - 1, \ - ._uncompressed_size = _crc.uncompressed_size - 1, \ - .offset = _crc.offset - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - common_fields(new), - .csum = *((__le32 *) &new.csum.lo), - }; - return; - } + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; - if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - common_fields(new), - .nonce = new.nonce, - .csum_lo = new.csum.lo, - .csum_hi = *((__le16 *) &new.csum.hi), - }; - return; - } + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; - if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - common_fields(new), - .nonce = new.nonce, - .csum = new.csum, - }; - return; - } -#undef common_fields - BUG(); + return false; } -void bch2_extent_crc_append(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked new) +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; - BUG_ON(new.compressed_size > new.uncompressed_size); - BUG_ON(new.live_size != e->k.size); - BUG_ON(!new.compressed_size || !new.uncompressed_size); - - /* - * Look up the last crc entry, so we can check if we need to add - * another: - */ - extent_for_each_crc(extent_i_to_s(e), crc, i) - ; + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } - if (!memcmp(&crc, &new, sizeof(crc))) - return; + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - 
__extent_entry_push(e); + BUG(); } /* @@ -1965,436 +972,407 @@ void bch2_extent_crc_append(struct bkey_i_extent *e, */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bkey_s_extent e; + struct bch_extent_ptr *ptr; - switch (k.k->type) { - case KEY_TYPE_ERROR: - return false; + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - case KEY_TYPE_DELETED: - case KEY_TYPE_COOKIE: - return true; + return bkey_deleted(k.k); +} - case KEY_TYPE_DISCARD: - return bversion_zero(k.k->version); +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; + struct bch_dev *ca; + bool first = true; - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_to_extent(k); + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + prt_printf(out, " "); - bch2_extent_drop_stale(c, e); + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; - if (!bkey_val_u64s(e.k)) { - if (bkey_extent_is_cached(e.k)) { - k.k->type = KEY_TYPE_DISCARD; - if (bversion_zero(k.k->version)) - return true; + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); } else { - k.k->type = KEY_TYPE_ERROR; + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); } + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; + + prt_printf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + default: + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; } - return false; - case BCH_RESERVATION: - return false; - default: - BUG(); + first = false; } } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned nr_desired_replicas, - unsigned target) +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) { - struct bch_extent_ptr *ptr; - unsigned nr_cached = 0, nr_good = bch2_extent_nr_good_ptrs(c, e.c); - - if (nr_good <= nr_desired_replicas) - return; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; + struct bch_dev *ca; - nr_cached = nr_good - nr_desired_replicas; + if (!bch2_dev_exists2(c, ptr->dev)) { + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } - extent_for_each_ptr(e, ptr) - if (!ptr->cached && - !dev_in_target(c->devs[ptr->dev], 
target)) { - ptr->cached = true; - nr_cached--; - if (!nr_cached) - return; + ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) { + prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; } -} - -/* - * This picks a non-stale pointer, preferably from a device other than @avoid. - * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to - * other devices, it will still pick a pointer from avoid. - */ -void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *ret) -{ - struct bkey_s_c_extent e; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_COOKIE: - ret->ca = NULL; - return; - - case KEY_TYPE_ERROR: - ret->ca = ERR_PTR(-EIO); - return; - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - ret->ca = NULL; + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret); - - if (!ret->ca && !bkey_extent_is_cached(e.k)) - ret->ca = ERR_PTR(-EIO); - return; + if (bucket >= ca->mi.nbuckets) { + prt_printf(err, "pointer past last bucket (%llu > %llu)", + bucket, ca->mi.nbuckets); + return -BCH_ERR_invalid_bkey; + } - case BCH_RESERVATION: - ret->ca = NULL; - return; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { + prt_printf(err, "pointer before first bucket (%llu < %u)", + bucket, ca->mi.first_bucket); + return -BCH_ERR_invalid_bkey; + } - default: - BUG(); + if (bucket_offset + size_ondisk > ca->mi.bucket_size) { + prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); + return -BCH_ERR_invalid_bkey; } + + return 0; } -static enum merge_result bch2_extent_merge(struct bch_fs *c, - struct btree *bk, - struct bkey_i *l, struct bkey_i *r) +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) { - struct bkey_s_extent el, er; - union bch_extent_entry *en_l, *en_r; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; + unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; + bool unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret; - if (key_merging_disabled(c)) - return BCH_MERGE_NOMERGE; + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); - /* - * Generic header checks - * Assumes left and right are in order - * Left and right must be exactly aligned - */ + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { + prt_printf(err, "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + return -BCH_ERR_invalid_bkey; + } - if (l->k.u64s != r->k.u64s || - l->k.type != r->k.type || - bversion_cmp(l->k.version, r->k.version) || - bkey_cmp(l->k.p, bkey_start_pos(&r->k))) - return BCH_MERGE_NOMERGE; - - switch (l->k.type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_ERROR: - /* These types are mergeable, and no val to check */ - break; + if (bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry)) { + prt_printf(err, "has non ptr field"); + return -BCH_ERR_invalid_bkey; + } + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, + 
false, err); + if (ret) + return ret; - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - el = bkey_i_to_s_extent(l); - er = bkey_i_to_s_extent(r); + if (nr_ptrs && unwritten != entry->ptr.unwritten) { + prt_printf(err, "extent with unwritten and written ptrs"); + return -BCH_ERR_invalid_bkey; + } - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; + if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { + prt_printf(err, "has unwritten ptrs"); + return -BCH_ERR_invalid_bkey; + } - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + if (entry->ptr.cached && have_ec) { + prt_printf(err, "cached, erasure coded ptr"); + return -BCH_ERR_invalid_bkey; + } - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) - return BCH_MERGE_NOMERGE; + unwritten = entry->ptr.unwritten; + have_ec = false; + crc_since_last_ptr = false; + nr_ptrs++; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - lp = &en_l->ptr; - rp = &en_r->ptr; + if (crc.offset + crc.live_size > + crc.uncompressed_size) { + prt_printf(err, "checksum offset + key size > uncompressed size"); + return -BCH_ERR_invalid_bkey; + } - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + size_ondisk = crc.compressed_size; - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + if (!bch2_checksum_type_valid(c, crc.csum_type)) { + prt_printf(err, "invalid checksum type"); + return -BCH_ERR_invalid_bkey; + } - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - } + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { + prt_printf(err, "invalid compression type"); + return -BCH_ERR_invalid_bkey; + } - break; - case BCH_RESERVATION: { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) { + prt_printf(err, "incorrect nonce"); + return -BCH_ERR_invalid_bkey; + } + } - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) - return BCH_MERGE_NOMERGE; - break; + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + crc_since_last_ptr = true; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + have_ec = true; + break; + } } - default: - return BCH_MERGE_NOMERGE; + + if (!nr_ptrs) { + prt_str(err, "no ptrs"); + return -BCH_ERR_invalid_bkey; } - l->k.needs_whiteout |= r->k.needs_whiteout; + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { + prt_str(err, "too many ptrs"); + return -BCH_ERR_invalid_bkey; + } - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; } - bch2_key_resize(&l->k, l->k.size + r->k.size); + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } - return BCH_MERGE_MERGE; + return 0; } -static void 
extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) +void bch2_ptr_swab(struct bkey_s k) { - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + u64 *d; - BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); - - /* - * We don't want the bch2_verify_key_order() call in extent_save(), - * because we may be out of order with deleted keys that are about to be - * removed by extent_bset_insert() - */ + for (d = (u64 *) ptrs.start; + d != (u64 *) ptrs.end; + d++) + *d = swab64(*d); - if ((dst_unpacked = packed_to_bkey(dst))) - bkey_copy(dst_unpacked, src); - else - BUG_ON(!bch2_bkey_pack(dst, src, f)); + for (entry = ptrs.start; + entry < ptrs.end; + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } + } } -static bool extent_merge_one_overlapping(struct btree_iter *iter, - struct bpos new_pos, - struct bset_tree *t, - struct bkey_packed *k, struct bkey uk, - bool check, bool could_pack) +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { - struct btree_iter_level *l = &iter->l[0]; + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 sub; - BUG_ON(!bkey_deleted(k)); + if (bkey_le(where, bkey_start_pos(k.k))) + return 0; - if (check) { - return !bkey_packed(k) || could_pack; - } else { - uk.p = new_pos; - extent_save(l->b, &l->iter, k, &uk); - bch2_bset_fix_invalidated_key(l->b, t, k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - k, k->u64s, k->u64s); - return true; - } -} + EBUG_ON(bkey_gt(where, k.k->p)); -static bool extent_merge_do_overlapping(struct btree_iter *iter, - struct bkey *m, bool back_merge) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bset_tree *t; - struct bkey_packed *k; - struct bkey uk; - struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); - bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); - bool check = true; + sub = where.offset - bkey_start_offset(k.k); - /* - * @m is the new merged extent: - * - * The merge took place in the last bset; we know there can't be any 0 - * size extents overlapping with m there because if so they would have - * been between the two extents we merged. - * - * But in the other bsets, we have to check for and fix such extents: - */ -do_fixup: - for_each_bset(b, t) { - if (t == bset_tree_last(b)) - break; + k.k->size -= sub; - /* - * if we don't find this bset in the iterator we already got to - * the end of that bset, so start searching from the end. 
- */ - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + if (!k.k->size) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - if (!k) - continue; + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + bool seen_crc = false; - if (back_merge) { - /* - * Back merge: 0 size extents will be before the key - * that was just inserted (and thus the iterator - * position) - walk backwards to find them - */ - for (; - k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(m)) > 0); - k = bch2_bkey_prev_all(b, t, k)) { - if (bkey_cmp(uk.p, m->p) >= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } else { - /* Front merge - walk forwards */ - for (; - k != btree_bkey_last(b, t) && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, m->p) < 0); - k = bkey_next(k)) { - if (bkey_cmp(uk.p, - bkey_start_pos(m)) <= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; + bkey_extent_entry_for_each(ptrs, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += sub; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } + + if (extent_entry_is_crc(entry)) + seen_crc = true; } - } - if (check) { - check = false; - goto do_fixup; + break; } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - return true; -} - -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. - */ -static bool bch2_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->l[0].b; - struct btree_node_iter *node_iter = &iter->l[0].iter; - const struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *m; - BKEY_PADDED(k) li; - BKEY_PADDED(k) ri; - struct bkey_i *mi; - struct bkey tmp; - - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bch2_bkey_unpack(b, &li.k, l); - bch2_bkey_unpack(b, &ri.k, r); + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); - m = back_merge ? l : r; - mi = back_merge ? 
&li.k : &ri.k; + sub = min_t(u64, sub << 9, bytes); - /* l & r should be in last bset: */ - EBUG_ON(bch2_bkey_to_bset(b, m) != t); + memmove(p, p + sub, bytes - sub); - switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { - case BCH_MERGE_NOMERGE: - return false; - case BCH_MERGE_PARTIAL: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) - return false; + new_val_u64s -= sub >> 3; + break; + } + } - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) - return false; + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - extent_i_save(b, m, mi); - bch2_bset_fix_invalidated_key(b, t, m); + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; +} - /* - * Update iterator to reflect what we just inserted - otherwise, - * the iter_fix() call is going to put us _before_ the key we - * just partially merged with: - */ - if (back_merge) - bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); +int bch2_cut_back_s(struct bpos where, struct bkey_s k) +{ + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); + if (bkey_ge(where, k.k->p)) + return 0; - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - return false; - case BCH_MERGE_MERGE: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) - return false; + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) - return false; + len = where.offset - bkey_start_offset(k.k); - extent_i_save(b, m, &li.k); - bch2_bset_fix_invalidated_key(b, t, m); + k.k->p.offset = where.offset; + k.k->size = len; - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); - return true; - default: - BUG(); + if (!len) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; } -} - -int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) -{ - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - int ret = 0; - - end.offset += size; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - if (!bch2_extent_is_fully_allocated(k)) { - ret = -ENOSPC; - break; - } + switch (k.k->type) { + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; + break; } - bch2_btree_iter_unlock(&iter); - return ret; -} + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); -const struct bkey_ops bch2_bkey_extent_ops = { - .key_invalid = bch2_extent_invalid, - .key_debugcheck = bch2_extent_debugcheck, - .val_to_text = bch2_extent_to_text, - .swab = bch2_ptr_swab, - .key_normalize = bch2_ptr_normalize, - .key_merge = bch2_extent_merge, - .is_extents = true, -}; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; +}
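
A minimal sketch (not bcachefs code, and not part of this diff) of the offset arithmetic behind the new generic helpers bch2_cut_front_s() and bch2_cut_back_s() added above: cutting the front of an extent shrinks k->size while the key's end position p stays fixed, and the sectors dropped are added to the data pointer offsets (or, when a checksum/compression entry precedes a pointer, to crc.offset instead, which this sketch ignores). The struct and function names below are invented purely for illustration.

/*
 * Toy model of "cut front": drop the sectors of an extent that lie
 * before @where, keeping the remaining logical->device mapping intact.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_extent {
	uint64_t start;		/* logical start sector (bkey_start_offset) */
	uint64_t end;		/* logical end sector   (k.k->p.offset)     */
	uint64_t ptr_offset;	/* device sector where the data begins      */
};

/* Trim everything before @where; returns the number of sectors dropped. */
static uint64_t toy_cut_front(uint64_t where, struct toy_extent *e)
{
	if (where <= e->start)
		return 0;

	assert(where <= e->end);

	uint64_t sub = where - e->start;

	e->start      += sub;	/* key now starts at @where                */
	e->ptr_offset += sub;	/* data pointer advances by the same delta */
	return sub;
}

int main(void)
{
	struct toy_extent e = { .start = 100, .end = 164, .ptr_offset = 8000 };
	uint64_t sub = toy_cut_front(108, &e);

	/* 8 sectors dropped; logical sector 108 still maps to device sector 8008 */
	printf("dropped %llu, start %llu, ptr %llu\n",
	       (unsigned long long) sub,
	       (unsigned long long) e.start,
	       (unsigned long long) e.ptr_offset);
	return 0;
}

Cutting the back is the mirror case: bch2_cut_back_s() moves p.offset down to @where and shrinks the size, leaving pointer and crc offsets untouched since the start of the referenced data does not move; only inline-data keys have their value truncated.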