X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fextents.c;h=82ec056f4cdbb1f4e4234fce274939b61b7a5015;hb=1f79cf3825e94fcb146d417b6dda9b94c93c7a53;hp=f66640c2a5edd73ad8c17059caed61df37a241b4;hpb=2b8c1bb0910534e8687ea3e5abf6d8bbba758247;p=bcachefs-tools-debian diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index f66640c..82ec056 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "buckets.h" #include "checksum.h" +#include "compress.h" #include "debug.h" #include "disk_groups.h" #include "error.h" @@ -22,10 +23,9 @@ #include "replicas.h" #include "super.h" #include "super-io.h" +#include "trace.h" #include "util.h" -#include - static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -115,6 +115,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return -EIO; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; + ca = bch_dev_bkey_exists(c, p.ptr.dev); /* @@ -156,12 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) { - if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) - return "value too big"; + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, + btree_ptr_val_too_big, + "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return bch2_bkey_ptrs_invalid(c, k); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -170,35 +184,34 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) - return "value too small"; - - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + int ret = 0; - if (c->sb.version < bcachefs_metadata_version_snapshot && - bp.v->min_key.snapshot) - return "invalid min_key.snapshot"; + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, + btree_ptr_v2_val_too_big, + "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return bch2_bkey_ptrs_invalid(c, k); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) + struct bkey_s_c k) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx written %u min_key %s", + prt_printf(out, "seq %llx written %u min_key %s", le64_to_cpu(bp.v->seq), le16_to_cpu(bp.v->sectors_written), BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); bch2_bpos_to_text(out, bp.v->min_key); - pr_buf(out, " "); + prt_printf(out, " "); bch2_bkey_ptrs_to_text(out, c, k); } @@ -211,8 +224,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && - bkey_cmp(bp.v->min_key, POS_MIN)) + btree_id_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) bp.v->min_key = write ? bpos_nosnap_predecessor(bp.v->min_key) : bpos_nosnap_successor(bp.v->min_key); @@ -220,17 +233,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, /* KEY_TYPE_extent: */ -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); @@ -265,6 +267,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) rp.ptr.offset + rp.crc.offset || lp.ptr.dev != rp.ptr.dev || lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || lp.has_ec != rp.has_ec) return false; @@ -287,7 +290,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= lp.crc.uncompressed_size) { /* can use left extent's crc entry */ - } else if (lp.crc.live_size <= rp.crc.offset ) { + } else if (lp.crc.live_size <= rp.crc.offset) { /* can use right extent's crc entry */ } else { /* check if checksums can be merged: */ @@ -303,10 +306,22 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.csum_type && lp.crc.uncompressed_size + - rp.crc.uncompressed_size > c->sb.encoded_extent_max) + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > + if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return false; } @@ -334,7 +349,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (crc_l.offset + crc_l.live_size + crc_r.live_size <= crc_l.uncompressed_size) { /* can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset ) { + } else if (crc_l.live_size <= crc_r.offset) { /* can use right extent's crc entry */ crc_r.offset -= crc_l.live_size; bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, @@ -363,17 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + int ret = 0; - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, + reservation_key_nr_replicas_invalid, + "invalid nr_replicas (%u)", r.v->nr_replicas); +fsck_err: + return ret; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -381,7 +397,7 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - pr_buf(out, "generation %u replicas %u", + prt_printf(out, "generation %u replicas %u", le32_to_cpu(r.v->generation), r.v->nr_replicas); } @@ -480,7 +496,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -506,13 +522,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); + dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); break; case BCH_EXTENT_ENTRY_crc64: set_common_fields(dst->crc64, src); dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + dst->crc64.csum_lo = (u64 __force) src.csum.lo; + dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); break; case BCH_EXTENT_ENTRY_crc128: set_common_fields(dst->crc128, src); @@ -612,38 +628,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - return ret; -} - unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -665,24 +649,31 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) +static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) { - unsigned durability = 0; - struct bch_dev *ca; - - if (p.ptr.cached) + if (p->ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, p.ptr.dev); + return p->has_ec + ? p->ec.redundancy + 1 + : ca->mi.durability; +} - if (ca->mi.state != BCH_MEMBER_STATE_failed) - durability = max_t(unsigned, durability, ca->mi.durability); +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); - if (p.has_ec) - durability += p.ec.redundancy; + return __extent_ptr_durability(ca, p); +} - return durability; +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + + if (ca->mi.state == BCH_MEMBER_STATE_failed) + return 0; + + return __extent_ptr_durability(ca, p); } unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) @@ -693,40 +684,23 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned durability = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); + durability += bch2_extent_ptr_durability(c, &p); return durability; } -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) +static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } + unsigned durability = 0; - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) + durability += bch2_extent_ptr_durability(c, &p); - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } + return durability; } void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) @@ -738,41 +712,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry k->k.u64s -= extent_entry_u64s(entry); } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) -{ - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); - } -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); -} - void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { @@ -817,41 +756,73 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +/* + * Returns pointer to the next entry after the one being dropped: + */ +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; bool drop_crc = true; EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; - - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { break; - - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; break; } + } + + extent_entry_drop(k, entry); + + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; - dst = prev; + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + return ret; +} - return dst; +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + bch2_bkey_drop_ptr_noerror(k, ptr); + + /* + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } + + return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -861,11 +832,17 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); + + if (ptr) + bch2_bkey_drop_ptr_noerror(k, ptr); +} + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->dev == dev) @@ -877,7 +854,6 @@ bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && @@ -905,6 +881,78 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, return false; } +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) +{ + if (k1.k->type != k2.k->type) + return false; + + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; + } else { + /* KEY_TYPE_deleted, etc. */ + return true; + } +} + +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) +{ + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return &entry2->ptr; + + return NULL; +} + +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; + + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } + + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } + + BUG(); +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. * @@ -921,10 +969,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - /* will only happen if all pointers were cached: */ - if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_deleted; - return bkey_deleted(k.k); } @@ -933,49 +977,76 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - const struct bch_extent_stripe_ptr *ec; - struct bch_dev *ca; bool first = true; + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); + bkey_extent_entry_for_each(ptrs, entry) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *ptr = entry_to_ptr(entry); + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? bch_dev_bkey_exists(c, ptr->dev) : NULL; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } break; + } case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + case BCH_EXTENT_ENTRY_crc128: { + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); break; - case BCH_EXTENT_ENTRY_stripe_ptr: - ec = &entry->stripe_ptr; + } + case BCH_EXTENT_ENTRY_stripe_ptr: { + const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; - pr_buf(out, "ec: idx %llu block %u", + prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; + } + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + prt_str(out, "rebalance: target "); + if (c) + bch2_target_to_text(out, c, r->target); + else + prt_printf(out, "%u", r->target); + prt_str(out, " compression "); + bch2_compression_opt_to_text(out, r->compression); + break; + } default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; } @@ -983,106 +1054,178 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +static int extent_ptr_invalid(struct bch_fs *c, + struct bkey_s_c k, + enum bkey_invalid_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; struct bch_dev *ca; + int ret = 0; - if (!bch2_dev_exists2(c, ptr->dev)) - return "pointer to invalid device"; + if (!bch2_dev_exists2(c, ptr->dev)) { + /* + * If we're in the write path this key might have already been + * overwritten, and we could be seeing a device that doesn't + * exist anymore due to racing with device removal: + */ + if (flags & BKEY_INVALID_WRITE) + return 0; - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; + bkey_fsck_err(c, err, ptr_to_invalid_device, + "pointer to invalid device (%u)", ptr->dev); + } + ca = bch_dev_bkey_exists(c, ptr->dev); bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; - - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - return NULL; + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, + ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); + + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + + bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + ptr_after_last_bucket, + "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); + bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + ptr_before_first_bucket, + "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + ptr_spans_multiple_buckets, + "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); +fsck_err: + return ret; } -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_devs_list devs; const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; - const char *reason; unsigned nonce = UINT_MAX; - unsigned i; + unsigned nr_ptrs = 0; + bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret = 0; - if (k.k->type == KEY_TYPE_btree_ptr || - k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = c->opts.btree_node_size; + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, + extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - if (k.k->type == KEY_TYPE_btree_ptr && - !extent_entry_is_ptr(entry)) - return "has non ptr field"; + bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry), c, err, + btree_ptr_has_non_ptr, + "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - reason = extent_ptr_invalid(c, k, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + ret = extent_ptr_invalid(c, k, flags, &entry->ptr, + size_ondisk, false, err); + if (ret) + return ret; + + bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, + ptr_cached_and_erasure_coded, + "cached, erasure coded ptr"); + + if (!entry->ptr.unwritten) + have_written = true; + else + have_unwritten = true; + + have_ec = false; + crc_since_last_ptr = false; + nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - if (crc.offset + crc.live_size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) - return "invalid compression type"; + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, + ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, + ptr_crc_csum_type_unknown, + "invalid checksum type"); + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, + ptr_crc_compression_type_unknown, + "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; + bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + "incorrect nonce"); } + + bkey_fsck_err_on(crc_since_last_ptr, c, err, + ptr_crc_redundant, + "redundant crc entry"); + crc_since_last_ptr = true; + + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + + size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: + bkey_fsck_err_on(have_ec, c, err, + ptr_stripe_redundant, + "redundant stripe entry"); + have_ec = true; break; + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + if (!bch2_compression_opt_valid(r->compression)) { + struct bch_compression_opt opt = __bch2_compression_decode(r->compression); + prt_printf(err, "invalid compression opt %u:%u", + opt.type, opt.level); + return -BCH_ERR_invalid_bkey; + } + break; + } } } - devs = bch2_bkey_devs(k); - bubble_sort(devs.devs, devs.nr, u8_cmp); - for (i = 0; i + 1 < devs.nr; i++) - if (devs.devs[i] == devs.devs[i + 1]) - return "multiple ptrs to same device"; - - return NULL; + bkey_fsck_err_on(!nr_ptrs, c, err, + extent_ptrs_no_ptrs, + "no ptrs"); + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, + extent_ptrs_too_many_ptrs, + "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); + bkey_fsck_err_on(have_written && have_unwritten, c, err, + extent_ptrs_written_and_unwritten, + "extent with unwritten and written ptrs"); + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, + extent_ptrs_unwritten, + "has unwritten ptrs"); + bkey_fsck_err_on(crc_since_last_ptr, c, err, + extent_ptrs_redundant_crc, + "redundant crc entry"); + bkey_fsck_err_on(have_ec, c, err, + extent_ptrs_redundant_stripe, + "redundant stripe entry"); +fsck_err: + return ret; } void bch2_ptr_swab(struct bkey_s k) @@ -1117,8 +1260,129 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; + case BCH_EXTENT_ENTRY_rebalance: + break; + } + } +} + +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned rewrite_ptrs = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + rewrite_ptrs = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= 1U << i; + i++; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) + rewrite_ptrs |= 1U << i; + i++; } } + + return rewrite_ptrs; +} + +bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + /* + * If it's an indirect extent, we don't delete the rebalance entry when + * done so that we know what options were applied - check if it still + * needs work done: + */ + if (r && + k.k->type == KEY_TYPE_reflink_v && + !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) + r = NULL; + + return r != NULL; +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, + unsigned target, unsigned compression) +{ + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *r; + bool needs_rebalance; + + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + /* get existing rebalance entry: */ + r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + if (r) { + if (k.k->type == KEY_TYPE_reflink_v) { + /* + * indirect extents: existing options take precedence, + * so that we don't move extents back and forth if + * they're referenced by different inodes with different + * options: + */ + if (r->target) + target = r->target; + if (r->compression) + compression = r->compression; + } + + r->target = target; + r->compression = compression; + } + + needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); + + if (needs_rebalance && !r) { + union bch_extent_entry *new = bkey_val_end(k); + + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; + new->rebalance.compression = compression; + new->rebalance.target = target; + new->rebalance.unused = 0; + k.k->u64s += extent_entry_u64s(new); + } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { + /* + * For indirect extents, don't delete the rebalance entry when + * we're finished so that we know we specifically moved it or + * compressed it to its current location/compression type + */ + extent_entry_drop(k, (union bch_extent_entry *) r); + } + + return 0; } /* Generic extent code: */ @@ -1129,10 +1393,10 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 sub; - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + if (bkey_le(where, bkey_start_pos(k.k))) return 0; - EBUG_ON(bkey_cmp(where, k.k->p) > 0); + EBUG_ON(bkey_gt(where, k.k->p)); sub = where.offset - bkey_start_offset(k.k); @@ -1167,6 +1431,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; + case BCH_EXTENT_ENTRY_rebalance: + break; } if (extent_entry_is_crc(entry)) @@ -1209,10 +1475,10 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 len = 0; - if (bkey_cmp(where, k.k->p) >= 0) + if (bkey_ge(where, k.k->p)) return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); len = where.offset - bkey_start_offset(k.k);