X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fextents.c;h=9b197db78260b0997f056361a179398ee2cce247;hb=934a84dfaf719af82dadbbe0e2480baff03c905b;hp=792c9c1e50b1d1901db1f8ce90ad14a444827403;hpb=fad8236b812f795993b88804065d950709a6c13c;p=bcachefs-tools-debian diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 792c9c1..9b197db 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -26,6 +26,8 @@ #include +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -89,7 +91,7 @@ static inline bool ptr_better(struct bch_fs *c, return bch2_rand_range(l1 + l2) > l1; } - if (force_reconstruct_read(c)) + if (bch2_force_reconstruct_read) return p1.idx > p2.idx; return p1.idx < p2.idx; @@ -115,6 +117,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return -EIO; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; + ca = bch_dev_bkey_exists(c, p.ptr.dev); /* @@ -137,7 +146,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, !bch2_dev_is_readable(ca)) p.idx++; - if (force_reconstruct_read(c) && + if (bch2_force_reconstruct_read && !p.idx && p.has_ec) p.idx++; @@ -156,62 +165,63 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + return -BCH_ERR_invalid_bkey; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - if (!percpu_down_read_trylock(&c->mark_lock)) - return; - - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + bch2_bkey_ptrs_to_text(out, c, k); +} - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - mark = ptr_bucket_mark(ca, ptr); + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { + prt_printf(err, "value too small (%zu <= %zu)", + bkey_val_bytes(k.k), sizeof(*bp.v)); + return -BCH_ERR_invalid_bkey; + } - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + prt_printf(err, "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + return -BCH_ERR_invalid_bkey; + } - err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; + if (c->sb.version < bcachefs_metadata_version_snapshot && + bp.v->min_key.snapshot) { + prt_printf(err, "invalid min_key.snapshot (%u != 0)", + bp.v->min_key.snapshot); + return -BCH_ERR_invalid_bkey; } -out: - percpu_up_read(&c->mark_lock); - return; -err: - bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); - goto out; + + return bch2_bkey_ptrs_invalid(c, k, rw, err); } -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + prt_printf(out, "seq %llx written %u min_key %s", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); + + bch2_bpos_to_text(out, bp.v->min_key); + prt_printf(out, " "); bch2_bkey_ptrs_to_text(out, c, k); } @@ -225,194 +235,177 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bp.v->min_key, POS_MIN)) + !bkey_eq(bp.v->min_key, POS_MIN)) bp.v->min_key = write - ? bkey_predecessor(bp.v->min_key) - : bkey_successor(bp.v->min_key); + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); } /* KEY_TYPE_extent: */ -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - char buf[160]; - - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - if (!percpu_down_read_trylock(&c->mark_lock)) - return; - - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_inconsistent_on(stale > 96, c, - "key too stale: %i", stale); - - bch2_fs_inconsistent_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } - - percpu_up_read(&c->mark_lock); -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; - - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; - - crc_l = bch2_extent_crc_unpack(l.k, NULL); - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; - - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + return false; - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; - break; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || + lp.has_ec != rp.has_ec) + return false; + + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; + + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; + + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; + + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; - - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; - - if (crc_is_compressed(crc_l)) - return BCH_MERGE_NOMERGE; + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; - - break; - default: - return BCH_MERGE_NOMERGE; + return false; } - } - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if (!extent_entry_is_crc(en_l)) - continue; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* KEY_TYPE_reservation: */ -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(*r.v)); + return -BCH_ERR_invalid_bkey; + } - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { + prt_printf(err, "invalid nr_replicas (%u)", + r.v->nr_replicas); + return -BCH_ERR_invalid_bkey; + } - return NULL; + return 0; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -420,30 +413,22 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - pr_buf(out, "generation %u replicas %u", + prt_printf(out, "generation %u replicas %u", le32_to_cpu(r.v->generation), r.v->nr_replicas); } -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); if (l.v->generation != r.v->generation || l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* Extent checksum entries: */ @@ -527,7 +512,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -659,59 +644,43 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; - end.offset += size; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; - bch2_trans_init(&trans, c, 0, 0); + if (p.has_ec) + replicas += p.ec.redundancy; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; + replicas++; - if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { - ret = false; - break; - } } - bch2_trans_exit(&trans); - return ret; + return replicas; } -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { unsigned durability = 0; struct bch_dev *ca; - if (p.ptr.cached) + if (p->ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, p.ptr.dev); + ca = bch_dev_bkey_exists(c, p->ptr.dev); - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + if (ca->mi.state != BCH_MEMBER_STATE_failed) durability = max_t(unsigned, durability, ca->mi.durability); - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - if (WARN_ON(!s)) - goto out; + if (p->has_ec) + durability += p->ec.redundancy; - durability = max_t(unsigned, durability, s->nr_redundant); - } -out: return durability; } @@ -723,63 +692,18 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned durability = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); + durability += bch2_extent_ptr_durability(c,& p); return durability; } -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } - - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } -} - -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) -{ - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); - } + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); } static inline void __extent_entry_insert(struct bkey_i *k, @@ -838,41 +762,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + +/* + * Returns pointer to the next entry after the one being dropped: + */ +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; bool drop_crc = true; EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; - - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { break; - - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; break; } + } + + extent_entry_drop(k, entry); - dst = prev; + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; + + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + return ret; +} - return dst; +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + __bch2_bkey_drop_ptr(k, ptr); + + /* + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } + + return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -882,6 +850,14 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); + + if (ptr) + __bch2_bkey_drop_ptr(k, ptr); +} + const struct bch_extent_ptr * bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) { @@ -926,6 +902,55 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, return false; } +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) +{ + if (k1.k->type != k2.k->type) + return false; + + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; + } else { + /* KEY_TYPE_deleted, etc. */ + return true; + } +} + +bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, + struct bkey_s_c k2) +{ + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. * @@ -942,11 +967,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - /* will only happen if all pointers were cached: */ - if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_discard; - - return bkey_whiteout(k.k); + return bkey_deleted(k.k); } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -962,41 +983,53 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bkey_extent_entry_for_each(ptrs, entry) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? bch_dev_bkey_exists(c, ptr->dev) : NULL; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); break; case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; - pr_buf(out, "ec: idx %llu block %u", + prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; } @@ -1004,68 +1037,101 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; struct bch_dev *ca; - if (!bch2_dev_exists2(c, ptr->dev)) - return "pointer to invalid device"; + if (!bch2_dev_exists2(c, ptr->dev)) { + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; + if (ptr != ptr2 && ptr->dev == ptr2->dev) { + prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; + if (bucket >= ca->mi.nbuckets) { + prt_printf(err, "pointer past last bucket (%llu > %llu)", + bucket, ca->mi.nbuckets); + return -BCH_ERR_invalid_bkey; + } - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { + prt_printf(err, "pointer before first bucket (%llu < %u)", + bucket, ca->mi.first_bucket); + return -BCH_ERR_invalid_bkey; + } - return NULL; + if (bucket_offset + size_ondisk > ca->mi.bucket_size) { + prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); + return -BCH_ERR_invalid_bkey; + } + + return 0; } -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; - const char *reason; unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; + bool unwritten = false; + int ret; - if (k.k->type == KEY_TYPE_btree_ptr) - size_ondisk = c->opts.btree_node_size; - if (k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { + prt_printf(err, "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + return -BCH_ERR_invalid_bkey; + } - if (k.k->type == KEY_TYPE_btree_ptr && - !extent_entry_is_ptr(entry)) - return "has non ptr field"; + if (bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry)) { + prt_printf(err, "has non ptr field"); + return -BCH_ERR_invalid_bkey; + } switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - reason = extent_ptr_invalid(c, k, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, + false, err); + if (ret) + return ret; + + if (nr_ptrs && unwritten != entry->ptr.unwritten) { + prt_printf(err, "extent with unwritten and written ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { + prt_printf(err, "has unwritten ptrs"); + return -BCH_ERR_invalid_bkey; + } + + unwritten = entry->ptr.unwritten; + nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: @@ -1073,22 +1139,30 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); if (crc.offset + crc.live_size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; + crc.uncompressed_size) { + prt_printf(err, "checksum offset + key size > uncompressed size"); + return -BCH_ERR_invalid_bkey; + } size_ondisk = crc.compressed_size; - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; + if (!bch2_checksum_type_valid(c, crc.csum_type)) { + prt_printf(err, "invalid checksum type"); + return -BCH_ERR_invalid_bkey; + } - if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) - return "invalid compression type"; + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { + prt_printf(err, "invalid compression type"); + return -BCH_ERR_invalid_bkey; + } if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; + else if (nonce != crc.offset + crc.nonce) { + prt_printf(err, "incorrect nonce"); + return -BCH_ERR_invalid_bkey; + } } break; case BCH_EXTENT_ENTRY_stripe_ptr: @@ -1096,7 +1170,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) } } - return NULL; + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { + prt_str(err, "too many ptrs"); + return -BCH_ERR_invalid_bkey; + } + + return 0; } void bch2_ptr_swab(struct bkey_s k) @@ -1143,10 +1222,10 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 sub; - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + if (bkey_le(where, bkey_start_pos(k.k))) return 0; - EBUG_ON(bkey_cmp(where, k.k->p) > 0); + EBUG_ON(bkey_gt(where, k.k->p)); sub = where.offset - bkey_start_offset(k.k); @@ -1195,14 +1274,14 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_inline_data: { - struct bkey_s_inline_data d = bkey_s_to_inline_data(k); + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); - sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); + sub = min_t(u64, sub << 9, bytes); - memmove(d.v->data, - d.v->data + sub, - bkey_val_bytes(d.k) - sub); + memmove(p, p + sub, bytes - sub); new_val_u64s -= sub >> 3; break; @@ -1223,14 +1302,14 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 len = 0; - if (bkey_cmp(where, k.k->p) >= 0) + if (bkey_ge(where, k.k->p)) return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); len = where.offset - bkey_start_offset(k.k); - k.k->p = where; + k.k->p.offset = where.offset; k.k->size = len; if (!len) { @@ -1240,7 +1319,9 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) switch (k.k->type) { case KEY_TYPE_inline_data: - new_val_u64s = min(new_val_u64s, k.k->size << 6); + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; break; }