#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
+#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
+#include "trace.h"
#include "util.h"
-#include <trace/events/bcachefs.h>
-
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
return -EIO;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Unwritten extent: no need to actually read; treat it as a
+ * hole and return 0s:
+ */
+ if (p.ptr.unwritten)
+ return 0;
+
ca = bch_dev_bkey_exists(c, p.ptr.dev);
/*
/* KEY_TYPE_btree_ptr: */
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
- return "value too big";
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+ btree_ptr_val_too_big,
+ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- return bch2_bkey_ptrs_invalid(c, k);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
- return "value too small";
-
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
+ int ret = 0;
- if (c->sb.version < bcachefs_metadata_version_snapshot &&
- bp.v->min_key.snapshot)
- return "invalid min_key.snapshot";
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+ btree_ptr_v2_val_too_big,
+ "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- return bch2_bkey_ptrs_invalid(c, k);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+ struct bkey_s_c k)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- pr_buf(out, "seq %llx written %u min_key ",
+ prt_printf(out, "seq %llx written %u min_key %s",
le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors_written));
+ le16_to_cpu(bp.v->sectors_written),
+ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
bch2_bpos_to_text(out, bp.v->min_key);
- pr_buf(out, " ");
+ prt_printf(out, " ");
bch2_bkey_ptrs_to_text(out, c, k);
}
compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bp.v->min_key, POS_MIN))
+ btree_id_is_extents(btree_id) &&
+ !bkey_eq(bp.v->min_key, POS_MIN))
bp.v->min_key = write
? bpos_nosnap_predecessor(bp.v->min_key)
: bpos_nosnap_successor(bp.v->min_key);
/* KEY_TYPE_extent: */
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
rp.ptr.offset + rp.crc.offset ||
lp.ptr.dev != rp.ptr.dev ||
lp.ptr.gen != rp.ptr.gen ||
+ lp.ptr.unwritten != rp.ptr.unwritten ||
lp.has_ec != rp.has_ec)
return false;
if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
lp.crc.uncompressed_size) {
/* can use left extent's crc entry */
- } else if (lp.crc.live_size <= rp.crc.offset ) {
+ } else if (lp.crc.live_size <= rp.crc.offset) {
/* can use right extent's crc entry */
} else {
/* check if checksums can be merged: */
if (lp.crc.csum_type &&
lp.crc.uncompressed_size +
- rp.crc.uncompressed_size > c->sb.encoded_extent_max)
+ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
return false;
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
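+ /*
+ * Check that the merged sizes would still fit in each crc entry's
+ * size fields:
+ */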
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
- if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
bch2_crc_field_size_max[extent_entry_type(en_l)])
return false;
}
if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
crc_l.uncompressed_size) {
/* can use left extent's crc entry */
- } else if (crc_l.live_size <= crc_r.offset ) {
+ } else if (crc_l.live_size <= crc_r.offset) {
/* can use right extent's crc entry */
crc_r.offset -= crc_l.live_size;
bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
/* KEY_TYPE_reservation: */
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ int ret = 0;
- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
- return "incorrect value size";
-
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
- return "invalid nr_replicas";
-
- return NULL;
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+ reservation_key_nr_replicas_invalid,
+ "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+ return ret;
}
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- pr_buf(out, "generation %u replicas %u",
+ prt_printf(out, "generation %u replicas %u",
le32_to_cpu(r.v->generation),
r.v->nr_replicas);
}
bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
bch2_extent_ptr_decoded_append(k, &p);
switch (type) {
case BCH_EXTENT_ENTRY_crc32:
set_common_fields(dst->crc32, src);
- dst->crc32.csum = *((__le32 *) &src.csum.lo);
+ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
break;
case BCH_EXTENT_ENTRY_crc64:
set_common_fields(dst->crc64, src);
dst->crc64.nonce = src.nonce;
- dst->crc64.csum_lo = src.csum.lo;
- dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
+ dst->crc64.csum_lo = (u64 __force) src.csum.lo;
+ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
break;
case BCH_EXTENT_ENTRY_crc128:
set_common_fields(dst->crc128, src);
return false;
}
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- bool ret = true;
- int err;
-
- end.offset += size;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
- BTREE_ITER_SLOTS, k, err) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
- bch2_trans_iter_put(&trans, iter);
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
return replicas;
}
-static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
- struct extent_ptr_decoded p)
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
{
- unsigned durability = 0;
- struct bch_dev *ca;
-
- if (p.ptr.cached)
+ if (p->ptr.cached)
return 0;
- ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ return p->has_ec
+ ? p->ec.redundancy + 1
+ : ca->mi.durability;
+}
- if (ca->mi.state != BCH_MEMBER_STATE_failed)
- durability = max_t(unsigned, durability, ca->mi.durability);
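+/*
+ * Durability this pointer would provide if its device were healthy; unlike
+ * bch2_extent_ptr_durability(), a failed device is not treated as durability 0:
+ */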
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
- if (p.has_ec)
- durability += p.ec.redundancy;
+ return __extent_ptr_durability(ca, p);
+}
- return durability;
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+
+ if (ca->mi.state == BCH_MEMBER_STATE_failed)
+ return 0;
+
+ return __extent_ptr_durability(ca, p);
}
unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
unsigned durability = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- durability += bch2_extent_ptr_durability(c, p);
+ durability += bch2_extent_ptr_durability(c, &p);
return durability;
}
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
- unsigned target,
- unsigned nr_desired_replicas)
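+/*
+ * Like bch2_bkey_durability(), but skips pointers to devices that don't
+ * exist; safe to call on invalid keys (e.g. when printing them):
+ */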
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
- if (target && extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra &&
- !bch2_dev_in_target(c, p.ptr.dev, target)) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
+ unsigned durability = 0;
- if (extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
+ durability += bch2_extent_ptr_durability(c, &p);
- if (n && n <= extra) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
+ return durability;
}
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
k->k.u64s -= extent_entry_u64s(entry);
}
-void bch2_bkey_append_ptr(struct bkey_i *k,
- struct bch_extent_ptr ptr)
-{
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
-
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-
- memcpy((void *) &k->v + bkey_val_bytes(&k->k),
- &ptr,
- sizeof(ptr));
- k->u64s++;
- break;
- default:
- BUG();
- }
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
- union bch_extent_entry *dst,
- union bch_extent_entry *new)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
- memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
- k->k.u64s += extent_entry_u64s(new);
- memcpy(dst, new, extent_entry_bytes(new));
-}
-
void bch2_extent_ptr_decoded_append(struct bkey_i *k,
struct extent_ptr_decoded *p)
{
return i;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *dst, *src, *prev;
+ union bch_extent_entry *entry = to_entry(ptr), *next;
+ union bch_extent_entry *ret = entry;
bool drop_crc = true;
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- src = extent_entry_next(to_entry(ptr));
- if (src != ptrs.end &&
- !extent_entry_is_crc(src))
- drop_crc = false;
-
- dst = to_entry(ptr);
- while ((prev = extent_entry_prev(ptrs, dst))) {
- if (extent_entry_is_ptr(prev))
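+ /*
+ * Look ahead: if another ptr follows before the next crc entry, that
+ * ptr still needs the preceding crc, so don't drop it:
+ */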
+ for (next = extent_entry_next(entry);
+ next != ptrs.end;
+ next = extent_entry_next(next)) {
+ if (extent_entry_is_crc(next)) {
break;
-
- if (extent_entry_is_crc(prev)) {
- if (drop_crc)
- dst = prev;
+ } else if (extent_entry_is_ptr(next)) {
+ drop_crc = false;
break;
}
+ }
+
+ extent_entry_drop(k, entry);
+
+ while ((entry = extent_entry_prev(ptrs, entry))) {
+ if (extent_entry_is_ptr(entry))
+ break;
- dst = prev;
+ if ((extent_entry_is_crc(entry) && drop_crc) ||
+ extent_entry_is_stripe_ptr(entry)) {
+ ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_drop(k, entry);
+ }
}
- memmove_u64s_down(dst, src,
- (u64 *) ptrs.end - (u64 *) src);
- k.k->u64s -= (u64 *) src - (u64 *) dst;
+ return ret;
+}
- return dst;
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+ union bch_extent_entry *ret =
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+
+ /*
+ * If we deleted all the dirty pointers and there's still cached
+ * pointers, we could set the cached pointers to dirty if they're not
+ * stale - but to do that correctly we'd need to grab an open_bucket
+ * reference so that we don't race with bucket reuse:
+ */
+ if (have_dirty &&
+ !bch2_bkey_dirty_devs(k.s_c).nr) {
+ k.k->type = KEY_TYPE_error;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+ k.k->type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ }
+
+ return ret;
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
+
+ if (ptr)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+}
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->dev == dev)
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
return false;
}
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+ if (k1.k->type != k2.k->type)
+ return false;
+
+ if (bkey_extent_is_direct_data(k1.k)) {
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
+
+ if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+ return false;
+
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+ } else {
+ /* KEY_TYPE_deleted, etc. */
+ return true;
+ }
+}
+
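+/*
+ * Returns the pointer in k2 that refers to the same data as p1 from k1, if
+ * any:
+ */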
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
+{
+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+ union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
+
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return &entry2->ptr;
+
+ return NULL;
+}
+
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ union bch_extent_entry *ec = NULL;
+
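+ /*
+ * Cached pointers aren't erasure coded: when marking this pointer
+ * cached, also drop its stripe ptr entry, if any:
+ */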
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (&entry->ptr == ptr) {
+ ptr->cached = true;
+ if (ec)
+ extent_entry_drop(k, ec);
+ return;
+ }
+
+ if (extent_entry_is_stripe_ptr(entry))
+ ec = entry;
+ else if (extent_entry_is_ptr(entry))
+ ec = NULL;
+ }
+
+ BUG();
+}
+
/*
* bch_extent_normalize - clean up an extent, dropping stale pointers etc.
*
ptr->cached &&
ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
- /* will only happen if all pointers were cached: */
- if (!bch2_bkey_nr_ptrs(k.s_c))
- k.k->type = KEY_TYPE_deleted;
-
return bkey_deleted(k.k);
}
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- const struct bch_extent_ptr *ptr;
- const struct bch_extent_stripe_ptr *ec;
- struct bch_dev *ca;
bool first = true;
+ if (c)
+ prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
+
bkey_extent_entry_for_each(ptrs, entry) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ptr = entry_to_ptr(entry);
- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
- pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "",
- ca && ptr_stale(ca, ptr)
- ? " stale" : "");
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (ca && ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
break;
+ }
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- crc.csum_type,
- crc.compression_type);
+ bch2_csum_types[crc.csum_type],
+ bch2_compression_types[crc.compression_type]);
break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ec = &entry->stripe_ptr;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
- pr_buf(out, "ec: idx %llu block %u",
+ prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
default:
- pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
}
}
}
-static const char *extent_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
+static int extent_ptr_invalid(struct bch_fs *c,
+ struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata,
+ struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr2;
+ u64 bucket;
+ u32 bucket_offset;
struct bch_dev *ca;
+ int ret = 0;
- if (!bch2_dev_exists2(c, ptr->dev))
- return "pointer to invalid device";
+ if (!bch2_dev_exists2(c, ptr->dev)) {
+ /*
+ * If we're in the write path this key might have already been
+ * overwritten, and we could be seeing a device that doesn't
+ * exist anymore due to racing with device removal:
+ */
+ if (flags & BKEY_INVALID_WRITE)
+ return 0;
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!ca)
- return "pointer to invalid device";
+ bkey_fsck_err(c, err, ptr_to_invalid_device,
+ "pointer to invalid device (%u)", ptr->dev);
+ }
+ ca = bch_dev_bkey_exists(c, ptr->dev);
bkey_for_each_ptr(ptrs, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev)
- return "multiple pointers to same device";
-
- if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
- return "offset past end of device";
-
- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
- return "offset before first bucket";
-
- if (bucket_remainder(ca, ptr->offset) +
- size_ondisk > ca->mi.bucket_size)
- return "spans multiple buckets";
-
- return NULL;
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+ ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
+
+ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+
+ bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+ bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ ptr_spans_multiple_buckets,
+ "pointer spans multiple buckets (%u + %u > %u)",
+ bucket_offset, size_ondisk, ca->mi.bucket_size);
+fsck_err:
+ return ret;
}
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_devs_list devs;
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
unsigned size_ondisk = k.k->size;
- const char *reason;
unsigned nonce = UINT_MAX;
- unsigned i;
+ unsigned nr_ptrs = 0;
+ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret = 0;
- if (k.k->type == KEY_TYPE_btree_ptr ||
- k.k->type == KEY_TYPE_btree_ptr_v2)
- size_ondisk = c->opts.btree_node_size;
+ if (bkey_is_btree_ptr(k.k))
+ size_ondisk = btree_sectors(c);
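+
+ /*
+ * Walk the entries, validating each one and the ordering constraints
+ * between ptrs, crcs and stripe ptrs:
+ */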
bkey_extent_entry_for_each(ptrs, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+ extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
- if (k.k->type == KEY_TYPE_btree_ptr &&
- !extent_entry_is_ptr(entry))
- return "has non ptr field";
+ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry), c, err,
+ btree_ptr_has_non_ptr,
+ "has non ptr field");
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- reason = extent_ptr_invalid(c, k, &entry->ptr,
- size_ondisk, false);
- if (reason)
- return reason;
+ ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+ size_ondisk, false, err);
+ if (ret)
+ return ret;
+
+ bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+ ptr_cached_and_erasure_coded,
+ "cached, erasure coded ptr");
+
+ if (!entry->ptr.unwritten)
+ have_written = true;
+ else
+ have_unwritten = true;
+
+ have_ec = false;
+ crc_since_last_ptr = false;
+ nr_ptrs++;
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- if (crc.offset + crc.live_size >
- crc.uncompressed_size)
- return "checksum offset + key size > uncompressed size";
-
- size_ondisk = crc.compressed_size;
-
- if (!bch2_checksum_type_valid(c, crc.csum_type))
- return "invalid checksum type";
-
- if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
- return "invalid compression type";
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+ ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+ ptr_crc_csum_type_unknown,
+ "invalid checksum type");
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+ ptr_crc_compression_type_unknown,
+ "invalid compression type");
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
else if (nonce != crc.offset + crc.nonce)
- return "incorrect nonce";
+ bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ "incorrect nonce");
}
+
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ ptr_crc_redundant,
+ "redundant crc entry");
+ crc_since_last_ptr = true;
+
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+
+ size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
+ bkey_fsck_err_on(have_ec, c, err,
+ ptr_stripe_redundant,
+ "redundant stripe entry");
+ have_ec = true;
break;
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ }
}
}
- devs = bch2_bkey_devs(k);
- bubble_sort(devs.devs, devs.nr, u8_cmp);
- for (i = 0; i + 1 < devs.nr; i++)
- if (devs.devs[i] == devs.devs[i + 1])
- return "multiple ptrs to same device";
-
- return NULL;
+ bkey_fsck_err_on(!nr_ptrs, c, err,
+ extent_ptrs_no_ptrs,
+ "no ptrs");
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+ extent_ptrs_too_many_ptrs,
+ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+ bkey_fsck_err_on(have_written && have_unwritten, c, err,
+ extent_ptrs_written_and_unwritten,
+ "extent with unwritten and written ptrs");
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+ extent_ptrs_unwritten,
+ "has unwritten ptrs");
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ extent_ptrs_redundant_crc,
+ "redundant crc entry");
+ bkey_fsck_err_on(have_ec, c, err,
+ extent_ptrs_redundant_stripe,
+ "redundant stripe entry");
+fsck_err:
+ return ret;
}
void bch2_ptr_swab(struct bkey_s k)
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ }
+ }
+}
+
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
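+/*
+ * Returns a bitmask of the pointers (by index) that need to be rewritten to
+ * match the given target and compression options:
+ */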
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
}
}
+
+ return rewrite_ptrs;
+}
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+ /*
+ * For indirect extents, don't delete the rebalance entry when
+ * we're finished so that we know we specifically moved it or
+ * compressed it to its current location/compression type
+ */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
}
/* Generic extent code: */
int val_u64s_delta;
u64 sub;
- if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+ if (bkey_le(where, bkey_start_pos(k.k)))
return 0;
- EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+ EBUG_ON(bkey_gt(where, k.k->p));
sub = where.offset - bkey_start_offset(k.k);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
}
if (extent_entry_is_crc(entry))
int val_u64s_delta;
u64 len = 0;
- if (bkey_cmp(where, k.k->p) >= 0)
+ if (bkey_ge(where, k.k->p))
return 0;
- EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0);
+ EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
len = where.offset - bkey_start_offset(k.k);