X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fextents.c;h=1ab951c97a890e9f09b285dccf2a3836c3a57747;hb=9b08492bc733f3b1b1eefc87a4c52d9673432e42;hp=9efaa1ffa92f9402085c03571977407611618946;hpb=018de5aa899937a9dc3bc8cb9819cb218a59abf3;p=bcachefs-tools-debian diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 9efaa1f..1ab951c 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -27,227 +27,287 @@ #include -static void sort_key_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { - i->k += __btree_node_offset_to_key(b, i->k)->u64s; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + bkey_for_each_ptr(p, ptr) + nr_ptrs++; - if (i->k == i->end) - *i = iter->data[--iter->used]; + return nr_ptrs; } -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ -#define key_sort_cmp(h, l, r) \ -({ \ - bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)) \ - \ - ?: (l).k - (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter_large *iter, - struct btree *b) +unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) { - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); - - if (bkey_whiteout(k)) - return true; + unsigned nr_ptrs = 0; - if (iter->used < 2) - return false; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: { + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - if (iter->used > 2 && - key_sort_cmp(iter, r[0], r[1]) >= 0) - r++; + bkey_for_each_ptr(p, ptr) + nr_ptrs += !ptr->cached; + BUG_ON(!nr_ptrs); + break; + } + case KEY_TYPE_reservation: + nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. 
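 *
 * (Editor's note, illustrative: if the same extent was rewritten and both
 *  versions survive into the sort from different bsets, the older copy
 *  reaches the top of the heap first and is dropped here, so only the
 *  newest version is emitted into the sorted output.)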
- */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); + return nr_ptrs; } -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) { - struct bkey_packed *out = dst->start; - struct btree_nr_keys nr; + unsigned i, durability = 0; + struct bch_dev *ca; + + if (p.ptr.cached) + return 0; - memset(&nr, 0, sizeof(nr)); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - heap_resort(iter, key_sort_cmp); + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); - while (!bch2_btree_node_iter_large_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); + for (i = 0; i < p.ec_nr; i++) { + struct stripe *s = + genradix_ptr(&c->stripes[0], p.idx); - bkey_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } + if (WARN_ON(!s)) + continue; - sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp); + durability = max_t(unsigned, durability, s->nr_redundant); } - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; + return durability; } -/* Common among btree and extent ptrs */ - -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_ptr *ptr; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, p); - return NULL; + return durability; } -bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) { - struct bch_extent_ptr *ptr; - bool dropped = false; + struct bch_dev_io_failures *i; - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; - if (dropped) - bch2_extent_drop_redundant_crcs(e); - return dropped; + return NULL; } -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) { - const struct bch_extent_ptr *ptr; + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; } - - return NULL; } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) { - const 
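/*
 * Editor's note on the durability helpers above: a pointer's durability is
 * its device's durability (0 if the pointer is cached or the device is
 * FAILED), bumped up to the stripe's nr_redundant for any erasure-coded
 * stripe the pointer belongs to; bch2_bkey_durability() then sums that over
 * all pointers.  Invented example: two replicas on durability-1 devices,
 * one of which is also in a stripe with nr_redundant = 2, gives
 * 1 + max(1, 2) = 3.
 */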
struct bch_extent_ptr *ptr; + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - if (dev_in_target(ca, target) && - (!ptr->cached || !ptr_stale(ca, ptr))) - return ptr; + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; } - return NULL; + if (force_reconstruct_read(c)) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; } -unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. + */ +int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) { - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; + struct bch_dev *ca; + int ret = 0; - extent_for_each_ptr(e, ptr) - nr_ptrs++; + if (k.k->type == KEY_TYPE_error) + return -EIO; - return nr_ptrs; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + /* + * If there are any dirty pointers it's an error if we can't + * read: + */ + if (!ret && !p.ptr.cached) + ret = -EIO; + + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + continue; + + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f) + p.idx = f->nr_failed < f->nr_retries + ? 
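/*
 * Editor's note: p.idx selects the read strategy — 0 appears to mean "read
 * this replica directly", while higher values fall back to reconstruction
 * from the pointer's erasure-coded stripes.  While a device is within its
 * retry budget (nr_failed < nr_retries) we keep the strategy that failed;
 * once the budget is exhausted we advance to the next one, and pointers
 * with no strategies left (p.idx >= p.ec_nr + 1) are skipped below.
 */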
f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (force_reconstruct_read(c) && + !p.idx && p.ec_nr) + p.idx++; + + if (p.idx >= p.ec_nr + 1) + continue; + + if (ret > 0 && !ptr_better(c, p, *pick)) + continue; + + *pick = p; + ret = 1; + } + + return ret; } -unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - extent_for_each_ptr(e, ptr) - nr_ptrs += !ptr->cached; - break; + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - case BCH_RESERVATION: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; break; + default: + BUG(); } +} - return nr_ptrs; +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, - const struct bch_extent_ptr *ptr) +/* extent specific utility code */ + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) { - struct bch_dev *ca; + const struct bch_extent_ptr *ptr; - if (ptr->cached) - return 0; + extent_for_each_ptr(e, ptr) + if (ptr->dev == dev) + return ptr; - ca = bch_dev_bkey_exists(c, ptr->dev); + return NULL; +} - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - return 0; +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return ca->mi.durability; + if (ca->mi.group && + ca->mi.group - 1 == group) + return ptr; + } + + return NULL; } -unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) { const struct bch_extent_ptr *ptr; - unsigned durability = 0; extent_for_each_ptr(e, ptr) - durability += bch2_extent_ptr_durability(c, ptr); + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return ptr; - return durability; + return NULL; } unsigned bch2_extent_is_compressed(struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; unsigned ret = 0; switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc.compression_type != BCH_COMPRESSION_NONE && - crc.compressed_size < crc.live_size) - ret = max_t(unsigned, ret, crc.compressed_size); + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; + } } return ret; @@ -256,34 +316,67 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) bool bch2_extent_matches_ptr(struct bch_fs *c, 
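/*
 * Editor's note — usage sketch for bch2_bkey_append_ptr() above; the
 * surrounding setup (device, bucket, gen) is invented for illustration:
 *
 *	struct bch_extent_ptr ptr = {
 *		.dev	= ca->dev_idx,
 *		.gen	= gen,
 *		.offset	= bucket_to_sector(ca, bucket),
 *	};
 *
 *	bch2_bkey_append_ptr(&e->k_i, ptr);
 *
 * The helper stamps ptr.type itself and BUG()s on key types that cannot
 * carry pointers; it also EBUG_ON()s if the key already has a pointer to
 * the same device, so callers only fill in dev/gen/offset.
 */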
struct bkey_s_c_extent e, struct bch_extent_ptr m, u64 offset) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) - if (ptr->dev == m.dev && - ptr->gen == m.gen && - (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + extent_for_each_ptr_decode(e, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == (s64) m.offset - offset) - return ptr; + return true; - return NULL; + return false; } -/* Doesn't cleanup redundant crcs */ -void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) { - EBUG_ON(ptr < &e.v->start->ptr || - ptr >= &extent_entry_last(e)->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; } -void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { - __bch2_extent_drop_ptr(e, ptr); - bch2_extent_drop_redundant_crcs(e); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + src = extent_entry_next(to_entry(ptr)); + if (src != ptrs.end && + !extent_entry_is_crc(src)) + drop_crc = false; + + dst = to_entry(ptr); + while ((prev = extent_entry_prev(ptrs, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; + } + + dst = prev; + } + + memmove_u64s_down(dst, src, + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -325,38 +418,38 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, struct bch_extent_crc_unpacked n) { struct bch_extent_crc_unpacked u; - struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; union bch_extent_entry *i; + bool ret = false; /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) + if (!n.csum_type) { extent_for_each_crc(extent_i_to_s(e), u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { n = u; - break; + goto found; } - - if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) return false; - + } +found: BUG_ON(n.compression_type); BUG_ON(n.offset); BUG_ON(n.live_size != e->k.size); - bch2_extent_crc_append(e, n); restart_narrow_pointers: - extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) - if (can_narrow_crc(u, n)) { - ptr->offset += u.offset; - extent_ptr_append(e, *ptr); - __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + if (can_narrow_crc(p.crc, n)) { + bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(e, &p); + ret = true; goto restart_narrow_pointers; } - bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); - return true; + return ret; } /* returns true if not equal 
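 * — i.e. a memcmp()-style convention: false means the two unpacked crcs
 * describe identical checksum parameters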
*/ @@ -373,138 +466,47 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, bch2_crc_cmp(l.csum, r.csum)); } -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) -{ - union bch_extent_entry *entry = e.v->start; - union bch_extent_crc *crc, *prev = NULL; - struct bch_extent_crc_unpacked u, prev_u = { 0 }; - - while (entry != extent_entry_last(e)) { - union bch_extent_entry *next = extent_entry_next(entry); - size_t crc_u64s = extent_entry_u64s(entry); - - if (!extent_entry_is_crc(entry)) - goto next; - - crc = entry_to_crc(entry); - u = bch2_extent_crc_unpack(e.k, crc); - - if (next == extent_entry_last(e)) { - /* crc entry with no pointers after it: */ - goto drop; - } - - if (extent_entry_is_crc(next)) { - /* no pointers before next crc entry: */ - goto drop; - } - - if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) { - /* identical to previous crc entry: */ - goto drop; - } - - if (!prev && - !u.csum_type && - !u.compression_type) { - /* null crc entry: */ - union bch_extent_entry *e2; - - extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { - if (!extent_entry_is_ptr(e2)) - break; - - e2->ptr.offset += u.offset; - } - goto drop; - } - - prev = crc; - prev_u = u; -next: - entry = next; - continue; -drop: - memmove_u64s_down(crc, next, - (u64 *) extent_entry_last(e) - (u64 *) next); - e.k->u64s -= crc_u64s; - } - - EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); -} - -static bool should_drop_ptr(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) -{ - return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr); -} - -static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) -{ - struct bch_extent_ptr *ptr = &e.v->start->ptr; - bool dropped = false; - - while ((ptr = extent_ptr_next(e, ptr))) - if (should_drop_ptr(c, e.c, ptr)) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } else - ptr++; - - if (dropped) - bch2_extent_drop_redundant_crcs(e); -} - -bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) -{ - return bch2_extent_normalize(c, k); -} - void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_ptr: - break; - } + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = 
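/*
 * Editor's note: bch2_ptr_swab() first swab64()s the whole value blindly,
 * which is correct for u64-sized fields but leaves sub-u64 checksum
 * members byte-reversed within their words; the per-entry cases below
 * (this one included) re-swap just those members back into place.
 */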
swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } - break; - } } } static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c_extent e, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr2; struct bch_dev *ca; @@ -516,7 +518,7 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, if (!ca) return "pointer to invalid device"; - extent_for_each_ptr(e, ptr2) + bkey_for_each_ptr(ptrs, ptr2) if (ptr != ptr2 && ptr->dev == ptr2->dev) return "multiple pointers to same device"; @@ -533,236 +535,148 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, return NULL; } -static size_t extent_print_ptrs(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c_extent e) +static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; struct bch_dev *ca; bool first = true; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - extent_for_each_entry(e, entry) { + bkey_extent_entry_for_each(ptrs, entry) { if (!first) - p(" "); + pr_buf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); - break; case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? bch_dev_bkey_exists(c, ptr->dev) : NULL; - p("ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? 
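/*
 * Editor's note — example of the text this produces, with invented
 * values, entries space-separated in key order per the pr_buf() format
 * strings here:
 *
 *	crc: c_size 8 size 16 offset 0 nonce 0 csum 1 compress 2 ptr: 0:3584 gen 7 cached ec: idx 42 block 1
 */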
" stale" : ""); + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; + + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); break; default: - p("(invalid extent entry %.16llx)", *((u64 *) entry)); - goto out; + pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; } first = false; } -out: - if (bkey_extent_is_cached(e.k)) - p(" cached"); -#undef p - return out - buf; } -static inline bool dev_latency_better(struct bch_fs *c, - const struct bch_extent_ptr *ptr1, - const struct bch_extent_ptr *ptr2) -{ - struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - - /* Pick at random, biased in favor of the faster device: */ - - return bch2_rand_range(l1 + l2) > l1; -} +/* Btree ptrs */ -static int extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - struct bch_dev *ca; - int ret = 0; - - extent_for_each_ptr_crc(e, ptr, crc) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - if (ptr->cached && ptr_stale(ca, ptr)) - continue; - - if (avoid && test_bit(ptr->dev, avoid->d)) - continue; - - if (ret && !dev_latency_better(c, ptr, &pick->ptr)) - continue; - - *pick = (struct extent_pick_ptr) { - .ptr = *ptr, - .crc = crc, - }; - - ret = 1; - } - - return ret; -} - -/* Btree ptrs */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - if (bkey_extent_is_cached(k.k)) - return "cached"; - - if (k.k->size) - return "nonzero key size"; + const char *reason; if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - switch (k.k->type) { - case BCH_EXTENT: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; - const char *reason; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (extent_entry_is_crc(entry)) - return "has crc field"; - } - - extent_for_each_ptr(e, ptr) { - reason = extent_ptr_invalid(c, e, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; - } + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; - return NULL; + if (!extent_entry_is_ptr(entry)) + return "has non ptr field"; } - default: - return "invalid value type"; + bkey_for_each_ptr(ptrs, ptr) { + reason = extent_ptr_invalid(c, k, ptr, + c->opts.btree_node_size, + true); + if (reason) + return reason; } + + return NULL; } void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr 
*ptr; - unsigned seq; const char *err; char buf[160]; struct bucket_mark mark; struct bch_dev *ca; - unsigned replicas = 0; - bool bad; - extent_for_each_ptr(e, ptr) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) - continue; + mark = ptr_bucket_mark(ca, ptr); err = "stale"; - if (ptr_stale(ca, ptr)) + if (gen_after(mark.gen, ptr->gen)) goto err; - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - err = "inconsistent"; - if (bad) + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) goto err; } - if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); - bch2_fs_bug(c, - "btree key bad (replicas not marked in superblock):\n%s", - buf); - return; - } - return; err: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " - "gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.counter); + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); } -void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; const char *invalid; -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + bkey_ptrs_to_text(out, c, k); invalid = bch2_btree_ptr_invalid(c, k); if (invalid) - p(" invalid: %s", invalid); -#undef p -} - -int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) -{ - return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, pick); + pr_buf(out, " invalid: %s", invalid); } /* Extents */ -static bool __bch2_cut_front(struct bpos where, struct bkey_s k) +bool __bch2_cut_front(struct bpos where, struct bkey_s k) { u64 len = 0; @@ -780,7 +694,7 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) * cause offset to point to the next bucket: */ if (!len) - __set_bkey_deleted(k.k); + k.k->type = KEY_TYPE_deleted; else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); union bch_extent_entry *entry; @@ -801,6 +715,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) case BCH_EXTENT_ENTRY_crc128: entry->crc128.offset += e.k->size - len; break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } if (extent_entry_is_crc(entry)) @@ -813,11 +729,6 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) return true; } -bool bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - return __bch2_cut_front(where, bkey_i_to_s(k)); -} - bool bch2_cut_back(struct bpos where, struct bkey *k) { u64 len = 0; @@ -835,7 +746,7 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) k->size = len; if (!len) - __set_bkey_deleted(k); + k->type = KEY_TYPE_deleted; return true; } @@ -853,464 +764,178 @@ void bch2_key_resize(struct bkey *k, k->size = new_size; } -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. - */ -static bool __extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) +static bool extent_i_save(struct btree *b, struct bkey_packed *dst, + struct bkey_i *src) { struct bkey_format *f = &b->format; struct bkey_i *dst_unpacked; - bool ret; - - if ((dst_unpacked = packed_to_bkey(dst))) { - dst_unpacked->k = *src; - ret = true; - } else { - ret = bch2_bkey_pack_key(dst, src, f); - } - - if (ret && iter) - bch2_verify_key_order(b, iter, dst); + struct bkey_packed tmp; - return ret; -} - -static void extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) -{ - BUG_ON(!__extent_save(b, iter, dst, src)); -} + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = src->k; + else if (bch2_bkey_pack_key(&tmp, &src->k, f)) + memcpy_u64s(dst, &tmp, f->key_u64s); + else + return false; -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. 
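 *
 * (Editor's note: concretely, when two versions of the same extent meet
 *  here, the tie must break so the newer, later-appended key is processed
 *  first; the overlap pass below then trims or drops the older copy
 *  against it.)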
- */ -#define extent_sort_cmp(h, l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - bkey_cmp(bkey_start_pos(&_ul), \ - bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter_large *iter, - struct btree *b, size_t i) -{ - heap_sift_down(iter, i, extent_sort_cmp); + memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); + return true; } -static inline void extent_sort_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp); -} +static bool bch2_extent_merge_inline(struct bch_fs *, + struct btree_iter *, + struct bkey_packed *, + struct bkey_packed *, + bool); -static void extent_sort_append(struct bch_fs *c, - struct btree *b, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, - struct bkey_packed *k) +static void verify_extent_nonoverlapping(struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) { - struct bkey_format *f = &b->format; - BKEY_PADDED(k) tmp; - - if (bkey_whiteout(k)) - return; - - bch2_bkey_unpack(b, &tmp.k, k); - - if (*prev && - bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) - return; - - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); } +#endif - bkey_copy(*prev, &tmp.k); +#endif } -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +static void verify_modified_extent(struct btree_iter *iter, + struct bkey_packed *k) { - struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *out, *lk, *rk; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, extent_sort_cmp); - - while (!bch2_btree_node_iter_large_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - - if (iter->used == 1) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(iter, _r[0], _r[1]) >= 0) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - - l = __bkey_disassemble(b, lk, &l_unpacked); - r = __bkey_disassemble(b, rk, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - 
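			/* (an extent's k->p is its *end* position, so
			 * <= 0 here means l ends before r begins) */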
extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_sort_next(iter, b, _r); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. - */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); - } else { - __bch2_cut_front(l.k->p, r); - extent_save(b, NULL, rk, r.k); - } - - extent_sort_sift(iter, b, _r - iter->data); - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); - - __bch2_cut_front(r.k->p, l); - extent_save(b, NULL, lk, l.k); - - extent_sort_sift(iter, b, 0); - - extent_sort_append(c, b, &nr, dst->start, &prev, - bkey_to_packed(&tmp.k)); - } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, NULL, lk, l.k); - } - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = dst->start; - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; + bch2_btree_iter_verify(iter, iter->l[0].b); + bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); } -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - struct bch_fs_usage stats; +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter; + struct bkey_packed *k; - /* for deleting: */ - struct bkey_i whiteout; - bool do_journal; - bool deleting; -}; + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); -static void bch2_add_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - struct bch_fs *c = s->trans->c; - struct btree *b = s->insert->iter->l[0].b; + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(l->b, &l->iter, insert); - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); + node_iter = l->iter; + k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) + return; - if (!sectors) + node_iter = l->iter; + k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; - bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq, 0); -} + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); -static void bch2_subtract_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - bch2_add_sectors(s, k, offset, -sectors); + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); + bch2_btree_iter_verify(iter, l->b); } -/* These wrappers subtract exactly the sectors that we're removing from @k */ -static void bch2_cut_subtract_back(struct 
extent_insert_state *s, - struct bpos where, struct bkey_s k) +static inline struct bpos +bch2_extent_atomic_end(struct bkey_i *k, struct btree_iter *iter) { - bch2_subtract_sectors(s, k.s_c, where.offset, - k.k->p.offset - where.offset); - bch2_cut_back(where, k.k); -} + struct btree *b = iter->l[0].b; -static void bch2_cut_subtract_front(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), - where.offset - bkey_start_offset(k.k)); - __bch2_cut_front(where, k); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); + + return bpos_min(k->k.p, b->key.k.p); } -static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) +void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { - if (k.k->size) - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - k.k->size = 0; - __set_bkey_deleted(k.k); + bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k); } -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) - -static enum btree_insert_ret -extent_insert_should_stop(struct extent_insert_state *s) +bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) { - struct btree *b = s->insert->iter->l[0].b; - - /* - * Check if we have sufficient space in both the btree node and the - * journal reservation: - * - * Each insert checks for room in the journal entry, but we check for - * room in the btree node up-front. In the worst case, bkey_cmpxchg() - * will insert two keys, and one iteration of this room will insert one - * key, so we need room for three keys. 
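 *
 * (Editor's note: "room" here means both node space — checked via
 *  bch2_btree_node_insert_fits() — and journal reservation space, per
 *  the two checks below.)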
- */ - if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; - else if (!journal_res_insert_fits(s->trans, s->insert)) - return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ - else - return BTREE_INSERT_OK; + return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p); } -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *trans, + struct btree_insert_entry *insert, + unsigned *u64s) { - struct btree_iter_level *l = &iter->l[0]; - struct bset_tree *t = bset_tree_last(l->b); - struct bkey_packed *where = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); - struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where); - struct bkey_packed *next_live_key = where; - unsigned clobber_u64s; - - if (prev) - where = bkey_next(prev); - - while (next_live_key != btree_bkey_last(l->b, t) && - bkey_deleted(next_live_key)) - next_live_key = bkey_next(next_live_key); + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; /* - * Everything between where and next_live_key is now deleted keys, and - * is overwritten: + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): */ - clobber_u64s = (u64 *) next_live_key - (u64 *) where; - - if (prev && - bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) - goto drop_deleted_keys; - - if (next_live_key != btree_bkey_last(l->b, t) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), - next_live_key, false)) - goto drop_deleted_keys; - - bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, - clobber_u64s, where->u64s); - return; -drop_deleted_keys: - bch2_bset_delete(l->b, where, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - where, clobber_u64s, 0); -} - -static void extent_insert_committed(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = !s->deleting - ? s->insert->k - : &s->whiteout; - BKEY_PADDED(k) split; - - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - - if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) - return; - - if (s->deleting && !s->do_journal) { - bch2_cut_front(s->committed, insert); - goto done; - } - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - bkey_copy(&split.k, insert); - - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - bkey_cmp(s->committed, insert->k.p) && - bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { - /* XXX: possibly need to increase our reservation? 
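 * (Editor's note: the rewritten path handles this up front — see
 * bch2_extent_can_insert(), which reserves the compressed sectors via
 * bch2_disk_reservation_add() before the insert proceeds.)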
*/ - bch2_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - bch2_cut_front(s->committed, insert); - bch2_add_sectors(s, bkey_i_to_s_c(insert), - bkey_start_offset(&insert->k), - insert->k.size); - } else { - bch2_cut_back(s->committed, &split.k.k); - bch2_cut_front(s->committed, insert); - } - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); - - bch2_btree_journal_key(s->trans, iter, &split.k); - - if (!s->deleting) - extent_bset_insert(c, iter, &split.k); -done: - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - - insert->k.needs_whiteout = false; - s->do_journal = false; - s->trans->did_work = true; -} - -static enum btree_insert_ret -__extent_insert_advance_pos(struct extent_insert_state *s, - struct bpos next_pos, - struct bkey_s_c k) -{ - struct extent_insert_hook *hook = s->trans->hook; - enum btree_insert_ret ret; - - if (hook) - ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); - else - ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); - - if (ret == BTREE_INSERT_OK) - s->committed = next_pos; - - return ret; -} - -/* - * Update iter->pos, marking how much of @insert we've processed, and call hook - * fn: - */ -static enum btree_insert_ret -extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) -{ - struct btree *b = s->insert->iter->l[0].b; - struct bpos next_pos = bpos_min(s->insert->k->k.p, - k.k ? k.k->p : b->key.k.p); - enum btree_insert_ret ret; - - if (race_fault()) - return BTREE_INSERT_NEED_TRAVERSE; - - /* hole? */ - if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null); - if (ret != BTREE_INSERT_OK) - return ret; - } + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; - /* avoid redundant calls to hook fn: */ - if (!bkey_cmp(s->committed, next_pos)) + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) return BTREE_INSERT_OK; - return __extent_insert_advance_pos(s, next_pos, k); -} + k = bkey_disassemble(l->b, _k, &unpacked); -static enum btree_insert_ret -extent_insert_check_split_compressed(struct extent_insert_state *s, - struct bkey_s_c k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - unsigned sectors; + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bch2_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - - if (s->trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? 
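/*
 * Editor's note — why a middle overwrite of a compressed extent needs a
 * reservation (numbers invented): a 128-sector extent compressed to 32
 * sectors on disk is split by the overwrite into two fragments, each of
 * which still references the full 32 compressed sectors, so the
 * accounting gains an extra 32-sector charge; that extra copy is
 * reserved here before the split happens.
 */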
BCH_DISK_RESERVATION_NOFAIL : 0; - switch (bch2_disk_reservation_add(c, - s->trans->disk_res, - sectors * bch2_extent_nr_dirty_ptrs(k), - flags)) { + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { case 0: break; case -ENOSPC: return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; default: BUG(); } @@ -1319,78 +944,60 @@ extent_insert_check_split_compressed(struct extent_insert_state *s, return BTREE_INSERT_OK; } -static enum btree_insert_ret -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, +static void +extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - enum btree_insert_ret ret; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ - bch2_cut_subtract_front(s, insert->k.p, k); + __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ - bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); + bch2_cut_back(bkey_start_pos(&insert->k), k.k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); /* * As the auxiliary tree is indexed by the end of the * key and we've just changed the end, update the * auxiliary tree. */ - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { - struct bpos orig_pos = k.k->p; - /* The insert key completely covers k, invalidate k */ if (!bkey_whiteout(k.k)) - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - - bch2_drop_subtract(s, k); - k.k->p = bkey_start_pos(&insert->k); - if (!__extent_save(b, node_iter, _k, k.k)) { - /* - * Couldn't repack: we aren't necessarily able - * to repack if the new key is outside the range - * of the old extent, so we have to split - * @insert: - */ - k.k->p = orig_pos; - extent_save(b, node_iter, _k, k.k); - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret != BTREE_INSERT_OK) - return ret; - - extent_insert_committed(s); - /* - * We split and inserted upto at k.k->p - that - * has to coincide with iter->pos, so that we - * don't have anything more we have to insert - * until we recheck our journal reservation: - */ - EBUG_ON(bkey_cmp(s->committed, k.k->p)); + btree_account_key_drop(l->b, _k); + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; + + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, u64s, 0); + bch2_btree_iter_verify(iter, l->b); } else { - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + extent_save(l->b, _k, k.k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + verify_modified_extent(iter, _k); } break; @@ -1412,172 +1019,111 @@ 
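/*
 * Editor's note — the four overlap cases handled below (insert = the
 * incoming key, k = the existing key):
 *
 *	FRONT:	insert overlaps the start of k	-> cut k's front
 *	BACK:	insert overlaps the end of k	-> cut k's back
 *	ALL:	insert covers k entirely	-> drop/whiteout k
 *	MIDDLE:	insert sits inside k		-> split k around it
 */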
extent_squash(struct extent_insert_state *s, struct bkey_i *insert, * what k points to) */ bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); + split.k.k.needs_whiteout |= bkey_written(l->b, _k); bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); - bch2_cut_subtract_front(s, insert->k.p, k); + __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); + verify_modified_extent(iter, _k); - bch2_add_sectors(s, bkey_i_to_s_c(&split.k), - bkey_start_offset(&split.k.k), - split.k.k.size); extent_bset_insert(c, iter, &split.k); break; } } - - return BTREE_INSERT_OK; } -static enum btree_insert_ret -__bch2_delete_fixup_extent(struct extent_insert_state *s) +struct extent_insert_state { + struct bkey_i whiteout; + bool update_journal; + bool update_btree; + bool deleting; +}; + +static void __bch2_insert_fixup_extent(struct bch_fs *c, + struct btree_iter *iter, + struct bkey_i *insert, + struct extent_insert_state *s) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; struct bkey_packed *_k; struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - s->whiteout = *insert; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - if (bkey_whiteout(k.k)) { - s->committed = bpos_min(insert->k.p, k.k->p); + if (!bkey_whiteout(k.k)) + s->update_journal = true; + + if (!s->update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &s->whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); goto next; } - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) + /* + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: + */ + if (s->deleting && + !s->update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + _k->type = KEY_TYPE_discard; + reserve_whiteout(l->b, _k); + } break; + } - s->do_journal = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, t, _k); - } else if (k.k->needs_whiteout || - bset_written(b, bset(b, t))) { - struct bkey_i discard = *insert; 
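		/*
		 * Editor's note: this branch is taken when the overwritten
		 * key has already been written to disk (or was already
		 * marked needs_whiteout), so dropping it in memory isn't
		 * enough — a discard/whiteout key covering exactly the
		 * overlapped range is inserted to shadow the on-disk copy.
		 */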
- - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch2_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch2_cut_back(k.k->p, &discard.k); - break; - default: - break; - } + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { + insert->k.needs_whiteout = true; + s->update_btree = true; + } - discard.k.needs_whiteout = true; + if (s->update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(l->b, _k); + _k->needs_whiteout = false; + } - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); + extent_squash(c, iter, insert, _k, k, overlap); - extent_bset_insert(c, iter, &discard); - } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - } + if (!s->update_btree) + bch2_cut_front(cur_end, insert); next: - bch2_cut_front(s->committed, insert); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; } - return ret; -} + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + { + struct btree_node_iter node_iter = l->iter; -static enum btree_insert_ret -__bch2_insert_fixup_extent(struct extent_insert_state *s) -{ - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - - if (!k.k->size) - goto squash; - - /* - * Only call advance pos & call hook for nonzero size extents: - */ - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) - break; - - if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, t)))) - insert->k.needs_whiteout = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); - _k->needs_whiteout = false; - } -squash: - ret = extent_squash(s, insert, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) - break; - } - - return ret; -} + while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0) + l->iter = node_iter; + } +} /** * bch_extent_insert_fixup - insert a new extent and deal with overlaps @@ -1618,165 +1164,122 @@ squash: * If the end of iter->pos is not the same as the end of insert, then * key insertion needs to continue/be retried. 
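 *
 * (Editor's note: in the rewritten version below this retry no longer
 *  happens — callers are expected to trim the insert to an atomic unit
 *  with bch2_extent_trim_atomic() first, and bch2_insert_fixup_extent()
 *  then consumes the whole key, applying whatever btree and journal
 *  updates it accumulated in extent_insert_state.)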
*/ -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - enum btree_insert_ret ret = BTREE_INSERT_OK; - + struct btree_iter *iter = insert->iter; struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = insert->iter->pos, + .whiteout = *insert->k, + .update_journal = !bkey_whiteout(&insert->k->k), + .update_btree = !bkey_whiteout(&insert->k->k), .deleting = bkey_whiteout(&insert->k->k), }; + BKEY_PADDED(k) tmp; EBUG_ON(iter->level); - EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); - - /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: - */ + EBUG_ON(!insert->k->k.size); EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - - ret = !s.deleting - ? __bch2_insert_fixup_extent(&s) - : __bch2_delete_fixup_extent(&s); + __bch2_insert_fixup_extent(c, iter, insert->k, &s); - if (ret == BTREE_INSERT_OK && - bkey_cmp(s.committed, insert->k->k.p) < 0) - ret = extent_insert_advance_pos(&s, bkey_s_c_null); + bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p); - extent_insert_committed(&s); + if (s.update_btree) { + bkey_copy(&tmp.k, insert->k); - if (s.deleting) - bch2_cut_front(iter->pos, insert->k); + if (s.deleting) + tmp.k.k.type = KEY_TYPE_discard; +#if 0 + /* disabled due to lock recursion - mark_lock: */ + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->l[0].b, + bkey_i_to_s_c(&tmp.k)); +#endif + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - /* - * Subtract any remaining sectors from @insert, if we bailed out early - * and didn't fully insert @insert: - */ - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - insert->k->k.size) - bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); + extent_bset_insert(c, iter, &tmp.k); + } - bch2_fs_usage_apply(c, &s.stats, trans->disk_res, - gc_pos_btree_node(b)); + if (s.update_journal) { + bkey_copy(&tmp.k, !s.deleting ? 
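	/*
	 * Editor's note: when deleting, the key journalled is the whiteout
	 * copy — re-typed to KEY_TYPE_discard just below — rather than the
	 * (possibly already trimmed) insert key.
	 */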
insert->k : &s.whiteout); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); + if (s.deleting) + tmp.k.k.type = KEY_TYPE_discard; - if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0), - "ret %u insert->k.size %u", ret, insert->k->k.size); + bch2_btree_journal_key(trans, iter, &tmp.k); + } - return ret; + bch2_cut_front(insert->k->k.p, insert->k); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - if (!k.k->size) - return "zero key size"; + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + unsigned size_ondisk = e.k->size; + const char *reason; + unsigned nonce = UINT_MAX; - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; + if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + extent_for_each_entry(e, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) { - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; + reason = extent_ptr_invalid(c, e.s_c, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - size_ondisk = crc.compressed_size; + if (crc.offset + e.k->size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; + size_ondisk = crc.compressed_size; - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - } else { - ptr = entry_to_ptr(entry); + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } - - return NULL; } - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = 
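/*
 * Editor's aside (an illustrative sketch, not part of this diff): the nonce
 * handling in the new bch2_extent_invalid() above requires every encrypted
 * crc entry of an extent to agree on crc.offset + crc.nonce, using UINT_MAX
 * as a "not seen yet" sentinel. The same walk over a hypothetical flat
 * array of crc descriptors:
 *
 *	#include <limits.h>
 *
 *	struct crc_info { unsigned offset, nonce; int encrypted; };
 *
 *	static const char *check_nonces(const struct crc_info *crcs, unsigned nr)
 *	{
 *		unsigned i, want = UINT_MAX;
 *
 *		for (i = 0; i < nr; i++) {
 *			if (!crcs[i].encrypted)
 *				continue;
 *
 *			if (want == UINT_MAX)
 *				want = crcs[i].offset + crcs[i].nonce;
 *			else if (want != crcs[i].offset + crcs[i].nonce)
 *				return "incorrect nonce";
 *		}
 *
 *		return NULL;
 *	}
 */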
bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; - } - - default: - return "invalid value type"; - } + return NULL; } -static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, - struct bkey_s_c_extent e) +void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) { - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket_mark mark; - unsigned seq, stale; + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; char buf[160]; - bool bad; - unsigned replicas = 0; /* * XXX: we should be doing most/all of these checks at startup time, @@ -1787,108 +1290,54 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; - - stale = 0; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - /* between mark and bucket gen */ - smp_rmb(); - - stale = ptr_stale(ca, ptr); - - bch2_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); - - bch2_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); - - if (stale) - break; - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_USER || - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors)); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - if (bad) - goto bad_ptr; - } + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch2_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; - } - if (!bkey_extent_is_cached(e.k) && - !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), e.s_c); - bch2_fs_bug(c, - "extent key bad (replicas not marked in superblock):\n%s", - buf); - return; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? 
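/*
 * Editor's aside (an illustrative sketch, not part of this diff): the
 * staleness test above relies on gen_after() comparing 8-bit bucket
 * generations that are allowed to wrap. A standalone version consistent
 * with the "stale > 96" sanity check would be:
 *
 *	static unsigned gen_after(unsigned char a, unsigned char b)
 *	{
 *		signed char r = (signed char) (a - b);
 *
 *		return r > 0 ? r : 0;
 *	}
 *
 * e.g. gen_after(2, 255) == 3: the bucket generation has advanced three
 * times since the pointer was created, so a dirty pointer here would be
 * stale; the 8-bit subtraction makes the comparison wrap-safe.
 */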
mark.cached_sectors + : mark.dirty_sectors; + + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } - - return; - -bad_ptr: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i type %u", buf, - PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); - return; } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); - break; - case BCH_RESERVATION: - break; - default: - BUG(); - } -} - -void bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; const char *invalid; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + bkey_ptrs_to_text(out, c, k); invalid = bch2_extent_invalid(c, k); if (invalid) - p(" invalid: %s", invalid); -#undef p + pr_buf(out, " invalid: %s", invalid); } static void bch2_extent_crc_init(union bch_extent_crc *crc, @@ -1943,25 +1392,45 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc, void bch2_extent_crc_append(struct bkey_i_extent *e, struct bch_extent_crc_unpacked new) { - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); + __extent_entry_push(e); +} - BUG_ON(new.compressed_size > new.uncompressed_size); - BUG_ON(new.live_size != e->k.size); - BUG_ON(!new.compressed_size || !new.uncompressed_size); +static inline void __extent_entry_insert(struct bkey_i_extent *e, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); - /* - * Look up the last crc entry, so we can check if we need to add - * another: - */ - extent_for_each_crc(extent_i_to_s(e), crc, i) - ; + memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + e->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} - if (!bch2_crc_unpacked_cmp(crc, new)) - return; +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, + struct extent_ptr_decoded *p) +{ + struct bch_extent_crc_unpacked crc; + union bch_extent_entry *pos; + unsigned i; - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + extent_for_each_crc(extent_i_to_s(e), crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(e, p->crc); + pos = extent_entry_last(extent_i_to_s(e)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ptr)); + + for (i = 0; i < p->ec_nr; i++) { + p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + } } /* @@ -1974,41 +1443,17 @@ void 
bch2_extent_crc_append(struct bkey_i_extent *e, */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bkey_s_extent e; - - switch (k.k->type) { - case KEY_TYPE_ERROR: - return false; - - case KEY_TYPE_DELETED: - case KEY_TYPE_COOKIE: - return true; - - case KEY_TYPE_DISCARD: - return bversion_zero(k.k->version); - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_to_extent(k); + struct bch_extent_ptr *ptr; - bch2_extent_drop_stale(c, e); + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - if (!bkey_val_u64s(e.k)) { - if (bkey_extent_is_cached(e.k)) { - k.k->type = KEY_TYPE_DISCARD; - if (bversion_zero(k.k->version)) - return true; - } else { - k.k->type = KEY_TYPE_ERROR; - } - } + /* will only happen if all pointers were cached: */ + if (!bkey_val_u64s(k.k)) + k.k->type = KEY_TYPE_deleted; - return false; - case BCH_RESERVATION: - return false; - default: - BUG(); - } + return false; } void bch2_extent_mark_replicas_cached(struct bch_fs *c, @@ -2016,140 +1461,66 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, unsigned target, unsigned nr_desired_replicas) { - struct bch_extent_ptr *ptr; - int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && - !dev_in_target(c->devs[ptr->dev], target)) { - ptr->cached = true; + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; extra -= n; } } if (extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { - ptr->cached = true; + entry->ptr.cached = true; extra -= n; } } } -/* - * This picks a non-stale pointer, preferably from a device other than @avoid. - * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to - * other devices, it will still pick a pointer from avoid. 
- */ -int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) -{ - int ret; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_COOKIE: - return 0; - - case KEY_TYPE_ERROR: - return -EIO; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), - avoid, pick); - - if (!ret && !bkey_extent_is_cached(k.k)) - ret = -EIO; - - return ret; - - case BCH_RESERVATION: - return 0; - - default: - BUG(); - } -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, +enum merge_result bch2_extent_merge(struct bch_fs *c, struct bkey_i *l, struct bkey_i *r) { - struct bkey_s_extent el, er; + struct bkey_s_extent el = bkey_i_to_s_extent(l); + struct bkey_s_extent er = bkey_i_to_s_extent(r); union bch_extent_entry *en_l, *en_r; - if (key_merging_disabled(c)) + if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) return BCH_MERGE_NOMERGE; - /* - * Generic header checks - * Assumes left and right are in order - * Left and right must be exactly aligned - */ - - if (l->k.u64s != r->k.u64s || - l->k.type != r->k.type || - bversion_cmp(l->k.version, r->k.version) || - bkey_cmp(l->k.p, bkey_start_pos(&r->k))) - return BCH_MERGE_NOMERGE; - - switch (l->k.type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_ERROR: - /* These types are mergeable, and no val to check */ - break; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - el = bkey_i_to_s_extent(l); - er = bkey_i_to_s_extent(r); - - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; - - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) - return BCH_MERGE_NOMERGE; + extent_for_each_entry(el, en_l) { + struct bch_extent_ptr *lp, *rp; + struct bch_dev *ca; - lp = &en_l->ptr; - rp = &en_r->ptr; + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + if ((extent_entry_type(en_l) != + extent_entry_type(en_r)) || + !extent_entry_is_ptr(en_l)) + return BCH_MERGE_NOMERGE; - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + lp = &en_l->ptr; + rp = &en_r->ptr; - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - } + if (lp->offset + el.k->size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; - break; - case BCH_RESERVATION: { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) return BCH_MERGE_NOMERGE; - break; - } - default: - return BCH_MERGE_NOMERGE; } l->k.needs_whiteout |= r->k.needs_whiteout; @@ -2168,130 +1539,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, return BCH_MERGE_MERGE; } -static void extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); - - /* - * We don't want the bch2_verify_key_order() call in extent_save(), - * 
because we may be out of order with deleted keys that are about to be - * removed by extent_bset_insert() - */ - - if ((dst_unpacked = packed_to_bkey(dst))) - bkey_copy(dst_unpacked, src); - else - BUG_ON(!bch2_bkey_pack(dst, src, f)); -} - -static bool extent_merge_one_overlapping(struct btree_iter *iter, - struct bpos new_pos, - struct bset_tree *t, - struct bkey_packed *k, struct bkey uk, - bool check, bool could_pack) -{ - struct btree_iter_level *l = &iter->l[0]; - - BUG_ON(!bkey_deleted(k)); - - if (check) { - return !bkey_packed(k) || could_pack; - } else { - uk.p = new_pos; - extent_save(l->b, &l->iter, k, &uk); - bch2_bset_fix_invalidated_key(l->b, t, k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - k, k->u64s, k->u64s); - return true; - } -} - -static bool extent_merge_do_overlapping(struct btree_iter *iter, - struct bkey *m, bool back_merge) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bset_tree *t; - struct bkey_packed *k; - struct bkey uk; - struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); - bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); - bool check = true; - - /* - * @m is the new merged extent: - * - * The merge took place in the last bset; we know there can't be any 0 - * size extents overlapping with m there because if so they would have - * been between the two extents we merged. - * - * But in the other bsets, we have to check for and fix such extents: - */ -do_fixup: - for_each_bset(b, t) { - if (t == bset_tree_last(b)) - break; - - /* - * if we don't find this bset in the iterator we already got to - * the end of that bset, so start searching from the end. - */ - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - if (!k) - continue; - - if (back_merge) { - /* - * Back merge: 0 size extents will be before the key - * that was just inserted (and thus the iterator - * position) - walk backwards to find them - */ - for (; - k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(m)) > 0); - k = bch2_bkey_prev_all(b, t, k)) { - if (bkey_cmp(uk.p, m->p) >= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } else { - /* Front merge - walk forwards */ - for (; - k != btree_bkey_last(b, t) && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, m->p) < 0); - k = bkey_next(k)) { - if (bkey_cmp(uk.p, - bkey_start_pos(m)) <= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } - } - - if (check) { - check = false; - goto do_fixup; - } - - return true; -} - /* * When merging an extent that we're inserting into a btree node, the new merged * extent could overlap with an existing 0 size extent - if we don't fix that, @@ -2308,13 +1555,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, { struct btree *b = iter->l[0].b; struct btree_node_iter *node_iter = &iter->l[0].iter; - const struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *m; - BKEY_PADDED(k) li; - BKEY_PADDED(k) ri; - struct bkey_i *mi; - struct bkey tmp; + BKEY_PADDED(k) li, ri; + struct bkey_packed *m = back_merge ? l : r; + struct bkey_i *mi = back_merge ? 
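/*
 * Editor's aside (an illustrative sketch, not part of this diff): the
 * mergeability rules enforced by bch2_extent_merge() above come down to
 * "each right-hand pointer continues exactly where its left-hand partner
 * stops, on the same device and generation, without crossing a bucket
 * boundary". With hypothetical flat fields and a fixed bucket size in
 * sectors standing in for PTR_BUCKET_NR():
 *
 *	struct fake_ptr { unsigned long long offset; unsigned dev, gen; };
 *
 *	static int ptrs_mergeable(struct fake_ptr l, unsigned l_size,
 *				  struct fake_ptr r,
 *				  unsigned long long bucket_sectors)
 *	{
 *		if (l.offset + l_size != r.offset ||
 *		    l.dev != r.dev ||
 *		    l.gen != r.gen)
 *			return 0;
 *
 *		return l.offset / bucket_sectors ==
 *		       r.offset / bucket_sectors;	// no bucket straddling
 *	}
 *
 * The surrounding bch2_extent_merge_inline() additionally has to check that
 * the merged key will not overlap deleted keys in the node, since that
 * would break the btree node's sort order.
 */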
&li.k : &ri.k; + struct bset_tree *t = bch2_bkey_to_bset(b, m); + enum merge_result ret; + + EBUG_ON(bkey_written(b, m)); /* * We need to save copies of both l and r, because we might get a @@ -2323,65 +1570,58 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); - m = back_merge ? l : r; - mi = back_merge ? &li.k : &ri.k; + ret = bch2_bkey_merge(c, &li.k, &ri.k); + if (ret == BCH_MERGE_NOMERGE) + return false; - /* l & r should be in last bset: */ - EBUG_ON(bch2_bkey_to_bset(b, m) != t); + /* + * check if we overlap with deleted extents - would break the sort + * order: + */ + if (back_merge) { + struct bkey_packed *n = bkey_next(m); - switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { - case BCH_MERGE_NOMERGE: - return false; - case BCH_MERGE_PARTIAL: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) + if (n != btree_bkey_last(b, t) && + bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && + bkey_deleted(n)) return false; + } else if (ret == BCH_MERGE_MERGE) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + if (prev && + bkey_cmp_left_packed_byval(b, prev, + bkey_start_pos(&li.k.k)) > 0) return false; + } - extent_i_save(b, m, mi); - bch2_bset_fix_invalidated_key(b, t, m); - - /* - * Update iterator to reflect what we just inserted - otherwise, - * the iter_fix() call is going to put us _before_ the key we - * just partially merged with: - */ - if (back_merge) - bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); - - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); + if (ret == BCH_MERGE_PARTIAL) { + if (!extent_i_save(b, m, mi)) + return false; if (!back_merge) bkey_copy(packed_to_bkey(l), &li.k); else bkey_copy(packed_to_bkey(r), &ri.k); - return false; - case BCH_MERGE_MERGE: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) - return false; - - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + } else { + if (!extent_i_save(b, m, &li.k)) return false; + } - extent_i_save(b, m, &li.k); - bch2_bset_fix_invalidated_key(b, t, m); + bch2_bset_fix_invalidated_key(b, m); + bch2_btree_node_iter_fix(iter, b, node_iter, + m, m->u64s, m->u64s); + verify_modified_extent(iter, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); - return true; - default: - BUG(); - } + return ret == BCH_MERGE_MERGE; } -int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) { struct btree_iter iter; struct bpos end = pos; struct bkey_s_c k; - int ret = 0; + bool ret = true; end.offset += size; @@ -2390,8 +1630,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (!bch2_extent_is_fully_allocated(k)) { - ret = -ENOSPC; + if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { + ret = false; break; } } @@ -2399,3 +1639,77 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) return ret; } + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; + 
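/*
 * Editor's aside (an illustrative sketch, not part of this diff): with the
 * new nr_replicas argument, bch2_check_range_allocated() above asks "does
 * every extent overlapping [pos, pos + size) already carry at least
 * nr_replicas allocated pointers?", using the bch2_bkey_nr_ptrs_allocated()
 * helper being defined here. The same predicate over a hypothetical flat
 * array of extents:
 *
 *	struct fake_extent {
 *		unsigned long long	start, end;
 *		unsigned		nr_allocated;
 *	};
 *
 *	static int range_allocated(const struct fake_extent *e, unsigned nr,
 *				   unsigned long long pos,
 *				   unsigned long long size,
 *				   unsigned nr_replicas)
 *	{
 *		unsigned long long end = pos + size;
 *		unsigned i;
 *
 *		for (i = 0; i < nr; i++) {
 *			if (e[i].end <= pos || e[i].start >= end)
 *				continue;	// no overlap
 *			if (e[i].nr_allocated < nr_replicas)
 *				return 0;
 *		}
 *
 *		return 1;
 *	}
 *
 * A caller can use this to decide whether a write into the range needs a
 * fresh disk reservation.
 */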
break; + } + case KEY_TYPE_reservation: + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } + + return ret; +} + +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_i *l, struct bkey_i *r) +{ + struct bkey_i_reservation *li = bkey_i_to_reservation(l); + struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + + if (li->v.generation != ri->v.generation || + li->v.nr_replicas != ri->v.nr_replicas) + return BCH_MERGE_NOMERGE; + + l->k.needs_whiteout |= r->k.needs_whiteout; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { + bch2_key_resize(&l->k, KEY_SIZE_MAX); + bch2_cut_front(l->k.p, r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(&l->k, l->k.size + r->k.size); + + return BCH_MERGE_MERGE; +}
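/*
 * Editor's aside (an illustrative sketch, not part of this diff): the
 * KEY_SIZE_MAX clamp in bch2_reservation_merge() turns an oversized merge
 * into a partial one: l grows to the maximum representable size and r keeps
 * whatever is left over. Isolating the size arithmetic from the bkey
 * plumbing (FAKE_KEY_SIZE_MAX is a stand-in for the real field width):
 *
 *	#define FAKE_KEY_SIZE_MAX	((1U << 20) - 1)
 *
 *	// Returns l's merged size; *r_size is reduced to what remains in r.
 *	static unsigned merge_sizes(unsigned l_size, unsigned *r_size)
 *	{
 *		unsigned merged;
 *
 *		if ((unsigned long long) l_size + *r_size > FAKE_KEY_SIZE_MAX) {
 *			*r_size -= FAKE_KEY_SIZE_MAX - l_size;
 *			return FAKE_KEY_SIZE_MAX;	// BCH_MERGE_PARTIAL
 *		}
 *
 *		merged = l_size + *r_size;
 *		*r_size = 0;
 *		return merged;				// BCH_MERGE_MERGE
 *	}
 *
 * In the real code the same split is expressed with bch2_key_resize() on l
 * and bch2_cut_front() on r.
 */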