/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Code for managing the extent btree and dynamically updating the writeback
 * dirty sector count.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "btree_update_interior.h"

#include <trace/events/bcachefs.h>
28 static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
29 struct bkey_i *, struct bkey_i *);
31 static void sort_key_next(struct btree_node_iter_large *iter,
33 struct btree_node_iter_set *i)
35 i->k += __btree_node_offset_to_key(b, i->k)->u64s;
38 *i = iter->data[--iter->used];
/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
48 #define key_sort_cmp(h, l, r) \
51 __btree_node_offset_to_key(b, (l).k), \
52 __btree_node_offset_to_key(b, (r).k)) \
57 static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
60 struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
61 struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
70 key_sort_cmp(iter, r[0], r[1]) >= 0)
/*
 * key_sort_cmp() ensures that when keys compare equal the older key
 * comes first; so if l->k compares equal to r->k then l->k is older
 * and should be dropped.
 */
78 return !bkey_cmp_packed(b,
79 __btree_node_offset_to_key(b, l->k),
80 __btree_node_offset_to_key(b, r->k));
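/*
 * Rough sketch of the effect (illustrative, not from the original source):
 * if an older bset and a newer bset both contain a key at the same position,
 *
 *	bset 0 (older):  A@8  B@16
 *	bset 1 (newer):       B'@16
 *
 * the older B reaches the top of the heap first, should_drop_next_key() sees
 * that it compares equal to B' and drops it, so the merged output is A, B'.
 */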
83 struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
85 struct btree_node_iter_large *iter)
87 struct bkey_packed *out = dst->start;
88 struct btree_nr_keys nr;
90 memset(&nr, 0, sizeof(nr));
92 heap_resort(iter, key_sort_cmp);
94 while (!bch2_btree_node_iter_large_end(iter)) {
95 if (!should_drop_next_key(iter, b)) {
96 struct bkey_packed *k =
97 __btree_node_offset_to_key(b, iter->data->k);
100 btree_keys_account_key_add(&nr, 0, out);
101 out = bkey_next(out);
104 sort_key_next(iter, b, iter->data);
105 heap_sift_down(iter, 0, key_sort_cmp);
108 dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
112 /* Common among btree and extent ptrs */
114 const struct bch_extent_ptr *
115 bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
117 const struct bch_extent_ptr *ptr;
119 extent_for_each_ptr(e, ptr)
126 bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
128 struct bch_extent_ptr *ptr;
129 bool dropped = false;
131 extent_for_each_ptr_backwards(e, ptr)
132 if (ptr->dev == dev) {
133 __bch2_extent_drop_ptr(e, ptr);
138 bch2_extent_drop_redundant_crcs(e);
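/*
 * Illustrative usage sketch (assuming e is a struct bkey_i_extent *, as
 * elsewhere in this file): when a device is being removed, every pointer to
 * it can be dropped with
 *
 *	bch2_extent_drop_device(extent_i_to_s(e), dev_idx);
 *
 * which returns true if a pointer was actually dropped;
 * bch2_extent_has_device() is the corresponding membership test on a
 * read-only (bkey_s_c_extent) key.
 */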
142 const struct bch_extent_ptr *
143 bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
145 const struct bch_extent_ptr *ptr;
147 extent_for_each_ptr(e, ptr) {
148 struct bch_dev *ca = c->devs[ptr->dev];
151 ca->mi.group - 1 == group)
158 const struct bch_extent_ptr *
159 bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
161 const struct bch_extent_ptr *ptr;
163 extent_for_each_ptr(e, ptr)
164 if (dev_in_target(c->devs[ptr->dev], target))
170 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
172 const struct bch_extent_ptr *ptr;
173 unsigned nr_ptrs = 0;
175 extent_for_each_ptr(e, ptr)
181 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
183 struct bkey_s_c_extent e;
184 const struct bch_extent_ptr *ptr;
185 unsigned nr_ptrs = 0;
189 case BCH_EXTENT_CACHED:
190 e = bkey_s_c_to_extent(k);
192 extent_for_each_ptr(e, ptr)
193 nr_ptrs += !ptr->cached;
196 case BCH_RESERVATION:
197 nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
204 unsigned bch2_extent_ptr_durability(struct bch_fs *c,
205 const struct bch_extent_ptr *ptr)
212 ca = bch_dev_bkey_exists(c, ptr->dev);
214 if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
217 return ca->mi.durability;
220 unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
222 const struct bch_extent_ptr *ptr;
223 unsigned durability = 0;
225 extent_for_each_ptr(e, ptr)
226 durability += bch2_extent_ptr_durability(c, ptr);
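/*
 * Illustrative note: extent durability is just the sum of the per-pointer
 * durabilities above, e.g. two pointers on devices with mi.durability == 1
 * give bch2_extent_durability() == 2. Callers such as
 * bch2_extent_mark_replicas_cached() compare this against the desired number
 * of replicas.
 */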
231 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
233 struct bkey_s_c_extent e;
234 const struct bch_extent_ptr *ptr;
235 struct bch_extent_crc_unpacked crc;
240 case BCH_EXTENT_CACHED:
241 e = bkey_s_c_to_extent(k);
243 extent_for_each_ptr_crc(e, ptr, crc)
245 crc.compression_type != BCH_COMPRESSION_NONE &&
246 crc.compressed_size < crc.live_size)
247 ret = max_t(unsigned, ret, crc.compressed_size);
253 bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
254 struct bch_extent_ptr m, u64 offset)
256 const struct bch_extent_ptr *ptr;
257 struct bch_extent_crc_unpacked crc;
259 extent_for_each_ptr_crc(e, ptr, crc)
260 if (ptr->dev == m.dev &&
262 (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
263 (s64) m.offset - offset)
/* Doesn't clean up redundant crcs */
270 void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
272 EBUG_ON(ptr < &e.v->start->ptr ||
273 ptr >= &extent_entry_last(e)->ptr);
274 EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
275 memmove_u64s_down(ptr, ptr + 1,
276 (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
277 e.k->u64s -= sizeof(*ptr) / sizeof(u64);
280 void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
282 __bch2_extent_drop_ptr(e, ptr);
283 bch2_extent_drop_redundant_crcs(e);
286 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
287 struct bch_extent_crc_unpacked n)
289 return !u.compression_type &&
291 u.uncompressed_size > u.live_size &&
292 bch2_csum_type_is_encryption(u.csum_type) ==
293 bch2_csum_type_is_encryption(n.csum_type);
296 bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
297 struct bch_extent_crc_unpacked n)
299 struct bch_extent_crc_unpacked crc;
300 const union bch_extent_entry *i;
305 extent_for_each_crc(e, crc, i)
306 if (can_narrow_crc(crc, n))
/*
 * We're writing another replica for this extent, so while we've got the data in
 * memory we'll be computing a new checksum for the currently live data.
 *
 * If there are other replicas we aren't moving, and they are checksummed but
 * not compressed, we can modify them to point to only the data that is
 * currently live (so that readers won't have to bounce) while we've got the
 * checksum we need:
 */
321 bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
322 struct bch_extent_crc_unpacked n)
324 struct bch_extent_crc_unpacked u;
325 struct bch_extent_ptr *ptr;
326 union bch_extent_entry *i;
328 /* Find a checksum entry that covers only live data: */
330 extent_for_each_crc(extent_i_to_s(e), u, i)
331 if (!u.compression_type &&
333 u.live_size == u.uncompressed_size) {
338 if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
341 BUG_ON(n.compression_type);
343 BUG_ON(n.live_size != e->k.size);
345 bch2_extent_crc_append(e, n);
346 restart_narrow_pointers:
347 extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
348 if (can_narrow_crc(u, n)) {
349 ptr->offset += u.offset;
350 extent_ptr_append(e, *ptr);
351 __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
352 goto restart_narrow_pointers;
355 bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
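/*
 * Worked example (illustrative): suppose a pointer's existing crc u covers
 * uncompressed_size == 16 sectors, of which only live_size == 8 are still
 * live, starting at crc offset 4. If the new crc n checksums exactly those 8
 * live sectors, can_narrow_crc() allows the switch: the pointer is advanced
 * by u.offset (4 sectors) so it points straight at the live data, re-appended
 * under n, and readers no longer have to read and bounce the 8 dead sectors.
 */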
359 void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
361 union bch_extent_entry *entry = e.v->start;
362 union bch_extent_crc *crc, *prev = NULL;
363 struct bch_extent_crc_unpacked u, prev_u;
365 while (entry != extent_entry_last(e)) {
366 union bch_extent_entry *next = extent_entry_next(entry);
367 size_t crc_u64s = extent_entry_u64s(entry);
369 if (!extent_entry_is_crc(entry))
372 crc = entry_to_crc(entry);
373 u = bch2_extent_crc_unpack(e.k, crc);
375 if (next == extent_entry_last(e)) {
376 /* crc entry with no pointers after it: */
380 if (extent_entry_is_crc(next)) {
381 /* no pointers before next crc entry: */
385 if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
386 /* identical to previous crc entry: */
392 !u.compression_type) {
393 /* null crc entry: */
394 union bch_extent_entry *e2;
396 extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
397 if (!extent_entry_is_ptr(e2))
400 e2->ptr.offset += u.offset;
411 memmove_u64s_down(crc, next,
412 (u64 *) extent_entry_last(e) - (u64 *) next);
413 e.k->u64s -= crc_u64s;
416 EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
419 static bool should_drop_ptr(const struct bch_fs *c,
420 struct bkey_s_c_extent e,
421 const struct bch_extent_ptr *ptr)
423 return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
426 static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
428 struct bch_extent_ptr *ptr = &e.v->start->ptr;
429 bool dropped = false;
431 while ((ptr = extent_ptr_next(e, ptr)))
432 if (should_drop_ptr(c, e.c, ptr)) {
433 __bch2_extent_drop_ptr(e, ptr);
439 bch2_extent_drop_redundant_crcs(e);
442 static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
445 return bch2_extent_normalize(c, k);
448 static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
452 case BCH_EXTENT_CACHED: {
453 union bch_extent_entry *entry;
454 u64 *d = (u64 *) bkeyp_val(f, k);
457 for (i = 0; i < bkeyp_val_u64s(f, k); i++)
460 for (entry = (union bch_extent_entry *) d;
461 entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
462 entry = extent_entry_next(entry)) {
463 switch (extent_entry_type(entry)) {
464 case BCH_EXTENT_ENTRY_crc32:
465 entry->crc32.csum = swab32(entry->crc32.csum);
467 case BCH_EXTENT_ENTRY_crc64:
468 entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
469 entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
471 case BCH_EXTENT_ENTRY_crc128:
472 entry->crc128.csum.hi = (__force __le64)
473 swab64((__force u64) entry->crc128.csum.hi);
474 entry->crc128.csum.lo = (__force __le64)
475 swab64((__force u64) entry->crc128.csum.lo);
477 case BCH_EXTENT_ENTRY_ptr:
486 static const char *extent_ptr_invalid(const struct bch_fs *c,
487 struct bkey_s_c_extent e,
488 const struct bch_extent_ptr *ptr,
489 unsigned size_ondisk,
492 const struct bch_extent_ptr *ptr2;
495 if (ptr->dev >= c->sb.nr_devices ||
497 return "pointer to invalid device";
499 ca = bch_dev_bkey_exists(c, ptr->dev);
501 return "pointer to invalid device";
503 extent_for_each_ptr(e, ptr2)
504 if (ptr != ptr2 && ptr->dev == ptr2->dev)
505 return "multiple pointers to same device";
507 if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
508 return "offset past end of device";
510 if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
511 return "offset before first bucket";
513 if (bucket_remainder(ca, ptr->offset) +
514 size_ondisk > ca->mi.bucket_size)
515 return "spans multiple buckets";
520 static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
521 size_t size, struct bkey_s_c_extent e)
523 char *out = buf, *end = buf + size;
524 const union bch_extent_entry *entry;
525 struct bch_extent_crc_unpacked crc;
526 const struct bch_extent_ptr *ptr;
530 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
532 extent_for_each_entry(e, entry) {
536 switch (__extent_entry_type(entry)) {
537 case BCH_EXTENT_ENTRY_crc32:
538 case BCH_EXTENT_ENTRY_crc64:
539 case BCH_EXTENT_ENTRY_crc128:
540 crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
542 p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
544 crc.uncompressed_size,
545 crc.offset, crc.nonce,
547 crc.compression_type);
549 case BCH_EXTENT_ENTRY_ptr:
550 ptr = entry_to_ptr(entry);
551 ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
552 ? bch_dev_bkey_exists(c, ptr->dev)
555 p("ptr: %u:%llu gen %u%s%s", ptr->dev,
556 (u64) ptr->offset, ptr->gen,
557 ptr->cached ? " cached" : "",
558 ca && ptr_stale(ca, ptr)
562 p("(invalid extent entry %.16llx)", *((u64 *) entry));
569 if (bkey_extent_is_cached(e.k))
575 static inline bool dev_latency_better(struct bch_dev *dev1,
576 struct bch_dev *dev2)
578 unsigned l1 = atomic_read(&dev1->latency[READ]);
579 unsigned l2 = atomic_read(&dev2->latency[READ]);
581 /* Pick at random, biased in favor of the faster device: */
583 return bch2_rand_range(l1 + l2) > l1;
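/*
 * Illustrative example: with l1 == 100 and l2 == 300 (in whatever units the
 * latency estimates are kept in), bch2_rand_range(400) > 100 roughly 75% of
 * the time, so the faster dev1 wins the comparison about three times out of
 * four.
 */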
586 static void extent_pick_read_device(struct bch_fs *c,
587 struct bkey_s_c_extent e,
588 struct bch_devs_mask *avoid,
589 struct extent_pick_ptr *pick)
591 const struct bch_extent_ptr *ptr;
592 struct bch_extent_crc_unpacked crc;
594 extent_for_each_ptr_crc(e, ptr, crc) {
595 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
597 if (ptr->cached && ptr_stale(ca, ptr))
600 if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
604 if (test_bit(ca->dev_idx, avoid->d))
608 test_bit(pick->ca->dev_idx, avoid->d))
612 if (pick->ca && !dev_latency_better(ca, pick->ca))
615 if (!percpu_ref_tryget(&ca->io_ref))
619 percpu_ref_put(&pick->ca->io_ref);
621 *pick = (struct extent_pick_ptr) {
631 static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
634 if (bkey_extent_is_cached(k.k))
638 return "nonzero key size";
640 if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
641 return "value too big";
645 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
646 const union bch_extent_entry *entry;
647 const struct bch_extent_ptr *ptr;
650 extent_for_each_entry(e, entry) {
651 if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
652 return "invalid extent entry type";
654 if (extent_entry_is_crc(entry))
655 return "has crc field";
658 extent_for_each_ptr(e, ptr) {
659 reason = extent_ptr_invalid(c, e, ptr,
660 c->opts.btree_node_size,
670 return "invalid value type";
674 static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
677 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
678 const struct bch_extent_ptr *ptr;
682 struct bucket_mark mark;
684 unsigned replicas = 0;
687 extent_for_each_ptr(e, ptr) {
688 ca = bch_dev_bkey_exists(c, ptr->dev);
691 if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
695 if (ptr_stale(ca, ptr))
699 seq = read_seqcount_begin(&c->gc_pos_lock);
700 mark = ptr_bucket_mark(ca, ptr);
702 bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
703 (mark.data_type != BCH_DATA_BTREE ||
704 mark.dirty_sectors < c->opts.btree_node_size);
705 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
707 err = "inconsistent";
712 if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
713 bch2_bkey_val_to_text(c, btree_node_type(b),
714 buf, sizeof(buf), k);
716 "btree key bad (replicas not marked in superblock):\n%s",
723 bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
724 bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
726 err, buf, PTR_BUCKET_NR(ca, ptr),
727 mark.gen, (unsigned) mark.counter);
730 static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
731 size_t size, struct bkey_s_c k)
733 char *out = buf, *end = buf + size;
736 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
738 if (bkey_extent_is_data(k.k))
739 out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
741 invalid = bch2_btree_ptr_invalid(c, k);
743 p(" invalid: %s", invalid);
747 struct extent_pick_ptr
748 bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
749 struct bch_devs_mask *avoid)
751 struct extent_pick_ptr pick = { .ca = NULL };
753 extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
759 const struct bkey_ops bch2_bkey_btree_ops = {
760 .key_invalid = bch2_btree_ptr_invalid,
761 .key_debugcheck = btree_ptr_debugcheck,
762 .val_to_text = bch2_btree_ptr_to_text,
763 .swab = bch2_ptr_swab,
768 static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
772 if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
775 EBUG_ON(bkey_cmp(where, k.k->p) > 0);
777 len = k.k->p.offset - where.offset;
779 BUG_ON(len > k.k->size);
782 * Don't readjust offset if the key size is now 0, because that could
783 * cause offset to point to the next bucket:
786 __set_bkey_deleted(k.k);
787 else if (bkey_extent_is_data(k.k)) {
788 struct bkey_s_extent e = bkey_s_to_extent(k);
789 union bch_extent_entry *entry;
790 bool seen_crc = false;
792 extent_for_each_entry(e, entry) {
793 switch (extent_entry_type(entry)) {
794 case BCH_EXTENT_ENTRY_ptr:
796 entry->ptr.offset += e.k->size - len;
798 case BCH_EXTENT_ENTRY_crc32:
799 entry->crc32.offset += e.k->size - len;
801 case BCH_EXTENT_ENTRY_crc64:
802 entry->crc64.offset += e.k->size - len;
804 case BCH_EXTENT_ENTRY_crc128:
805 entry->crc128.offset += e.k->size - len;
809 if (extent_entry_is_crc(entry))
819 bool bch2_cut_front(struct bpos where, struct bkey_i *k)
821 return __bch2_cut_front(where, bkey_i_to_s(k));
824 bool bch2_cut_back(struct bpos where, struct bkey *k)
828 if (bkey_cmp(where, k->p) >= 0)
831 EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
833 len = where.offset - bkey_start_offset(k);
835 BUG_ON(len > k->size);
841 __set_bkey_deleted(k);
/*
 * bch2_key_resize - adjust size of @k
 *
 * bkey_start_offset(k) will be preserved, modifies where the extent ends
 */
851 void bch2_key_resize(struct bkey *k,
854 k->p.offset -= k->size;
855 k->p.offset += new_size;
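/*
 * Illustrative example: an extent covering sectors [16, 24) has k->size == 8
 * and k->p.offset == 24 (keys are indexed by where they end). After
 * bch2_key_resize(k, 4) the start offset is still 16, but the key now ends at
 * 20, i.e. it covers [16, 20).
 */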
860 * In extent_sort_fix_overlapping(), insert_fixup_extent(),
861 * extent_merge_inline() - we're modifying keys in place that are packed. To do
862 * that we have to unpack the key, modify the unpacked key - then this
863 * copies/repacks the unpacked to the original as necessary.
865 static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
866 struct bkey_packed *dst, struct bkey *src)
868 struct bkey_format *f = &b->format;
869 struct bkey_i *dst_unpacked;
872 if ((dst_unpacked = packed_to_bkey(dst))) {
873 dst_unpacked->k = *src;
876 ret = bch2_bkey_pack_key(dst, src, f);
880 bch2_verify_key_order(b, iter, dst);
885 static void extent_save(struct btree *b, struct btree_node_iter *iter,
886 struct bkey_packed *dst, struct bkey *src)
888 BUG_ON(!__extent_save(b, iter, dst, src));
892 * If keys compare equal, compare by pointer order:
894 * Necessary for sort_fix_overlapping() - if there are multiple keys that
895 * compare equal in different sets, we have to process them newest to oldest.
897 #define extent_sort_cmp(h, l, r) \
899 struct bkey _ul = bkey_unpack_key(b, \
900 __btree_node_offset_to_key(b, (l).k)); \
901 struct bkey _ur = bkey_unpack_key(b, \
902 __btree_node_offset_to_key(b, (r).k)); \
904 bkey_cmp(bkey_start_pos(&_ul), \
905 bkey_start_pos(&_ur)) ?: (r).k - (l).k; \
908 static inline void extent_sort_sift(struct btree_node_iter_large *iter,
909 struct btree *b, size_t i)
911 heap_sift_down(iter, i, extent_sort_cmp);
914 static inline void extent_sort_next(struct btree_node_iter_large *iter,
916 struct btree_node_iter_set *i)
918 sort_key_next(iter, b, i);
919 heap_sift_down(iter, i - iter->data, extent_sort_cmp);
922 static void extent_sort_append(struct bch_fs *c,
924 struct btree_nr_keys *nr,
925 struct bkey_packed *start,
926 struct bkey_packed **prev,
927 struct bkey_packed *k)
929 struct bkey_format *f = &b->format;
932 if (bkey_whiteout(k))
935 bch2_bkey_unpack(b, &tmp.k, k);
938 bch2_extent_merge(c, b, (void *) *prev, &tmp.k))
942 bch2_bkey_pack(*prev, (void *) *prev, f);
944 btree_keys_account_key_add(nr, 0, *prev);
945 *prev = bkey_next(*prev);
950 bkey_copy(*prev, &tmp.k);
953 struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
956 struct btree_node_iter_large *iter)
958 struct bkey_format *f = &b->format;
959 struct btree_node_iter_set *_l = iter->data, *_r;
960 struct bkey_packed *prev = NULL, *out, *lk, *rk;
961 struct bkey l_unpacked, r_unpacked;
963 struct btree_nr_keys nr;
965 memset(&nr, 0, sizeof(nr));
967 heap_resort(iter, extent_sort_cmp);
969 while (!bch2_btree_node_iter_large_end(iter)) {
970 lk = __btree_node_offset_to_key(b, _l->k);
972 if (iter->used == 1) {
973 extent_sort_append(c, b, &nr, dst->start, &prev, lk);
974 extent_sort_next(iter, b, _l);
979 if (iter->used > 2 &&
980 extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
983 rk = __btree_node_offset_to_key(b, _r->k);
985 l = __bkey_disassemble(b, lk, &l_unpacked);
986 r = __bkey_disassemble(b, rk, &r_unpacked);
988 /* If current key and next key don't overlap, just append */
989 if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
990 extent_sort_append(c, b, &nr, dst->start, &prev, lk);
991 extent_sort_next(iter, b, _l);
995 /* Skip 0 size keys */
997 extent_sort_next(iter, b, _r);
1002 * overlap: keep the newer key and trim the older key so they
1003 * don't overlap. comparing pointers tells us which one is
1004 * newer, since the bsets are appended one after the other.
1007 /* can't happen because of comparison func */
1008 BUG_ON(_l->k < _r->k &&
1009 !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
1011 if (_l->k > _r->k) {
1012 /* l wins, trim r */
1013 if (bkey_cmp(l.k->p, r.k->p) >= 0) {
1014 sort_key_next(iter, b, _r);
1016 __bch2_cut_front(l.k->p, r);
1017 extent_save(b, NULL, rk, r.k);
1020 extent_sort_sift(iter, b, _r - iter->data);
1021 } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
1025 * r wins, but it overlaps in the middle of l - split l:
1027 bkey_reassemble(&tmp.k, l.s_c);
1028 bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
1030 __bch2_cut_front(r.k->p, l);
1031 extent_save(b, NULL, lk, l.k);
1033 extent_sort_sift(iter, b, 0);
1035 extent_sort_append(c, b, &nr, dst->start, &prev,
1036 bkey_to_packed(&tmp.k));
1038 bch2_cut_back(bkey_start_pos(r.k), l.k);
1039 extent_save(b, NULL, lk, l.k);
1044 bch2_bkey_pack(prev, (void *) prev, f);
1045 btree_keys_account_key_add(&nr, 0, prev);
1046 out = bkey_next(prev);
1051 dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
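/*
 * Illustrative example of the overlap handling above: if an older bset
 * contains A = [0, 16) and a newer bset contains B = [4, 8), B wins the
 * overlapping range. A is split around B: [0, 4) is emitted, B is emitted
 * when it reaches the top of the heap, and the remaining [8, 16) of A stays
 * in the heap to be emitted (or trimmed again) later.
 */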
1055 struct extent_insert_state {
1056 struct btree_insert *trans;
1057 struct btree_insert_entry *insert;
1058 struct bpos committed;
1059 struct bch_fs_usage stats;
1062 struct bkey_i whiteout;
1067 static void bch2_add_sectors(struct extent_insert_state *s,
1068 struct bkey_s_c k, u64 offset, s64 sectors)
1070 struct bch_fs *c = s->trans->c;
1071 struct btree *b = s->insert->iter->l[0].b;
1073 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
1078 bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
1079 &s->stats, s->trans->journal_res.seq, 0);
1082 static void bch2_subtract_sectors(struct extent_insert_state *s,
1083 struct bkey_s_c k, u64 offset, s64 sectors)
1085 bch2_add_sectors(s, k, offset, -sectors);
1088 /* These wrappers subtract exactly the sectors that we're removing from @k */
1089 static void bch2_cut_subtract_back(struct extent_insert_state *s,
1090 struct bpos where, struct bkey_s k)
1092 bch2_subtract_sectors(s, k.s_c, where.offset,
1093 k.k->p.offset - where.offset);
1094 bch2_cut_back(where, k.k);
1097 static void bch2_cut_subtract_front(struct extent_insert_state *s,
1098 struct bpos where, struct bkey_s k)
1100 bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
1101 where.offset - bkey_start_offset(k.k));
1102 __bch2_cut_front(where, k);
1105 static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
1108 bch2_subtract_sectors(s, k.s_c,
1109 bkey_start_offset(k.k), k.k->size);
1111 __set_bkey_deleted(k.k);
1114 static bool bch2_extent_merge_inline(struct bch_fs *,
1115 struct btree_iter *,
1116 struct bkey_packed *,
1117 struct bkey_packed *,
1120 #define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC)
1122 static enum btree_insert_ret
1123 extent_insert_should_stop(struct extent_insert_state *s)
1125 struct btree *b = s->insert->iter->l[0].b;
/*
 * Check if we have sufficient space in both the btree node and the
 * journal reservation:
 *
 * Each insert checks for room in the journal entry, but we check for
 * room in the btree node up-front. In the worst case, bkey_cmpxchg()
 * will insert two keys, and one iteration of this loop will insert one
 * key, so we need room for three keys.
 */
1136 if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
1137 return BTREE_INSERT_BTREE_NODE_FULL;
1138 else if (!journal_res_insert_fits(s->trans, s->insert))
1139 return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
1141 return BTREE_INSERT_OK;
1144 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
1145 struct bkey_i *insert)
1147 struct btree_iter_level *l = &iter->l[0];
1148 struct bset_tree *t = bset_tree_last(l->b);
1149 struct bkey_packed *where =
1150 bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
1151 struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where);
1152 struct bkey_packed *next_live_key = where;
1153 unsigned clobber_u64s;
1156 where = bkey_next(prev);
1158 while (next_live_key != btree_bkey_last(l->b, t) &&
1159 bkey_deleted(next_live_key))
1160 next_live_key = bkey_next(next_live_key);
/*
 * Everything between where and next_live_key is now deleted keys, and
 * is either overwritten by the insert or dropped below:
 */
1166 clobber_u64s = (u64 *) next_live_key - (u64 *) where;
1169 bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
1170 goto drop_deleted_keys;
1172 if (next_live_key != btree_bkey_last(l->b, t) &&
1173 bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
1174 next_live_key, false))
1175 goto drop_deleted_keys;
1177 bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
1178 bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
1179 clobber_u64s, where->u64s);
1182 bch2_bset_delete(l->b, where, clobber_u64s);
1183 bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
1184 where, clobber_u64s, 0);
1187 static void extent_insert_committed(struct extent_insert_state *s)
1189 struct bch_fs *c = s->trans->c;
1190 struct btree_iter *iter = s->insert->iter;
1191 struct bkey_i *insert = !s->deleting
1194 BKEY_PADDED(k) split;
1196 EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
1197 EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
1199 if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
1202 if (s->deleting && !s->do_journal) {
1203 bch2_cut_front(s->committed, insert);
1207 EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
1209 bkey_copy(&split.k, insert);
1211 if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
1212 bkey_cmp(s->committed, insert->k.p) &&
1213 bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
1214 /* XXX: possibly need to increase our reservation? */
1215 bch2_cut_subtract_back(s, s->committed,
1216 bkey_i_to_s(&split.k));
1217 bch2_cut_front(s->committed, insert);
1218 bch2_add_sectors(s, bkey_i_to_s_c(insert),
1219 bkey_start_offset(&insert->k),
1222 bch2_cut_back(s->committed, &split.k.k);
1223 bch2_cut_front(s->committed, insert);
1226 if (debug_check_bkeys(c))
1227 bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
1229 bch2_btree_journal_key(s->trans, iter, &split.k);
1232 extent_bset_insert(c, iter, &split.k);
1234 bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
1236 insert->k.needs_whiteout = false;
1237 s->do_journal = false;
1238 s->trans->did_work = true;
1241 static enum btree_insert_ret
1242 __extent_insert_advance_pos(struct extent_insert_state *s,
1243 struct bpos next_pos,
1246 struct extent_insert_hook *hook = s->trans->hook;
1247 enum btree_insert_ret ret;
1250 ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
1252 ret = BTREE_INSERT_OK;
1254 EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
1256 if (ret == BTREE_INSERT_OK)
1257 s->committed = next_pos;
/*
 * Update iter->pos, marking how much of @insert we've processed, and call the
 * hook fn:
 */
1266 static enum btree_insert_ret
1267 extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
1269 struct btree *b = s->insert->iter->l[0].b;
1270 struct bpos next_pos = bpos_min(s->insert->k->k.p,
1271 k.k ? k.k->p : b->key.k.p);
1272 enum btree_insert_ret ret;
1275 return BTREE_INSERT_NEED_TRAVERSE;
1278 if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
1279 ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
1281 if (ret != BTREE_INSERT_OK)
1285 /* avoid redundant calls to hook fn: */
1286 if (!bkey_cmp(s->committed, next_pos))
1287 return BTREE_INSERT_OK;
1289 return __extent_insert_advance_pos(s, next_pos, k);
1292 static enum btree_insert_ret
1293 extent_insert_check_split_compressed(struct extent_insert_state *s,
1295 enum bch_extent_overlap overlap)
1297 struct bch_fs *c = s->trans->c;
1300 if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
1301 (sectors = bch2_extent_is_compressed(k))) {
1302 int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
1304 if (s->trans->flags & BTREE_INSERT_NOFAIL)
1305 flags |= BCH_DISK_RESERVATION_NOFAIL;
1307 switch (bch2_disk_reservation_add(c,
1309 sectors * bch2_extent_nr_dirty_ptrs(k),
1314 return BTREE_INSERT_ENOSPC;
1316 return BTREE_INSERT_NEED_GC_LOCK;
1322 return BTREE_INSERT_OK;
1325 static enum btree_insert_ret
1326 extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
1327 struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
1328 enum bch_extent_overlap overlap)
1330 struct bch_fs *c = s->trans->c;
1331 struct btree_iter *iter = s->insert->iter;
1332 struct btree_iter_level *l = &iter->l[0];
1333 struct btree *b = l->b;
1334 struct btree_node_iter *node_iter = &l->iter;
1335 enum btree_insert_ret ret;
1338 case BCH_EXTENT_OVERLAP_FRONT:
1339 /* insert overlaps with start of k: */
1340 bch2_cut_subtract_front(s, insert->k.p, k);
1341 BUG_ON(bkey_deleted(k.k));
1342 extent_save(b, node_iter, _k, k.k);
1345 case BCH_EXTENT_OVERLAP_BACK:
1346 /* insert overlaps with end of k: */
1347 bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
1348 BUG_ON(bkey_deleted(k.k));
1349 extent_save(b, node_iter, _k, k.k);
/*
 * As the auxiliary tree is indexed by the end of the
 * key and we've just changed the end, update the
 * auxiliary search tree:
 */
1356 bch2_bset_fix_invalidated_key(b, t, _k);
1357 bch2_btree_node_iter_fix(iter, b, node_iter, t,
1358 _k, _k->u64s, _k->u64s);
1361 case BCH_EXTENT_OVERLAP_ALL: {
1362 struct bpos orig_pos = k.k->p;
1364 /* The insert key completely covers k, invalidate k */
1365 if (!bkey_whiteout(k.k))
1366 btree_keys_account_key_drop(&b->nr,
1369 bch2_drop_subtract(s, k);
1370 k.k->p = bkey_start_pos(&insert->k);
1371 if (!__extent_save(b, node_iter, _k, k.k)) {
/*
 * Couldn't repack: we aren't necessarily able
 * to repack if the new key is outside the range
 * of the old extent, so we have to split
 * @insert:
 */
k.k->p = orig_pos;
extent_save(b, node_iter, _k, k.k);
1381 ret = extent_insert_advance_pos(s, k.s_c);
1382 if (ret != BTREE_INSERT_OK)
1385 extent_insert_committed(s);
/*
 * We split and inserted up to k.k->p - that
 * has to coincide with iter->pos, so that we
 * don't have anything more we have to insert
 * until we recheck our journal reservation:
 */
1392 EBUG_ON(bkey_cmp(s->committed, k.k->p));
1394 bch2_bset_fix_invalidated_key(b, t, _k);
1395 bch2_btree_node_iter_fix(iter, b, node_iter, t,
1396 _k, _k->u64s, _k->u64s);
1401 case BCH_EXTENT_OVERLAP_MIDDLE: {
1402 BKEY_PADDED(k) split;
/*
 * The insert key falls 'in the middle' of k
 * The insert key splits k in 3:
 * - start only in k, preserve
 * - middle common section, invalidate in k
 * - end only in k, preserve
 *
 * We update the old key to preserve the start,
 * insert will be the new common section,
 * we manually insert the end that we are preserving.
 *
 * modify k _before_ doing the insert (which will move
 * what k points to):
 */
1417 bkey_reassemble(&split.k, k.s_c);
1418 split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
1420 bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
1421 BUG_ON(bkey_deleted(&split.k.k));
1423 bch2_cut_subtract_front(s, insert->k.p, k);
1424 BUG_ON(bkey_deleted(k.k));
1425 extent_save(b, node_iter, _k, k.k);
1427 bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
1428 bkey_start_offset(&split.k.k),
1430 extent_bset_insert(c, iter, &split.k);
1435 return BTREE_INSERT_OK;
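/*
 * Illustrative summary of the overlap cases handled above, for an existing
 * key k = [8, 24) and an insert i:
 *
 *	i = [4, 12)  FRONT:  k is cut to [12, 24)
 *	i = [20, 28) BACK:   k is cut to [8, 20) (and the auxiliary tree fixed
 *	                     up, since keys are indexed by their end)
 *	i = [4, 28)  ALL:    k is dropped entirely
 *	i = [12, 16) MIDDLE: k is cut to [16, 24) and a copy covering [8, 12)
 *	                     is inserted separately
 */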
1438 static enum btree_insert_ret
1439 bch2_delete_fixup_extent(struct extent_insert_state *s)
1441 struct bch_fs *c = s->trans->c;
1442 struct btree_iter *iter = s->insert->iter;
1443 struct btree_iter_level *l = &iter->l[0];
1444 struct btree *b = l->b;
1445 struct btree_node_iter *node_iter = &l->iter;
1446 struct bkey_packed *_k;
1447 struct bkey unpacked;
1448 struct bkey_i *insert = s->insert->k;
1449 enum btree_insert_ret ret = BTREE_INSERT_OK;
1451 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
1453 s->whiteout = *insert;
1454 s->do_journal = false;
1456 while (bkey_cmp(s->committed, insert->k.p) < 0 &&
1457 (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
1458 (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
1459 struct bset_tree *t = bch2_bkey_to_bset(b, _k);
1460 struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
1461 enum bch_extent_overlap overlap;
1463 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
1464 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1466 if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
1469 if (bkey_whiteout(k.k)) {
1470 s->committed = bpos_min(insert->k.p, k.k->p);
1474 overlap = bch2_extent_overlap(&insert->k, k.k);
1476 ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
1477 if (ret != BTREE_INSERT_OK)
1480 ret = extent_insert_advance_pos(s, k.s_c);
1484 s->do_journal = true;
1486 if (overlap == BCH_EXTENT_OVERLAP_ALL) {
1487 btree_keys_account_key_drop(&b->nr,
1489 bch2_subtract_sectors(s, k.s_c,
1490 bkey_start_offset(k.k), k.k->size);
1491 _k->type = KEY_TYPE_DISCARD;
1492 reserve_whiteout(b, t, _k);
1493 } else if (k.k->needs_whiteout ||
1494 bset_written(b, bset(b, t))) {
1495 struct bkey_i discard = *insert;
1498 case BCH_EXTENT_OVERLAP_FRONT:
1499 bch2_cut_front(bkey_start_pos(k.k), &discard);
1501 case BCH_EXTENT_OVERLAP_BACK:
1502 bch2_cut_back(k.k->p, &discard.k);
1508 discard.k.needs_whiteout = true;
1510 ret = extent_squash(s, insert, t, _k, k, overlap);
1511 BUG_ON(ret != BTREE_INSERT_OK);
1513 extent_bset_insert(c, iter, &discard);
1515 ret = extent_squash(s, insert, t, _k, k, overlap);
1516 BUG_ON(ret != BTREE_INSERT_OK);
1519 bch2_cut_front(s->committed, insert);
1520 bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
1523 if (ret == BTREE_INSERT_OK &&
1524 bkey_cmp(s->committed, insert->k.p) < 0)
1525 ret = extent_insert_advance_pos(s, bkey_s_c_null);
1527 extent_insert_committed(s);
1529 bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
1530 gc_pos_btree_node(b));
1532 EBUG_ON(bkey_cmp(iter->pos, s->committed));
1533 EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
1534 !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
1536 bch2_cut_front(iter->pos, insert);
1538 if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
1539 ret = BTREE_INSERT_NEED_TRAVERSE;
1541 EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
/*
 * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
 *
 * this may result in not actually doing the insert, or inserting some subset
 * of the insert key. For cmpxchg operations this is where that logic lives.
 *
 * All subsets of @insert that need to be inserted are inserted using
 * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
 * returns early, setting @iter->pos to the prefix of @insert that actually got
 * inserted.
 *
 * BSET INVARIANTS: this function is responsible for maintaining all the
 * invariants for bsets of extents in memory. things get really hairy with 0
 * size extents
 *
 * within one bset:
 *
 * bkey_start_pos(bkey_next(k)) >= k
 * or bkey_start_offset(bkey_next(k)) >= k->offset
 *
 * i.e. strict ordering, no overlapping extents.
 *
 * multiple bsets (i.e. full btree node):
 *
 * ∀ k, j
 *   k.size != 0 ∧ j.size != 0 →
 *     ¬ (k > bkey_start_pos(j) ∧ k < j)
 *
 * i.e. no two overlapping keys _of nonzero size_
1576 * We can't realistically maintain this invariant for zero size keys because of
1577 * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
1578 * there may be another 0 size key between them in another bset, and it will
1579 * thus overlap with the merged key.
1581 * In addition, the end of iter->pos indicates how much has been processed.
1582 * If the end of iter->pos is not the same as the end of insert, then
1583 * key insertion needs to continue/be retried.
1585 enum btree_insert_ret
1586 bch2_insert_fixup_extent(struct btree_insert *trans,
1587 struct btree_insert_entry *insert)
1589 struct bch_fs *c = trans->c;
1590 struct btree_iter *iter = insert->iter;
1591 struct btree_iter_level *l = &iter->l[0];
1592 struct btree *b = l->b;
1593 struct btree_node_iter *node_iter = &l->iter;
1594 struct bkey_packed *_k;
1595 struct bkey unpacked;
1596 enum btree_insert_ret ret = BTREE_INSERT_OK;
1598 struct extent_insert_state s = {
1601 .committed = insert->iter->pos,
1602 .deleting = bkey_whiteout(&insert->k->k),
1605 EBUG_ON(iter->level);
1606 EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
1609 return bch2_delete_fixup_extent(&s);
1612 * As we process overlapping extents, we advance @iter->pos both to
1613 * signal to our caller (btree_insert_key()) how much of @insert->k has
1614 * been inserted, and also to keep @iter->pos consistent with
1615 * @insert->k and the node iterator that we're advancing:
1617 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1619 if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
1620 bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
1621 bkey_start_offset(&insert->k->k),
1624 while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
1625 (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
1626 (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
1627 struct bset_tree *t = bch2_bkey_to_bset(b, _k);
1628 struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
1629 enum bch_extent_overlap overlap;
1631 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1632 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1634 if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
1637 overlap = bch2_extent_overlap(&insert->k->k, k.k);
1639 ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
1640 if (ret != BTREE_INSERT_OK)
1647 * Only call advance pos & call hook for nonzero size extents:
1649 ret = extent_insert_advance_pos(&s, k.s_c);
1650 if (ret != BTREE_INSERT_OK)
1654 (k.k->needs_whiteout || bset_written(b, bset(b, t))))
1655 insert->k->k.needs_whiteout = true;
1657 if (overlap == BCH_EXTENT_OVERLAP_ALL &&
1658 bkey_whiteout(k.k) &&
1659 k.k->needs_whiteout) {
1660 unreserve_whiteout(b, t, _k);
1661 _k->needs_whiteout = false;
1664 ret = extent_squash(&s, insert->k, t, _k, k, overlap);
1665 if (ret != BTREE_INSERT_OK)
1669 if (ret == BTREE_INSERT_OK &&
1670 bkey_cmp(s.committed, insert->k->k.p) < 0)
1671 ret = extent_insert_advance_pos(&s, bkey_s_c_null);
1673 extent_insert_committed(&s);
1675 * Subtract any remaining sectors from @insert, if we bailed out early
1676 * and didn't fully insert @insert:
1678 if (insert->k->k.size &&
1679 !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
1680 bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
1681 bkey_start_offset(&insert->k->k),
1684 bch2_fs_usage_apply(c, &s.stats, trans->disk_res,
1685 gc_pos_btree_node(b));
1687 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1688 EBUG_ON(bkey_cmp(iter->pos, s.committed));
1689 EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
1690 !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
1692 if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
1693 ret = BTREE_INSERT_NEED_TRAVERSE;
1695 EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
1700 static const char *bch2_extent_invalid(const struct bch_fs *c,
1703 if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
1704 return "value too big";
1707 return "zero key size";
1709 switch (k.k->type) {
1711 case BCH_EXTENT_CACHED: {
1712 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
1713 const union bch_extent_entry *entry;
1714 struct bch_extent_crc_unpacked crc;
1715 const struct bch_extent_ptr *ptr;
1716 unsigned size_ondisk = e.k->size;
1718 unsigned nonce = UINT_MAX;
1720 extent_for_each_entry(e, entry) {
1721 if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
1722 return "invalid extent entry type";
1724 if (extent_entry_is_crc(entry)) {
1725 crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
1727 if (crc.offset + e.k->size >
1728 crc.uncompressed_size)
1729 return "checksum offset + key size > uncompressed size";
1731 size_ondisk = crc.compressed_size;
1733 if (!bch2_checksum_type_valid(c, crc.csum_type))
1734 return "invalid checksum type";
1736 if (crc.compression_type >= BCH_COMPRESSION_NR)
1737 return "invalid compression type";
1739 if (bch2_csum_type_is_encryption(crc.csum_type)) {
1740 if (nonce == UINT_MAX)
1741 nonce = crc.offset + crc.nonce;
1742 else if (nonce != crc.offset + crc.nonce)
1743 return "incorrect nonce";
1746 ptr = entry_to_ptr(entry);
1748 reason = extent_ptr_invalid(c, e, &entry->ptr,
1749 size_ondisk, false);
1758 case BCH_RESERVATION: {
1759 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
1761 if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
1762 return "incorrect value size";
1764 if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
1765 return "invalid nr_replicas";
1771 return "invalid value type";
1775 static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
1776 struct bkey_s_c_extent e)
1778 const struct bch_extent_ptr *ptr;
1780 struct bucket_mark mark;
1781 unsigned seq, stale;
1784 unsigned replicas = 0;
1787 * XXX: we should be doing most/all of these checks at startup time,
1788 * where we check bch2_bkey_invalid() in btree_node_read_done()
1790 * But note that we can't check for stale pointers or incorrect gc marks
1791 * until after journal replay is done (it might be an extent that's
1792 * going to get overwritten during replay)
1795 extent_for_each_ptr(e, ptr) {
1796 ca = bch_dev_bkey_exists(c, ptr->dev);
1800 * If journal replay hasn't finished, we might be seeing keys
1801 * that will be overwritten by the time journal replay is done:
1803 if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
1809 seq = read_seqcount_begin(&c->gc_pos_lock);
1810 mark = ptr_bucket_mark(ca, ptr);
1812 /* between mark and bucket gen */
1815 stale = ptr_stale(ca, ptr);
1817 bch2_fs_bug_on(stale && !ptr->cached, c,
1818 "stale dirty pointer");
1820 bch2_fs_bug_on(stale > 96, c,
1821 "key too stale: %i",
1827 bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
1828 (mark.data_type != BCH_DATA_USER ||
1830 ? mark.cached_sectors
1831 : mark.dirty_sectors));
1832 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
1838 if (replicas > BCH_REPLICAS_MAX) {
1839 bch2_bkey_val_to_text(c, btree_node_type(b), buf,
1840 sizeof(buf), e.s_c);
1842 "extent key bad (too many replicas: %u): %s",
1847 if (!bkey_extent_is_cached(e.k) &&
1848 !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
1849 bch2_bkey_val_to_text(c, btree_node_type(b),
1850 buf, sizeof(buf), e.s_c);
1852 "extent key bad (replicas not marked in superblock):\n%s",
1860 bch2_bkey_val_to_text(c, btree_node_type(b), buf,
1861 sizeof(buf), e.s_c);
1862 bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
1863 "gen %i type %u", buf,
1864 PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
1868 static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
1871 switch (k.k->type) {
1873 case BCH_EXTENT_CACHED:
1874 bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
1876 case BCH_RESERVATION:
1883 static void bch2_extent_to_text(struct bch_fs *c, char *buf,
1884 size_t size, struct bkey_s_c k)
1886 char *out = buf, *end = buf + size;
1887 const char *invalid;
1889 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
1891 if (bkey_extent_is_data(k.k))
1892 out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
1894 invalid = bch2_extent_invalid(c, k);
1896 p(" invalid: %s", invalid);
1900 static void bch2_extent_crc_init(union bch_extent_crc *crc,
1901 struct bch_extent_crc_unpacked new)
1903 #define common_fields(_crc) \
1904 .csum_type = _crc.csum_type, \
1905 .compression_type = _crc.compression_type, \
1906 ._compressed_size = _crc.compressed_size - 1, \
1907 ._uncompressed_size = _crc.uncompressed_size - 1, \
1908 .offset = _crc.offset
1910 if (bch_crc_bytes[new.csum_type] <= 4 &&
1911 new.uncompressed_size <= CRC32_SIZE_MAX &&
1912 new.nonce <= CRC32_NONCE_MAX) {
1913 crc->crc32 = (struct bch_extent_crc32) {
1914 .type = 1 << BCH_EXTENT_ENTRY_crc32,
1916 .csum = *((__le32 *) &new.csum.lo),
1921 if (bch_crc_bytes[new.csum_type] <= 10 &&
1922 new.uncompressed_size <= CRC64_SIZE_MAX &&
1923 new.nonce <= CRC64_NONCE_MAX) {
1924 crc->crc64 = (struct bch_extent_crc64) {
1925 .type = 1 << BCH_EXTENT_ENTRY_crc64,
1928 .csum_lo = new.csum.lo,
1929 .csum_hi = *((__le16 *) &new.csum.hi),
1934 if (bch_crc_bytes[new.csum_type] <= 16 &&
1935 new.uncompressed_size <= CRC128_SIZE_MAX &&
1936 new.nonce <= CRC128_NONCE_MAX) {
1937 crc->crc128 = (struct bch_extent_crc128) {
1938 .type = 1 << BCH_EXTENT_ENTRY_crc128,
1945 #undef common_fields
1949 void bch2_extent_crc_append(struct bkey_i_extent *e,
1950 struct bch_extent_crc_unpacked new)
1952 struct bch_extent_crc_unpacked crc;
1953 const union bch_extent_entry *i;
1955 BUG_ON(new.compressed_size > new.uncompressed_size);
1956 BUG_ON(new.live_size != e->k.size);
1957 BUG_ON(!new.compressed_size || !new.uncompressed_size);
/*
 * Look up the last crc entry, so we can check if we need to add
 * another:
 */
extent_for_each_crc(extent_i_to_s(e), crc, i)
	;

if (!memcmp(&crc, &new, sizeof(crc)))
	return;
1969 bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
1970 __extent_entry_push(e);
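/*
 * Illustrative note: bch2_extent_crc_init() above packs the new crc into the
 * smallest entry that fits - a 4 byte checksum over a small, low-nonce extent
 * fits in bch_extent_crc32, while a 16 byte checksum, a large extent, or a
 * large nonce forces the 64 or 128 bit variants.
 */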
/*
 * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
1976 * Returns true if @k should be dropped entirely
1978 * For existing keys, only called when btree nodes are being rewritten, not when
1979 * they're merely being compacted/resorted in memory.
1981 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
1983 struct bkey_s_extent e;
1985 switch (k.k->type) {
1986 case KEY_TYPE_ERROR:
1989 case KEY_TYPE_DELETED:
1990 case KEY_TYPE_COOKIE:
1993 case KEY_TYPE_DISCARD:
1994 return bversion_zero(k.k->version);
1997 case BCH_EXTENT_CACHED:
1998 e = bkey_s_to_extent(k);
2000 bch2_extent_drop_stale(c, e);
2002 if (!bkey_val_u64s(e.k)) {
2003 if (bkey_extent_is_cached(e.k)) {
2004 k.k->type = KEY_TYPE_DISCARD;
2005 if (bversion_zero(k.k->version))
2008 k.k->type = KEY_TYPE_ERROR;
2013 case BCH_RESERVATION:
2020 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
2021 struct bkey_s_extent e,
2023 unsigned nr_desired_replicas)
2025 struct bch_extent_ptr *ptr;
2026 int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
2028 if (target && extra > 0)
2029 extent_for_each_ptr(e, ptr) {
2030 int n = bch2_extent_ptr_durability(c, ptr);
2032 if (n && n <= extra &&
2033 !dev_in_target(c->devs[ptr->dev], target)) {
2040 extent_for_each_ptr(e, ptr) {
2041 int n = bch2_extent_ptr_durability(c, ptr);
2043 if (n && n <= extra) {
2051 * This picks a non-stale pointer, preferably from a device other than @avoid.
2052 * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
2053 * other devices, it will still pick a pointer from avoid.
2055 void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
2056 struct bch_devs_mask *avoid,
2057 struct extent_pick_ptr *ret)
2059 struct bkey_s_c_extent e;
2061 switch (k.k->type) {
2062 case KEY_TYPE_DELETED:
2063 case KEY_TYPE_DISCARD:
2064 case KEY_TYPE_COOKIE:
2068 case KEY_TYPE_ERROR:
2069 ret->ca = ERR_PTR(-EIO);
2073 case BCH_EXTENT_CACHED:
2074 e = bkey_s_c_to_extent(k);
2077 extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
2079 if (!ret->ca && !bkey_extent_is_cached(e.k))
2080 ret->ca = ERR_PTR(-EIO);
2083 case BCH_RESERVATION:
2092 static enum merge_result bch2_extent_merge(struct bch_fs *c,
2094 struct bkey_i *l, struct bkey_i *r)
2096 struct bkey_s_extent el, er;
2097 union bch_extent_entry *en_l, *en_r;
2099 if (key_merging_disabled(c))
2100 return BCH_MERGE_NOMERGE;
2103 * Generic header checks
2104 * Assumes left and right are in order
2105 * Left and right must be exactly aligned
2108 if (l->k.u64s != r->k.u64s ||
2109 l->k.type != r->k.type ||
2110 bversion_cmp(l->k.version, r->k.version) ||
2111 bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
2112 return BCH_MERGE_NOMERGE;
2114 switch (l->k.type) {
2115 case KEY_TYPE_DELETED:
2116 case KEY_TYPE_DISCARD:
2117 case KEY_TYPE_ERROR:
2118 /* These types are mergeable, and no val to check */
2122 case BCH_EXTENT_CACHED:
2123 el = bkey_i_to_s_extent(l);
2124 er = bkey_i_to_s_extent(r);
2126 extent_for_each_entry(el, en_l) {
2127 struct bch_extent_ptr *lp, *rp;
2130 en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
2132 if ((extent_entry_type(en_l) !=
2133 extent_entry_type(en_r)) ||
2134 extent_entry_is_crc(en_l))
2135 return BCH_MERGE_NOMERGE;
2140 if (lp->offset + el.k->size != rp->offset ||
2141 lp->dev != rp->dev ||
2143 return BCH_MERGE_NOMERGE;
2145 /* We don't allow extents to straddle buckets: */
2146 ca = bch_dev_bkey_exists(c, lp->dev);
2148 if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
2149 return BCH_MERGE_NOMERGE;
2153 case BCH_RESERVATION: {
2154 struct bkey_i_reservation *li = bkey_i_to_reservation(l);
2155 struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
2157 if (li->v.generation != ri->v.generation ||
2158 li->v.nr_replicas != ri->v.nr_replicas)
2159 return BCH_MERGE_NOMERGE;
2163 return BCH_MERGE_NOMERGE;
2166 l->k.needs_whiteout |= r->k.needs_whiteout;
/*
 * Keys with no pointers aren't restricted to one bucket and could
 * exceed KEY_SIZE_MAX:
 */
2171 if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
2172 bch2_key_resize(&l->k, KEY_SIZE_MAX);
2173 bch2_cut_front(l->k.p, r);
2174 return BCH_MERGE_PARTIAL;
2177 bch2_key_resize(&l->k, l->k.size + r->k.size);
2179 return BCH_MERGE_MERGE;
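/*
 * Illustrative example: two extents l = [0, 8) and r = [8, 16) with identical
 * types and versions, whose pointers are physically contiguous on the same
 * device (and within one bucket), merge into a single [0, 16) key. If the
 * combined size would exceed KEY_SIZE_MAX, l is grown to KEY_SIZE_MAX and the
 * remainder is left in r (BCH_MERGE_PARTIAL).
 */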
2182 static void extent_i_save(struct btree *b, struct bkey_packed *dst,
2185 struct bkey_format *f = &b->format;
2186 struct bkey_i *dst_unpacked;
2188 BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
2191 * We don't want the bch2_verify_key_order() call in extent_save(),
2192 * because we may be out of order with deleted keys that are about to be
2193 * removed by extent_bset_insert()
2196 if ((dst_unpacked = packed_to_bkey(dst)))
2197 bkey_copy(dst_unpacked, src);
2199 BUG_ON(!bch2_bkey_pack(dst, src, f));
2202 static bool extent_merge_one_overlapping(struct btree_iter *iter,
2203 struct bpos new_pos,
2204 struct bset_tree *t,
2205 struct bkey_packed *k, struct bkey uk,
2206 bool check, bool could_pack)
2208 struct btree_iter_level *l = &iter->l[0];
2210 BUG_ON(!bkey_deleted(k));
2213 return !bkey_packed(k) || could_pack;
2216 extent_save(l->b, &l->iter, k, &uk);
2217 bch2_bset_fix_invalidated_key(l->b, t, k);
2218 bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
2219 k, k->u64s, k->u64s);
2224 static bool extent_merge_do_overlapping(struct btree_iter *iter,
2225 struct bkey *m, bool back_merge)
2227 struct btree_iter_level *l = &iter->l[0];
2228 struct btree *b = l->b;
2229 struct btree_node_iter *node_iter = &l->iter;
2230 struct bset_tree *t;
2231 struct bkey_packed *k;
2233 struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
2234 bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
2238 * @m is the new merged extent:
2240 * The merge took place in the last bset; we know there can't be any 0
2241 * size extents overlapping with m there because if so they would have
2242 * been between the two extents we merged.
2244 * But in the other bsets, we have to check for and fix such extents:
2247 for_each_bset(b, t) {
2248 if (t == bset_tree_last(b))
2252 * if we don't find this bset in the iterator we already got to
2253 * the end of that bset, so start searching from the end.
2255 k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
2257 if (k == btree_bkey_last(b, t))
2258 k = bch2_bkey_prev_all(b, t, k);
2264 * Back merge: 0 size extents will be before the key
2265 * that was just inserted (and thus the iterator
2266 * position) - walk backwards to find them
2270 (uk = bkey_unpack_key(b, k),
2271 bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
2272 k = bch2_bkey_prev_all(b, t, k)) {
2273 if (bkey_cmp(uk.p, m->p) >= 0)
2276 if (!extent_merge_one_overlapping(iter, new_pos,
2277 t, k, uk, check, could_pack))
2281 /* Front merge - walk forwards */
2283 k != btree_bkey_last(b, t) &&
2284 (uk = bkey_unpack_key(b, k),
2285 bkey_cmp(uk.p, m->p) < 0);
2288 bkey_start_pos(m)) <= 0)
2291 if (!extent_merge_one_overlapping(iter, new_pos,
2292 t, k, uk, check, could_pack))
2307 * When merging an extent that we're inserting into a btree node, the new merged
2308 * extent could overlap with an existing 0 size extent - if we don't fix that,
2309 * it'll break the btree node iterator so this code finds those 0 size extents
2310 * and shifts them out of the way.
2312 * Also unpacks and repacks.
2314 static bool bch2_extent_merge_inline(struct bch_fs *c,
2315 struct btree_iter *iter,
2316 struct bkey_packed *l,
2317 struct bkey_packed *r,
2320 struct btree *b = iter->l[0].b;
2321 struct btree_node_iter *node_iter = &iter->l[0].iter;
2322 const struct bkey_format *f = &b->format;
2323 struct bset_tree *t = bset_tree_last(b);
2324 struct bkey_packed *m;
/*
 * We need to save copies of both l and r, because we might get a
 * partial merge (which modifies both) and then fail to repack
 */
2334 bch2_bkey_unpack(b, &li.k, l);
2335 bch2_bkey_unpack(b, &ri.k, r);
2337 m = back_merge ? l : r;
2338 mi = back_merge ? &li.k : &ri.k;
2340 /* l & r should be in last bset: */
2341 EBUG_ON(bch2_bkey_to_bset(b, m) != t);
2343 switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
2344 case BCH_MERGE_NOMERGE:
2346 case BCH_MERGE_PARTIAL:
2347 if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
2350 if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
2353 extent_i_save(b, m, mi);
2354 bch2_bset_fix_invalidated_key(b, t, m);
2357 * Update iterator to reflect what we just inserted - otherwise,
2358 * the iter_fix() call is going to put us _before_ the key we
2359 * just partially merged with:
2362 bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
2364 bch2_btree_node_iter_fix(iter, b, node_iter,
2365 t, m, m->u64s, m->u64s);
2368 bkey_copy(packed_to_bkey(l), &li.k);
2370 bkey_copy(packed_to_bkey(r), &ri.k);
2372 case BCH_MERGE_MERGE:
2373 if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
2376 if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
2379 extent_i_save(b, m, &li.k);
2380 bch2_bset_fix_invalidated_key(b, t, m);
2382 bch2_btree_node_iter_fix(iter, b, node_iter,
2383 t, m, m->u64s, m->u64s);
2390 int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
2392 struct btree_iter iter;
2393 struct bpos end = pos;
2399 for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
2400 BTREE_ITER_SLOTS, k) {
2401 if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
2404 if (!bch2_extent_is_fully_allocated(k)) {
2409 bch2_btree_iter_unlock(&iter);
2414 const struct bkey_ops bch2_bkey_extent_ops = {
2415 .key_invalid = bch2_extent_invalid,
2416 .key_debugcheck = bch2_extent_debugcheck,
2417 .val_to_text = bch2_extent_to_text,
2418 .swab = bch2_ptr_swab,
2419 .key_normalize = bch2_ptr_normalize,
2420 .key_merge = bch2_extent_merge,