1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  *
4  * Code for managing the extent btree and dynamically updating the writeback
5  * dirty sector count.
6  */
7
8 #include "bcachefs.h"
9 #include "bkey_methods.h"
10 #include "btree_gc.h"
11 #include "btree_update.h"
12 #include "checksum.h"
13 #include "debug.h"
14 #include "dirent.h"
15 #include "error.h"
16 #include "extents.h"
17 #include "inode.h"
18 #include "journal.h"
19 #include "super-io.h"
20 #include "xattr.h"
21
22 #include <trace/events/bcachefs.h>
23
24 static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
25                                            struct bkey_i *, struct bkey_i *);
26
27 static void sort_key_next(struct btree_node_iter *iter,
28                           struct btree *b,
29                           struct btree_node_iter_set *i)
30 {
31         i->k += __btree_node_offset_to_key(b, i->k)->u64s;
32
33         if (i->k == i->end)
34                 *i = iter->data[--iter->used];
35 }
36
37 /*
38  * Returns true if l > r - unless l == r, in which case returns true if l is
39  * older than r.
40  *
41  * Necessary for bch2_key_sort_fix_overlapping() - if there are multiple keys that compare
42  * equal in different sets, we have to process them newest to oldest.
43  */
44 #define key_sort_cmp(h, l, r)                                           \
45 ({                                                                      \
46         bkey_cmp_packed(b,                                              \
47                         __btree_node_offset_to_key(b, (l).k),           \
48                         __btree_node_offset_to_key(b, (r).k))           \
49                                                                         \
50         ?: (l).k - (r).k;                                               \
51 })
52
53 static inline bool should_drop_next_key(struct btree_node_iter *iter,
54                                         struct btree *b)
55 {
56         struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
57         struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
58
59         if (bkey_whiteout(k))
60                 return true;
61
62         if (iter->used < 2)
63                 return false;
64
65         if (iter->used > 2 &&
66             key_sort_cmp(iter, r[0], r[1]) >= 0)
67                 r++;
68
69         /*
70          * key_sort_cmp() ensures that when keys compare equal the older key
71          * comes first; so if l->k compares equal to r->k then l->k is older and
72          * should be dropped.
73          */
74         return !bkey_cmp_packed(b,
75                                 __btree_node_offset_to_key(b, l->k),
76                                 __btree_node_offset_to_key(b, r->k));
77 }
78
79 struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
80                                                   struct btree *b,
81                                                   struct btree_node_iter *iter)
82 {
83         struct bkey_packed *out = dst->start;
84         struct btree_nr_keys nr;
85
86         memset(&nr, 0, sizeof(nr));
87
88         heap_resort(iter, key_sort_cmp);
89
90         while (!bch2_btree_node_iter_end(iter)) {
91                 if (!should_drop_next_key(iter, b)) {
92                         struct bkey_packed *k =
93                                 __btree_node_offset_to_key(b, iter->data->k);
94
95                         bkey_copy(out, k);
96                         btree_keys_account_key_add(&nr, 0, out);
97                         out = bkey_next(out);
98                 }
99
100                 sort_key_next(iter, b, iter->data);
101                 heap_sift_down(iter, 0, key_sort_cmp);
102         }
103
104         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
105         return nr;
106 }
107
108 /* Common among btree and extent ptrs */
109
110 const struct bch_extent_ptr *
111 bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
112 {
113         const struct bch_extent_ptr *ptr;
114
115         extent_for_each_ptr(e, ptr)
116                 if (ptr->dev == dev)
117                         return ptr;
118
119         return NULL;
120 }
121
122 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
123 {
124         const struct bch_extent_ptr *ptr;
125         unsigned nr_ptrs = 0;
126
127         extent_for_each_ptr(e, ptr)
128                 nr_ptrs++;
129
130         return nr_ptrs;
131 }
132
133 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
134 {
135         struct bkey_s_c_extent e;
136         const struct bch_extent_ptr *ptr;
137         unsigned nr_ptrs = 0;
138
139         switch (k.k->type) {
140         case BCH_EXTENT:
141         case BCH_EXTENT_CACHED:
142                 e = bkey_s_c_to_extent(k);
143
144                 extent_for_each_ptr(e, ptr)
145                         nr_ptrs += !ptr->cached;
146                 break;
147
148         case BCH_RESERVATION:
149                 nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
150                 break;
151         }
152
153         return nr_ptrs;
154 }
155
156 /* Doesn't clean up redundant crcs */
157 void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
158 {
159         EBUG_ON(ptr < &e.v->start->ptr ||
160                 ptr >= &extent_entry_last(e)->ptr);
161         EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
162         memmove_u64s_down(ptr, ptr + 1,
163                           (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
164         e.k->u64s -= sizeof(*ptr) / sizeof(u64);
165 }
166
167 void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
168 {
169         __bch2_extent_drop_ptr(e, ptr);
170         bch2_extent_drop_redundant_crcs(e);
171 }
172
173 void bch2_extent_drop_ptr_idx(struct bkey_s_extent e, unsigned idx)
174 {
175         struct bch_extent_ptr *ptr;
176         unsigned i = 0;
177
178         extent_for_each_ptr(e, ptr)
179                 if (i++ == idx)
180                         goto found;
181
182         BUG();
183 found:
184         bch2_extent_drop_ptr(e, ptr);
185 }
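/*
 * Editor's sketch (not compiled): how the helpers above compose to drop the
 * replica on a given device, if present. bch2_extent_drop_ptr() shifts the
 * remaining entries down, so iteration stops once a pointer has been dropped.
 * The helper name is hypothetical.
 */
#if 0
static void example_drop_dev_ptr(struct bkey_s_extent e, unsigned dev)
{
	struct bch_extent_ptr *ptr;

	extent_for_each_ptr(e, ptr)
		if (ptr->dev == dev) {
			bch2_extent_drop_ptr(e, ptr); /* also drops now-redundant crcs */
			break;
		}
}
#endif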
186
187 /* returns true if equal */
188 static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
189 {
190         return extent_crc_type(l) == extent_crc_type(r) &&
191                 !memcmp(l, r, extent_entry_bytes(to_entry(l)));
192 }
193
194 /* Increment pointers after @crc by crc's offset until the next crc entry: */
195 void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
196 {
197         union bch_extent_entry *entry;
198
199         extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
200                 if (!extent_entry_is_ptr(entry))
201                         return;
202
203                 entry->ptr.offset += crc_offset(crc);
204         }
205 }
206
207 /*
208  * We're writing another replica for this extent, so while we've got the data in
209  * memory we'll be computing a new checksum for the currently live data.
210  *
211  * If there are other replicas we aren't moving, and they are checksummed but
212  * not compressed, we can modify them to point to only the data that is
213  * currently live (so that readers won't have to bounce) while we've got the
214  * checksum we need:
215  *
216  * XXX: to guard against data being corrupted while in memory, instead of
217  * recomputing the checksum here, it would be better in the read path, instead
218  * of computing the checksum of the entire extent:
219  *
220  * | extent                              |
221  *
222  * to compute the checksums of the live and dead data separately
223  * | dead data || live data || dead data |
224  *
225  * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
226  * use crc_live here (that we verified was correct earlier)
227  *
228  * note: doesn't work with encryption
229  */
230 void bch2_extent_narrow_crcs(struct bkey_s_extent e)
231 {
232         union bch_extent_crc *crc;
233         bool have_wide = false, have_narrow = false;
234         struct bch_csum csum = { 0 };
235         unsigned csum_type = 0;
236
237         extent_for_each_crc(e, crc) {
238                 if (crc_compression_type(crc) ||
239                     bch2_csum_type_is_encryption(crc_csum_type(crc)))
240                         continue;
241
242                 if (crc_uncompressed_size(e.k, crc) != e.k->size) {
243                         have_wide = true;
244                 } else {
245                         have_narrow = true;
246                         csum = crc_csum(crc);
247                         csum_type = crc_csum_type(crc);
248                 }
249         }
250
251         if (!have_wide || !have_narrow)
252                 return;
253
254         extent_for_each_crc(e, crc) {
255                 if (crc_compression_type(crc))
256                         continue;
257
258                 if (crc_uncompressed_size(e.k, crc) != e.k->size) {
259                         switch (extent_crc_type(crc)) {
260                         case BCH_EXTENT_CRC_NONE:
261                                 BUG();
262                         case BCH_EXTENT_CRC32:
263                                 if (bch_crc_bytes[csum_type] > 4)
264                                         continue;
265
266                                 bch2_extent_crc_narrow_pointers(e, crc);
267                                 crc->crc32._compressed_size     = e.k->size - 1;
268                                 crc->crc32._uncompressed_size   = e.k->size - 1;
269                                 crc->crc32.offset               = 0;
270                                 crc->crc32.csum_type            = csum_type;
271                                 crc->crc32.csum                 = csum.lo;
272                                 break;
273                         case BCH_EXTENT_CRC64:
274                                 if (bch_crc_bytes[csum_type] > 10)
275                                         continue;
276
277                                 bch2_extent_crc_narrow_pointers(e, crc);
278                                 crc->crc64._compressed_size     = e.k->size - 1;
279                                 crc->crc64._uncompressed_size   = e.k->size - 1;
280                                 crc->crc64.offset               = 0;
281                                 crc->crc64.csum_type            = csum_type;
282                                 crc->crc64.csum_lo              = csum.lo;
283                                 crc->crc64.csum_hi              = csum.hi;
284                                 break;
285                         case BCH_EXTENT_CRC128:
286                                 if (bch_crc_bytes[csum_type] > 16)
287                                         continue;
288
289                                 bch2_extent_crc_narrow_pointers(e, crc);
290                                 crc->crc128._compressed_size    = e.k->size - 1;
291                                 crc->crc128._uncompressed_size  = e.k->size - 1;
292                                 crc->crc128.offset              = 0;
293                                 crc->crc128.csum_type           = csum_type;
294                                 crc->crc128.csum                = csum;
295                                 break;
296                         }
297                 }
298         }
299 }
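/*
 * Editor's sketch (not compiled) of the read-path idea in the XXX comment
 * above bch2_extent_narrow_crcs(): with a purely additive checksum, the
 * checksum of the whole extent is the sum of the checksums of its pieces, so
 * the original checksum can be verified while retaining only the checksum of
 * the live range. Real checksum types would need a proper combine operation;
 * all names here are hypothetical.
 */
#if 0
static u64 toy_sum(const u8 *p, size_t len)
{
	u64 sum = 0;

	while (len--)
		sum += *p++;
	return sum;
}

/* extent data laid out as [dead1][live][dead2]: */
static bool toy_verify_and_narrow(const u8 *data, size_t dead1, size_t live,
				  size_t dead2, u64 orig_sum, u64 *live_sum)
{
	u64 d1 = toy_sum(data, dead1);
	u64 l  = toy_sum(data + dead1, live);
	u64 d2 = toy_sum(data + dead1 + live, dead2);

	*live_sum = l;
	return d1 + l + d2 == orig_sum;
}
#endif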
300
301 void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
302 {
303         union bch_extent_entry *entry = e.v->start;
304         union bch_extent_crc *crc, *prev = NULL;
305
306         while (entry != extent_entry_last(e)) {
307                 union bch_extent_entry *next = extent_entry_next(entry);
308                 size_t crc_u64s = extent_entry_u64s(entry);
309
310                 if (!extent_entry_is_crc(entry))
311                         goto next;
312
313                 crc = entry_to_crc(entry);
314
315                 if (next == extent_entry_last(e)) {
316                         /* crc entry with no pointers after it: */
317                         goto drop;
318                 }
319
320                 if (extent_entry_is_crc(next)) {
321                         /* no pointers before next crc entry: */
322                         goto drop;
323                 }
324
325                 if (prev && crc_cmp(crc, prev)) {
326                         /* identical to previous crc entry: */
327                         goto drop;
328                 }
329
330                 if (!prev &&
331                     !crc_csum_type(crc) &&
332                     !crc_compression_type(crc)) {
333                         /* null crc entry: */
334                         bch2_extent_crc_narrow_pointers(e, crc);
335                         goto drop;
336                 }
337
338                 prev = crc;
339 next:
340                 entry = next;
341                 continue;
342 drop:
343                 memmove_u64s_down(crc, next,
344                                   (u64 *) extent_entry_last(e) - (u64 *) next);
345                 e.k->u64s -= crc_u64s;
346         }
347
348         EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
349 }
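/*
 * Editor's note, illustrating bch2_extent_drop_redundant_crcs() on a
 * hypothetical extent value laid out as:
 *
 *   [crc A][ptr 1][crc A][ptr 2][crc B]
 *
 * The second crc A is dropped because it is identical to the previous crc
 * entry (ptr 2 is then covered by the first crc A), and the trailing crc B is
 * dropped because no pointers follow it.
 */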
350
351 static bool should_drop_ptr(const struct bch_fs *c,
352                             struct bkey_s_c_extent e,
353                             const struct bch_extent_ptr *ptr)
354 {
355         return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
356 }
357
358 static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
359 {
360         struct bch_extent_ptr *ptr = &e.v->start->ptr;
361         bool dropped = false;
362
363         while ((ptr = extent_ptr_next(e, ptr)))
364                 if (should_drop_ptr(c, e.c, ptr)) {
365                         __bch2_extent_drop_ptr(e, ptr);
366                         dropped = true;
367                 } else
368                         ptr++;
369
370         if (dropped)
371                 bch2_extent_drop_redundant_crcs(e);
372 }
373
374 static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
375                               struct bkey_s k)
376 {
377         return bch2_extent_normalize(c, k);
378 }
379
380 static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
381 {
382         switch (k->type) {
383         case BCH_EXTENT:
384         case BCH_EXTENT_CACHED: {
385                 union bch_extent_entry *entry;
386                 u64 *d = (u64 *) bkeyp_val(f, k);
387                 unsigned i;
388
389                 for (i = 0; i < bkeyp_val_u64s(f, k); i++)
390                         d[i] = swab64(d[i]);
391
392                 for (entry = (union bch_extent_entry *) d;
393                      entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
394                      entry = extent_entry_next(entry)) {
395                         switch (extent_entry_type(entry)) {
396                         case BCH_EXTENT_ENTRY_crc32:
397                                 entry->crc32.csum = swab32(entry->crc32.csum);
398                                 break;
399                         case BCH_EXTENT_ENTRY_crc64:
400                                 entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
401                                 entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
402                                 break;
403                         case BCH_EXTENT_ENTRY_crc128:
404                                 entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
405                                 entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
406                                 break;
407                         case BCH_EXTENT_ENTRY_ptr:
408                                 break;
409                         }
410                 }
411                 break;
412         }
413         }
414 }
415
416 static const char *extent_ptr_invalid(const struct bch_fs *c,
417                                       struct bkey_s_c_extent e,
418                                       const struct bch_extent_ptr *ptr,
419                                       unsigned size_ondisk,
420                                       bool metadata)
421 {
422         const struct bch_extent_ptr *ptr2;
423         struct bch_dev *ca;
424
425         if (ptr->dev >= c->sb.nr_devices)
426                 return "pointer to invalid device";
427
428         ca = c->devs[ptr->dev];
429         if (!ca)
430                 return "pointer to invalid device";
431
432         extent_for_each_ptr(e, ptr2)
433                 if (ptr != ptr2 && ptr->dev == ptr2->dev)
434                         return "multiple pointers to same device";
435
436         if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
437                 return "offset past end of device";
438
439         if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
440                 return "offset before first bucket";
441
442         if ((ptr->offset & (ca->mi.bucket_size - 1)) +
443             size_ondisk > ca->mi.bucket_size)
444                 return "spans multiple buckets";
445
446         return NULL;
447 }
448
449 static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
450                                 size_t size, struct bkey_s_c_extent e)
451 {
452         char *out = buf, *end = buf + size;
453         const union bch_extent_entry *entry;
454         const union bch_extent_crc *crc;
455         const struct bch_extent_ptr *ptr;
456         struct bch_dev *ca;
457         bool first = true;
458
459 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
460
461         extent_for_each_entry(e, entry) {
462                 if (!first)
463                         p(" ");
464
465                 switch (__extent_entry_type(entry)) {
466                 case BCH_EXTENT_ENTRY_crc32:
467                 case BCH_EXTENT_ENTRY_crc64:
468                 case BCH_EXTENT_ENTRY_crc128:
469                         crc = entry_to_crc(entry);
470
471                         p("crc: c_size %u size %u offset %u csum %u compress %u",
472                           crc_compressed_size(e.k, crc),
473                           crc_uncompressed_size(e.k, crc),
474                           crc_offset(crc), crc_csum_type(crc),
475                           crc_compression_type(crc));
476                         break;
477                 case BCH_EXTENT_ENTRY_ptr:
478                         ptr = entry_to_ptr(entry);
479                         ca = c->devs[ptr->dev];
480
481                         p("ptr: %u:%llu gen %u%s", ptr->dev,
482                           (u64) ptr->offset, ptr->gen,
483                           ca && ptr_stale(ca, ptr)
484                           ? " stale" : "");
485                         break;
486                 default:
487                         p("(invalid extent entry %.16llx)", *((u64 *) entry));
488                         goto out;
489                 }
490
491                 first = false;
492         }
493 out:
494         if (bkey_extent_is_cached(e.k))
495                 p(" cached");
496 #undef p
497         return out - buf;
498 }
499
500 /* Btree ptrs */
501
502 static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
503                                          struct bkey_s_c k)
504 {
505         if (bkey_extent_is_cached(k.k))
506                 return "cached";
507
508         if (k.k->size)
509                 return "nonzero key size";
510
511         if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
512                 return "value too big";
513
514         switch (k.k->type) {
515         case BCH_EXTENT: {
516                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
517                 const union bch_extent_entry *entry;
518                 const struct bch_extent_ptr *ptr;
519                 const union bch_extent_crc *crc;
520                 const char *reason;
521
522                 extent_for_each_entry(e, entry)
523                         if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
524                                 return "invalid extent entry type";
525
526                 extent_for_each_ptr_crc(e, ptr, crc) {
527                         reason = extent_ptr_invalid(c, e, ptr,
528                                                     c->sb.btree_node_size,
529                                                     true);
530                         if (reason)
531                                 return reason;
532                 }
533
534                 if (crc)
535                         return "has crc field";
536
537                 return NULL;
538         }
539
540         default:
541                 return "invalid value type";
542         }
543 }
544
545 static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
546                                  struct bkey_s_c k)
547 {
548         struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
549         const struct bch_extent_ptr *ptr;
550         unsigned seq;
551         const char *err;
552         char buf[160];
553         struct bucket *g;
554         struct bch_dev *ca;
555         unsigned replicas = 0;
556         bool bad;
557
558         extent_for_each_ptr(e, ptr) {
559                 ca = c->devs[ptr->dev];
560                 g = PTR_BUCKET(ca, ptr);
561                 replicas++;
562
563                 err = "stale";
564                 if (ptr_stale(ca, ptr))
565                         goto err;
566
567                 do {
568                         seq = read_seqcount_begin(&c->gc_pos_lock);
569                         bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
570                                 g->mark.data_type != BUCKET_BTREE;
571                 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
572
573                 err = "inconsistent";
574                 if (bad)
575                         goto err;
576         }
577
578         if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
579                 bch2_bkey_val_to_text(c, btree_node_type(b),
580                                      buf, sizeof(buf), k);
581                 bch2_fs_bug(c,
582                         "btree key bad (replicas not marked in superblock):\n%s",
583                         buf);
584                 return;
585         }
586
587         return;
588 err:
589         bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
590         bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
591                       "gen %i last_gc %i mark %08x",
592                       err, buf, PTR_BUCKET_NR(ca, ptr),
593                       PTR_BUCKET(ca, ptr)->mark.gen,
594                       ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
595                       (unsigned) g->mark.counter);
596 }
597
598 static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
599                                   size_t size, struct bkey_s_c k)
600 {
601         char *out = buf, *end = buf + size;
602         const char *invalid;
603
604 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
605
606         if (bkey_extent_is_data(k.k))
607                 out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
608
609         invalid = bch2_btree_ptr_invalid(c, k);
610         if (invalid)
611                 p(" invalid: %s", invalid);
612 #undef p
613 }
614
615 struct extent_pick_ptr
616 bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
617 {
618         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
619         const union bch_extent_crc *crc;
620         const struct bch_extent_ptr *ptr;
621         struct extent_pick_ptr pick = { .ca = NULL };
622
623         extent_for_each_ptr_crc(e, ptr, crc) {
624                 struct bch_dev *ca = c->devs[ptr->dev];
625                 struct btree *root = btree_node_root(c, b);
626
627                 if (bch2_fs_inconsistent_on(crc, c,
628                                 "btree node pointer with crc at btree %u level %u/%u bucket %zu",
629                                 b->btree_id, b->level, root ? root->level : -1,
630                                 PTR_BUCKET_NR(ca, ptr)))
631                         break;
632
633                 if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
634                         continue;
635
636                 if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
637                         continue;
638
639                 if (!percpu_ref_tryget(&ca->io_ref))
640                         continue;
641
642                 if (pick.ca)
643                         percpu_ref_put(&pick.ca->io_ref);
644
645                 pick.ca         = ca;
646                 pick.ptr        = *ptr;
647         }
648
649         return pick;
650 }
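/*
 * Editor's sketch (not compiled) of the expected caller pattern for
 * bch2_btree_pick_ptr(): a successful pick holds a reference on
 * pick.ca->io_ref, which the caller must drop once its IO against pick.ptr
 * has completed. The read helper named here is hypothetical.
 */
#if 0
	struct extent_pick_ptr pick = bch2_btree_pick_ptr(c, b);

	if (!pick.ca)
		return -EIO;	/* no usable replica */

	ret = example_read_btree_node(c, pick.ca, &pick.ptr, b);
	percpu_ref_put(&pick.ca->io_ref);
#endif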
651
652 const struct bkey_ops bch2_bkey_btree_ops = {
653         .key_invalid    = bch2_btree_ptr_invalid,
654         .key_debugcheck = btree_ptr_debugcheck,
655         .val_to_text    = bch2_btree_ptr_to_text,
656         .swab           = bch2_ptr_swab,
657 };
658
659 /* Extents */
660
661 static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
662 {
663         u64 len = 0;
664
665         if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
666                 return false;
667
668         EBUG_ON(bkey_cmp(where, k.k->p) > 0);
669
670         len = k.k->p.offset - where.offset;
671
672         BUG_ON(len > k.k->size);
673
674         /*
675          * Don't readjust offset if the key size is now 0, because that could
676          * cause offset to point to the next bucket:
677          */
678         if (!len)
679                 __set_bkey_deleted(k.k);
680         else if (bkey_extent_is_data(k.k)) {
681                 struct bkey_s_extent e = bkey_s_to_extent(k);
682                 struct bch_extent_ptr *ptr;
683                 union bch_extent_crc *crc, *prev_crc = NULL;
684
685                 extent_for_each_ptr_crc(e, ptr, crc) {
686                         switch (extent_crc_type(crc)) {
687                         case BCH_EXTENT_CRC_NONE:
688                                 ptr->offset += e.k->size - len;
689                                 break;
690                         case BCH_EXTENT_CRC32:
691                                 if (prev_crc != crc)
692                                         crc->crc32.offset += e.k->size - len;
693                                 break;
694                         case BCH_EXTENT_CRC64:
695                                 if (prev_crc != crc)
696                                         crc->crc64.offset += e.k->size - len;
697                                 break;
698                         case BCH_EXTENT_CRC128:
699                                 if (prev_crc != crc)
700                                         crc->crc128.offset += e.k->size - len;
701                                 break;
702                         }
703                         prev_crc = crc;
704                 }
705         }
706
707         k.k->size = len;
708
709         return true;
710 }
711
712 bool bch2_cut_front(struct bpos where, struct bkey_i *k)
713 {
714         return __bch2_cut_front(where, bkey_i_to_s(k));
715 }
716
717 bool bch2_cut_back(struct bpos where, struct bkey *k)
718 {
719         u64 len = 0;
720
721         if (bkey_cmp(where, k->p) >= 0)
722                 return false;
723
724         EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
725
726         len = where.offset - bkey_start_offset(k);
727
728         BUG_ON(len > k->size);
729
730         k->p = where;
731         k->size = len;
732
733         if (!len)
734                 __set_bkey_deleted(k);
735
736         return true;
737 }
738
739 /**
740  * bch2_key_resize - adjust size of @k
741  *
742  * bkey_start_offset(k) will be preserved, modifies where the extent ends
743  */
744 void bch2_key_resize(struct bkey *k,
745                     unsigned new_size)
746 {
747         k->p.offset -= k->size;
748         k->p.offset += new_size;
749         k->size = new_size;
750 }
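/*
 * Editor's sketch (not compiled): trimming a copy of an extent down to the
 * subrange [start, end) with the helpers above. bch2_cut_front() also adjusts
 * pointer/crc offsets so the value still refers to the same on-disk data.
 * @k, @start and @end are hypothetical.
 */
#if 0
	BKEY_PADDED(k) tmp;

	bkey_reassemble(&tmp.k, k);		/* k: struct bkey_s_c */
	bch2_cut_front(start, &tmp.k);		/* drop sectors before @start */
	bch2_cut_back(end, &tmp.k.k);		/* drop sectors from @end on */
#endif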
751
752 /*
753  * In extent_sort_fix_overlapping(), insert_fixup_extent(),
754  * extent_merge_inline() - we're modifying keys in place that are packed. To do
755  * that we have to unpack the key, modify the unpacked key - then this
756  * copies/repacks the unpacked to the original as necessary.
757  */
758 static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
759                           struct bkey_packed *dst, struct bkey *src)
760 {
761         struct bkey_format *f = &b->format;
762         struct bkey_i *dst_unpacked;
763         bool ret;
764
765         if ((dst_unpacked = packed_to_bkey(dst))) {
766                 dst_unpacked->k = *src;
767                 ret = true;
768         } else {
769                 ret = bch2_bkey_pack_key(dst, src, f);
770         }
771
772         if (ret && iter)
773                 bch2_verify_key_order(b, iter, dst);
774
775         return ret;
776 }
777
778 static void extent_save(struct btree *b, struct btree_node_iter *iter,
779                         struct bkey_packed *dst, struct bkey *src)
780 {
781         BUG_ON(!__extent_save(b, iter, dst, src));
782 }
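/*
 * Editor's sketch (not compiled) of the unpack/modify/repack pattern described
 * above, as used by bch2_extent_sort_fix_overlapping() below; @packed_k and
 * @new_start are hypothetical:
 */
#if 0
	struct bkey uk;
	struct bkey_s u = __bkey_disassemble(b, packed_k, &uk);

	__bch2_cut_front(new_start, u);		/* modify the unpacked key */
	extent_save(b, NULL, packed_k, u.k);	/* repack it back in place */
#endif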
783
784 /*
785  * If keys compare equal, compare by pointer order:
786  *
787  * Necessary for sort_fix_overlapping() - if there are multiple keys that
788  * compare equal in different sets, we have to process them newest to oldest.
789  */
790 #define extent_sort_cmp(h, l, r)                                        \
791 ({                                                                      \
792         struct bkey _ul = bkey_unpack_key(b,                            \
793                                 __btree_node_offset_to_key(b, (l).k));  \
794         struct bkey _ur = bkey_unpack_key(b,                            \
795                                 __btree_node_offset_to_key(b, (r).k));  \
796                                                                         \
797         bkey_cmp(bkey_start_pos(&_ul),                                  \
798                  bkey_start_pos(&_ur)) ?: (r).k - (l).k;                \
799 })
800
801 static inline void extent_sort_sift(struct btree_node_iter *iter,
802                                     struct btree *b, size_t i)
803 {
804         heap_sift_down(iter, i, extent_sort_cmp);
805 }
806
807 static inline void extent_sort_next(struct btree_node_iter *iter,
808                                     struct btree *b,
809                                     struct btree_node_iter_set *i)
810 {
811         sort_key_next(iter, b, i);
812         heap_sift_down(iter, i - iter->data, extent_sort_cmp);
813 }
814
815 static void extent_sort_append(struct bch_fs *c,
816                                struct btree *b,
817                                struct btree_nr_keys *nr,
818                                struct bkey_packed *start,
819                                struct bkey_packed **prev,
820                                struct bkey_packed *k)
821 {
822         struct bkey_format *f = &b->format;
823         BKEY_PADDED(k) tmp;
824
825         if (bkey_whiteout(k))
826                 return;
827
828         bch2_bkey_unpack(b, &tmp.k, k);
829
830         if (*prev &&
831             bch2_extent_merge(c, b, (void *) *prev, &tmp.k))
832                 return;
833
834         if (*prev) {
835                 bch2_bkey_pack(*prev, (void *) *prev, f);
836
837                 btree_keys_account_key_add(nr, 0, *prev);
838                 *prev = bkey_next(*prev);
839         } else {
840                 *prev = start;
841         }
842
843         bkey_copy(*prev, &tmp.k);
844 }
845
846 struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
847                                         struct bset *dst,
848                                         struct btree *b,
849                                         struct btree_node_iter *iter)
850 {
851         struct bkey_format *f = &b->format;
852         struct btree_node_iter_set *_l = iter->data, *_r;
853         struct bkey_packed *prev = NULL, *out, *lk, *rk;
854         struct bkey l_unpacked, r_unpacked;
855         struct bkey_s l, r;
856         struct btree_nr_keys nr;
857
858         memset(&nr, 0, sizeof(nr));
859
860         heap_resort(iter, extent_sort_cmp);
861
862         while (!bch2_btree_node_iter_end(iter)) {
863                 lk = __btree_node_offset_to_key(b, _l->k);
864
865                 if (iter->used == 1) {
866                         extent_sort_append(c, b, &nr, dst->start, &prev, lk);
867                         extent_sort_next(iter, b, _l);
868                         continue;
869                 }
870
871                 _r = iter->data + 1;
872                 if (iter->used > 2 &&
873                     extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
874                         _r++;
875
876                 rk = __btree_node_offset_to_key(b, _r->k);
877
878                 l = __bkey_disassemble(b, lk, &l_unpacked);
879                 r = __bkey_disassemble(b, rk, &r_unpacked);
880
881                 /* If current key and next key don't overlap, just append */
882                 if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
883                         extent_sort_append(c, b, &nr, dst->start, &prev, lk);
884                         extent_sort_next(iter, b, _l);
885                         continue;
886                 }
887
888                 /* Skip 0 size keys */
889                 if (!r.k->size) {
890                         extent_sort_next(iter, b, _r);
891                         continue;
892                 }
893
894                 /*
895                  * overlap: keep the newer key and trim the older key so they
896                  * don't overlap. comparing pointers tells us which one is
897                  * newer, since the bsets are appended one after the other.
898                  */
899
900                 /* can't happen because of comparison func */
901                 BUG_ON(_l->k < _r->k &&
902                        !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
903
904                 if (_l->k > _r->k) {
905                         /* l wins, trim r */
906                         if (bkey_cmp(l.k->p, r.k->p) >= 0) {
907                                 sort_key_next(iter, b, _r);
908                         } else {
909                                 __bch2_cut_front(l.k->p, r);
910                                 extent_save(b, NULL, rk, r.k);
911                         }
912
913                         extent_sort_sift(iter, b, _r - iter->data);
914                 } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
915                         BKEY_PADDED(k) tmp;
916
917                         /*
918                          * r wins, but it overlaps in the middle of l - split l:
919                          */
920                         bkey_reassemble(&tmp.k, l.s_c);
921                         bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
922
923                         __bch2_cut_front(r.k->p, l);
924                         extent_save(b, NULL, lk, l.k);
925
926                         extent_sort_sift(iter, b, 0);
927
928                         extent_sort_append(c, b, &nr, dst->start, &prev,
929                                            bkey_to_packed(&tmp.k));
930                 } else {
931                         bch2_cut_back(bkey_start_pos(r.k), l.k);
932                         extent_save(b, NULL, lk, l.k);
933                 }
934         }
935
936         if (prev) {
937                 bch2_bkey_pack(prev, (void *) prev, f);
938                 btree_keys_account_key_add(&nr, 0, prev);
939                 out = bkey_next(prev);
940         } else {
941                 out = dst->start;
942         }
943
944         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
945         return nr;
946 }
947
948 struct extent_insert_state {
949         struct btree_insert             *trans;
950         struct btree_insert_entry       *insert;
951         struct bpos                     committed;
952         struct bch_fs_usage             stats;
953
954         /* for deleting: */
955         struct bkey_i                   whiteout;
956         bool                            do_journal;
957         bool                            deleting;
958 };
959
960 static void bch2_add_sectors(struct extent_insert_state *s,
961                             struct bkey_s_c k, u64 offset, s64 sectors)
962 {
963         struct bch_fs *c = s->trans->c;
964         struct btree *b = s->insert->iter->nodes[0];
965
966         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
967
968         if (!sectors)
969                 return;
970
971         bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
972                      &s->stats, s->trans->journal_res.seq);
973 }
974
975 static void bch2_subtract_sectors(struct extent_insert_state *s,
976                                  struct bkey_s_c k, u64 offset, s64 sectors)
977 {
978         bch2_add_sectors(s, k, offset, -sectors);
979 }
980
981 /* These wrappers subtract exactly the sectors that we're removing from @k */
982 static void bch2_cut_subtract_back(struct extent_insert_state *s,
983                                   struct bpos where, struct bkey_s k)
984 {
985         bch2_subtract_sectors(s, k.s_c, where.offset,
986                              k.k->p.offset - where.offset);
987         bch2_cut_back(where, k.k);
988 }
989
990 static void bch2_cut_subtract_front(struct extent_insert_state *s,
991                                    struct bpos where, struct bkey_s k)
992 {
993         bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
994                              where.offset - bkey_start_offset(k.k));
995         __bch2_cut_front(where, k);
996 }
997
998 static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
999 {
1000         if (k.k->size)
1001                 bch2_subtract_sectors(s, k.s_c,
1002                                      bkey_start_offset(k.k), k.k->size);
1003         k.k->size = 0;
1004         __set_bkey_deleted(k.k);
1005 }
1006
1007 static bool bch2_extent_merge_inline(struct bch_fs *,
1008                                      struct btree_iter *,
1009                                      struct bkey_packed *,
1010                                      struct bkey_packed *,
1011                                      bool);
1012
1013 #define MAX_LOCK_HOLD_TIME      (5 * NSEC_PER_MSEC)
1014
1015 static enum btree_insert_ret
1016 extent_insert_should_stop(struct extent_insert_state *s)
1017 {
1018         struct btree *b = s->insert->iter->nodes[0];
1019
1020         /*
1021          * Check if we have sufficient space in both the btree node and the
1022          * journal reservation:
1023          *
1024          * Each insert checks for room in the journal entry, but we check for
1025          * room in the btree node up-front. In the worst case, bkey_cmpxchg()
1026          * will insert two keys, and one iteration of this loop will insert one
1027          * key, so we need room for three keys.
1028          */
1029         if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
1030                 return BTREE_INSERT_BTREE_NODE_FULL;
1031         else if (!journal_res_insert_fits(s->trans, s->insert))
1032                 return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
1033         else
1034                 return BTREE_INSERT_OK;
1035 }
1036
1037 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
1038                                struct bkey_i *insert)
1039 {
1040         struct btree *b = iter->nodes[0];
1041         struct btree_node_iter *node_iter = &iter->node_iters[0];
1042         struct bset_tree *t = bset_tree_last(b);
1043         struct bkey_packed *where =
1044                 bch2_btree_node_iter_bset_pos(node_iter, b, t);
1045         struct bkey_packed *prev = bch2_bkey_prev(b, t, where);
1046         struct bkey_packed *next_live_key = where;
1047         unsigned clobber_u64s;
1048
1049         if (prev)
1050                 where = bkey_next(prev);
1051
1052         while (next_live_key != btree_bkey_last(b, t) &&
1053                bkey_deleted(next_live_key))
1054                 next_live_key = bkey_next(next_live_key);
1055
1056         /*
1057          * Everything between where and next_live_key is now deleted keys, and
1058          * is overwritten:
1059          */
1060         clobber_u64s = (u64 *) next_live_key - (u64 *) where;
1061
1062         if (prev &&
1063             bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
1064                 goto drop_deleted_keys;
1065
1066         if (next_live_key != btree_bkey_last(b, t) &&
1067             bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
1068                                     next_live_key, false))
1069                 goto drop_deleted_keys;
1070
1071         bch2_bset_insert(b, node_iter, where, insert, clobber_u64s);
1072         bch2_btree_node_iter_fix(iter, b, node_iter, t, where,
1073                                 clobber_u64s, where->u64s);
1074         return;
1075 drop_deleted_keys:
1076         bch2_bset_delete(b, where, clobber_u64s);
1077         bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
1078 }
1079
1080 static void extent_insert_committed(struct extent_insert_state *s)
1081 {
1082         struct bch_fs *c = s->trans->c;
1083         struct btree_iter *iter = s->insert->iter;
1084         struct bkey_i *insert = !s->deleting
1085                 ? s->insert->k
1086                 : &s->whiteout;
1087         BKEY_PADDED(k) split;
1088
1089         EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
1090         EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
1091
1092         if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
1093                 return;
1094
1095         if (s->deleting && !s->do_journal) {
1096                 bch2_cut_front(s->committed, insert);
1097                 goto done;
1098         }
1099
1100         EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
1101
1102         bkey_copy(&split.k, insert);
1103
1104         if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
1105             bkey_cmp(s->committed, insert->k.p) &&
1106             bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
1107                 /* XXX: possibly need to increase our reservation? */
1108                 bch2_cut_subtract_back(s, s->committed,
1109                                       bkey_i_to_s(&split.k));
1110                 bch2_cut_front(s->committed, insert);
1111                 bch2_add_sectors(s, bkey_i_to_s_c(insert),
1112                                 bkey_start_offset(&insert->k),
1113                                 insert->k.size);
1114         } else {
1115                 bch2_cut_back(s->committed, &split.k.k);
1116                 bch2_cut_front(s->committed, insert);
1117         }
1118
1119         if (debug_check_bkeys(c))
1120                 bch2_bkey_debugcheck(c, iter->nodes[iter->level],
1121                                 bkey_i_to_s_c(&split.k));
1122
1123         bch2_btree_journal_key(s->trans, iter, &split.k);
1124
1125         if (!s->deleting)
1126                 extent_bset_insert(c, iter, &split.k);
1127 done:
1128         bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
1129
1130         insert->k.needs_whiteout        = false;
1131         s->do_journal                   = false;
1132         s->trans->did_work              = true;
1133 }
1134
1135 static enum extent_insert_hook_ret
1136 __extent_insert_advance_pos(struct extent_insert_state *s,
1137                             struct bpos next_pos,
1138                             struct bkey_s_c k)
1139 {
1140         struct extent_insert_hook *hook = s->trans->hook;
1141         enum extent_insert_hook_ret ret;
1142 #if 0
1143         /*
1144          * Currently disabled for encryption - broken with fcollapse. Will have
1145          * to reenable when versions are exposed for send/receive - versions
1146          * will have to be monotonic then:
1147          */
1148         if (k.k && k.k->size &&
1149             !bversion_zero(s->insert->k->k.version) &&
1150             bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
1151                 ret = BTREE_HOOK_NO_INSERT;
1152         } else
1153 #endif
1154         if (hook)
1155                 ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
1156         else
1157                 ret = BTREE_HOOK_DO_INSERT;
1158
1159         EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
1160
1161         switch (ret) {
1162         case BTREE_HOOK_DO_INSERT:
1163                 break;
1164         case BTREE_HOOK_NO_INSERT:
1165                 extent_insert_committed(s);
1166                 bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
1167
1168                 bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
1169                 break;
1170         case BTREE_HOOK_RESTART_TRANS:
1171                 return ret;
1172         }
1173
1174         s->committed = next_pos;
1175         return ret;
1176 }
1177
1178 /*
1179  * Update iter->pos, marking how much of @insert we've processed, and call hook
1180  * fn:
1181  */
1182 static enum extent_insert_hook_ret
1183 extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
1184 {
1185         struct btree *b = s->insert->iter->nodes[0];
1186         struct bpos next_pos = bpos_min(s->insert->k->k.p,
1187                                         k.k ? k.k->p : b->key.k.p);
1188
1189         /* hole? */
1190         if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
1191                 bool have_uncommitted = bkey_cmp(s->committed,
1192                                 bkey_start_pos(&s->insert->k->k)) > 0;
1193
1194                 switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
1195                                                     bkey_s_c_null)) {
1196                 case BTREE_HOOK_DO_INSERT:
1197                         break;
1198                 case BTREE_HOOK_NO_INSERT:
1199                         /*
1200                          * we had to split @insert and insert the committed
1201                          * part - need to bail out and recheck journal
1202                          * reservation/btree node before we advance pos past @k:
1203                          */
1204                         if (have_uncommitted)
1205                                 return BTREE_HOOK_NO_INSERT;
1206                         break;
1207                 case BTREE_HOOK_RESTART_TRANS:
1208                         return BTREE_HOOK_RESTART_TRANS;
1209                 }
1210         }
1211
1212         /* avoid redundant calls to hook fn: */
1213         if (!bkey_cmp(s->committed, next_pos))
1214                 return BTREE_HOOK_DO_INSERT;
1215
1216         return __extent_insert_advance_pos(s, next_pos, k);
1217 }
1218
1219 static enum btree_insert_ret
1220 extent_insert_check_split_compressed(struct extent_insert_state *s,
1221                                      struct bkey_s_c k,
1222                                      enum bch_extent_overlap overlap)
1223 {
1224         struct bch_fs *c = s->trans->c;
1225         unsigned sectors;
1226
1227         if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
1228             (sectors = bkey_extent_is_compressed(k))) {
1229                 int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
1230
1231                 if (s->trans->flags & BTREE_INSERT_NOFAIL)
1232                         flags |= BCH_DISK_RESERVATION_NOFAIL;
1233
1234                 switch (bch2_disk_reservation_add(c,
1235                                 s->trans->disk_res,
1236                                 sectors, flags)) {
1237                 case 0:
1238                         break;
1239                 case -ENOSPC:
1240                         return BTREE_INSERT_ENOSPC;
1241                 case -EINTR:
1242                         return BTREE_INSERT_NEED_GC_LOCK;
1243                 default:
1244                         BUG();
1245                 }
1246         }
1247
1248         return BTREE_INSERT_OK;
1249 }
1250
1251 static enum btree_insert_ret
1252 extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
1253               struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
1254               enum bch_extent_overlap overlap)
1255 {
1256         struct bch_fs *c = s->trans->c;
1257         struct btree_iter *iter = s->insert->iter;
1258         struct btree *b = iter->nodes[0];
1259         struct btree_node_iter *node_iter = &iter->node_iters[0];
1260
1261         switch (overlap) {
1262         case BCH_EXTENT_OVERLAP_FRONT:
1263                 /* insert overlaps with start of k: */
1264                 bch2_cut_subtract_front(s, insert->k.p, k);
1265                 BUG_ON(bkey_deleted(k.k));
1266                 extent_save(b, node_iter, _k, k.k);
1267                 break;
1268
1269         case BCH_EXTENT_OVERLAP_BACK:
1270                 /* insert overlaps with end of k: */
1271                 bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
1272                 BUG_ON(bkey_deleted(k.k));
1273                 extent_save(b, node_iter, _k, k.k);
1274
1275                 /*
1276                  * As the auxiliary tree is indexed by the end of the
1277                  * key and we've just changed the end, update the
1278                  * auxiliary tree.
1279                  */
1280                 bch2_bset_fix_invalidated_key(b, t, _k);
1281                 bch2_btree_node_iter_fix(iter, b, node_iter, t,
1282                                         _k, _k->u64s, _k->u64s);
1283                 break;
1284
1285         case BCH_EXTENT_OVERLAP_ALL: {
1286                 struct bpos orig_pos = k.k->p;
1287
1288                 /* The insert key completely covers k, invalidate k */
1289                 if (!bkey_whiteout(k.k))
1290                         btree_keys_account_key_drop(&b->nr,
1291                                                 t - b->set, _k);
1292
1293                 bch2_drop_subtract(s, k);
1294                 k.k->p = bkey_start_pos(&insert->k);
1295                 if (!__extent_save(b, node_iter, _k, k.k)) {
1296                         /*
1297                          * Couldn't repack: we aren't necessarily able
1298                          * to repack if the new key is outside the range
1299                          * of the old extent, so we have to split
1300                          * @insert:
1301                          */
1302                         k.k->p = orig_pos;
1303                         extent_save(b, node_iter, _k, k.k);
1304
1305                         if (extent_insert_advance_pos(s, k.s_c) ==
1306                             BTREE_HOOK_RESTART_TRANS)
1307                                 return BTREE_INSERT_NEED_TRAVERSE;
1308
1309                         extent_insert_committed(s);
1310                         /*
1311                          * We split and inserted up to k.k->p - that
1312                          * has to coincide with iter->pos, so that we
1313                          * don't have anything more we have to insert
1314                          * until we recheck our journal reservation:
1315                          */
1316                         EBUG_ON(bkey_cmp(s->committed, k.k->p));
1317                 } else {
1318                         bch2_bset_fix_invalidated_key(b, t, _k);
1319                         bch2_btree_node_iter_fix(iter, b, node_iter, t,
1320                                                 _k, _k->u64s, _k->u64s);
1321                 }
1322
1323                 break;
1324         }
1325         case BCH_EXTENT_OVERLAP_MIDDLE: {
1326                 BKEY_PADDED(k) split;
1327                 /*
1328                  * The insert key falls 'in the middle' of k
1329                  * The insert key splits k in 3:
1330                  * - start only in k, preserve
1331                  * - middle common section, invalidate in k
1332                  * - end only in k, preserve
1333                  *
1334                  * We update the old key to preserve the start,
1335                  * insert will be the new common section,
1336                  * we manually insert the end that we are preserving.
1337                  *
1338                  * modify k _before_ doing the insert (which will move
1339                  * what k points to)
1340                  */
1341                 bkey_reassemble(&split.k, k.s_c);
1342                 split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
1343
1344                 bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
1345                 BUG_ON(bkey_deleted(&split.k.k));
1346
1347                 bch2_cut_subtract_front(s, insert->k.p, k);
1348                 BUG_ON(bkey_deleted(k.k));
1349                 extent_save(b, node_iter, _k, k.k);
1350
1351                 bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
1352                                 bkey_start_offset(&split.k.k),
1353                                 split.k.k.size);
1354                 extent_bset_insert(c, iter, &split.k);
1355                 break;
1356         }
1357         }
1358
1359         return BTREE_INSERT_OK;
1360 }
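/*
 * Editor's summary of the overlap cases handled by extent_squash(), for an
 * existing key k and the key being inserted:
 *
 *  FRONT:  insert covers the start of k - k's front is trimmed so it starts
 *          at insert->k.p
 *  BACK:   insert covers the end of k - k's back is trimmed so it ends at
 *          bkey_start_pos(&insert->k)
 *  ALL:    insert covers k entirely - k is dropped
 *  MIDDLE: insert lands in the middle of k - the copy updated in place keeps
 *          the back of k, and the front of k is re-inserted as a new key
 */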
1361
1362 static enum btree_insert_ret
1363 bch2_delete_fixup_extent(struct extent_insert_state *s)
1364 {
1365         struct bch_fs *c = s->trans->c;
1366         struct btree_iter *iter = s->insert->iter;
1367         struct btree *b = iter->nodes[0];
1368         struct btree_node_iter *node_iter = &iter->node_iters[0];
1369         struct bkey_packed *_k;
1370         struct bkey unpacked;
1371         struct bkey_i *insert = s->insert->k;
1372         enum btree_insert_ret ret = BTREE_INSERT_OK;
1373
1374         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
1375
1376         s->whiteout     = *insert;
1377         s->do_journal   = false;
1378
1379         while (bkey_cmp(s->committed, insert->k.p) < 0 &&
1380                (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
1381                (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
1382                 struct bset_tree *t = bch2_bkey_to_bset(b, _k);
1383                 struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
1384                 enum bch_extent_overlap overlap;
1385
1386                 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
1387                 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1388
1389                 if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
1390                         break;
1391
1392                 if (bkey_whiteout(k.k)) {
1393                         s->committed = bpos_min(insert->k.p, k.k->p);
1394                         goto next;
1395                 }
1396
1397                 overlap = bch2_extent_overlap(&insert->k, k.k);
1398
1399                 ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
1400                 if (ret != BTREE_INSERT_OK)
1401                         goto stop;
1402
1403                 switch (extent_insert_advance_pos(s, k.s_c)) {
1404                 case BTREE_HOOK_DO_INSERT:
1405                         break;
1406                 case BTREE_HOOK_NO_INSERT:
1407                         continue;
1408                 case BTREE_HOOK_RESTART_TRANS:
1409                         ret = BTREE_INSERT_NEED_TRAVERSE;
1410                         goto stop;
1411                 }
1412
1413                 s->do_journal = true;
1414
1415                 if (overlap == BCH_EXTENT_OVERLAP_ALL) {
1416                         btree_keys_account_key_drop(&b->nr,
1417                                                 t - b->set, _k);
1418                         bch2_subtract_sectors(s, k.s_c,
1419                                              bkey_start_offset(k.k), k.k->size);
1420                         _k->type = KEY_TYPE_DISCARD;
1421                         reserve_whiteout(b, t, _k);
1422                 } else if (k.k->needs_whiteout ||
1423                            bset_written(b, bset(b, t))) {
1424                         struct bkey_i discard = *insert;
1425
1426                         switch (overlap) {
1427                         case BCH_EXTENT_OVERLAP_FRONT:
1428                                 bch2_cut_front(bkey_start_pos(k.k), &discard);
1429                                 break;
1430                         case BCH_EXTENT_OVERLAP_BACK:
1431                                 bch2_cut_back(k.k->p, &discard.k);
1432                                 break;
1433                         default:
1434                                 break;
1435                         }
1436
1437                         discard.k.needs_whiteout = true;
1438
1439                         ret = extent_squash(s, insert, t, _k, k, overlap);
1440                         BUG_ON(ret != BTREE_INSERT_OK);
1441
1442                         extent_bset_insert(c, iter, &discard);
1443                 } else {
1444                         ret = extent_squash(s, insert, t, _k, k, overlap);
1445                         BUG_ON(ret != BTREE_INSERT_OK);
1446                 }
1447 next:
1448                 bch2_cut_front(s->committed, insert);
1449                 bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
1450         }
1451
1452         if (bkey_cmp(s->committed, insert->k.p) < 0 &&
1453             ret == BTREE_INSERT_OK &&
1454             extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
1455                 ret = BTREE_INSERT_NEED_TRAVERSE;
1456 stop:
1457         extent_insert_committed(s);
1458
1459         bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
1460                            gc_pos_btree_node(b));
1461
1462         EBUG_ON(bkey_cmp(iter->pos, s->committed));
1463         EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
1464                 !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
1465
1466         bch2_cut_front(iter->pos, insert);
1467
1468         if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
1469                 ret = BTREE_INSERT_NEED_TRAVERSE;
1470
1471         EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
1472
1473         return ret;
1474 }
1475
1476 /**
1477  * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
1478  *
1479  * this may result in not actually doing the insert, or inserting some subset
1480  * of the insert key. For cmpxchg operations this is where that logic lives.
1481  *
1482  * All subsets of @insert that need to be inserted are inserted using
1483  * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
1484  * returns false, setting @iter->pos for the prefix of @insert that actually got
1485  * inserted.
1486  *
1487  * BSET INVARIANTS: this function is responsible for maintaining all the
1488  * invariants for bsets of extents in memory. Things get really hairy with 0
1489  * size extents.
1490  *
1491  * within one bset:
1492  *
1493  * bkey_start_pos(bkey_next(k)) >= k
1494  * or bkey_start_offset(bkey_next(k)) >= k->offset
1495  *
1496  * i.e. strict ordering, no overlapping extents.
1497  *
1498  * multiple bsets (i.e. full btree node):
1499  *
1500  * ∀ k, j
1501  *   k.size != 0 ∧ j.size != 0 →
1502  *     ¬ (k > bkey_start_pos(j) ∧ k < j)
1503  *
1504  * i.e. no two overlapping keys _of nonzero size_
1505  *
1506  * We can't realistically maintain this invariant for zero size keys because of
1507  * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
1508  * there may be another 0 size key between them in another bset, and it will
1509  * thus overlap with the merged key.
1510  *
1511  * In addition, the end of iter->pos indicates how much has been processed.
1512  * If the end of iter->pos is not the same as the end of insert, then
1513  * key insertion needs to continue/be retried.
1514  */
1515 enum btree_insert_ret
1516 bch2_insert_fixup_extent(struct btree_insert *trans,
1517                          struct btree_insert_entry *insert)
1518 {
1519         struct bch_fs *c = trans->c;
1520         struct btree_iter *iter = insert->iter;
1521         struct btree *b = iter->nodes[0];
1522         struct btree_node_iter *node_iter = &iter->node_iters[0];
1523         struct bkey_packed *_k;
1524         struct bkey unpacked;
1525         enum btree_insert_ret ret = BTREE_INSERT_OK;
1526
1527         struct extent_insert_state s = {
1528                 .trans          = trans,
1529                 .insert         = insert,
1530                 .committed      = insert->iter->pos,
1531                 .deleting       = bkey_whiteout(&insert->k->k),
1532         };
1533
1534         EBUG_ON(iter->level);
1535         EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
1536
1537         if (s.deleting)
1538                 return bch2_delete_fixup_extent(&s);
1539
1540         /*
1541          * As we process overlapping extents, we advance @iter->pos both to
1542          * signal to our caller (btree_insert_key()) how much of @insert->k has
1543          * been inserted, and also to keep @iter->pos consistent with
1544          * @insert->k and the node iterator that we're advancing:
1545          */
1546         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1547
1548         if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
1549                 bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
1550                                 bkey_start_offset(&insert->k->k),
1551                                 insert->k->k.size);
1552
1553         while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
1554                (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
1555                (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
1556                 struct bset_tree *t = bch2_bkey_to_bset(b, _k);
1557                 struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
1558                 enum bch_extent_overlap overlap;
1559
1560                 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1561                 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1562
1563                 if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
1564                         break;
1565
1566                 overlap = bch2_extent_overlap(&insert->k->k, k.k);
1567
1568                 ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
1569                 if (ret != BTREE_INSERT_OK)
1570                         goto stop;
1571
1572                 if (!k.k->size)
1573                         goto squash;
1574
1575                 /*
1576                  * Only advance the pos and call the hook for nonzero size extents:
1577                  * if the hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
1578                  * overlaps with @k:
1579                  */
1580                 switch (extent_insert_advance_pos(&s, k.s_c)) {
1581                 case BTREE_HOOK_DO_INSERT:
1582                         break;
1583                 case BTREE_HOOK_NO_INSERT:
1584                         continue;
1585                 case BTREE_HOOK_RESTART_TRANS:
1586                         ret = BTREE_INSERT_NEED_TRAVERSE;
1587                         goto stop;
1588                 }
1589
1590                 if (k.k->size &&
1591                     (k.k->needs_whiteout || bset_written(b, bset(b, t))))
1592                         insert->k->k.needs_whiteout = true;
1593
1594                 if (overlap == BCH_EXTENT_OVERLAP_ALL &&
1595                     bkey_whiteout(k.k) &&
1596                     k.k->needs_whiteout) {
1597                         unreserve_whiteout(b, t, _k);
1598                         _k->needs_whiteout = false;
1599                 }
1600 squash:
1601                 ret = extent_squash(&s, insert->k, t, _k, k, overlap);
1602                 if (ret != BTREE_INSERT_OK)
1603                         goto stop;
1604         }
1605
1606         if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
1607             ret == BTREE_INSERT_OK &&
1608             extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
1609                 ret = BTREE_INSERT_NEED_TRAVERSE;
1610 stop:
1611         extent_insert_committed(&s);
1612         /*
1613          * Subtract any remaining sectors from @insert, if we bailed out early
1614          * and didn't fully insert @insert:
1615          */
1616         if (insert->k->k.size &&
1617             !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
1618                 bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
1619                                      bkey_start_offset(&insert->k->k),
1620                                      insert->k->k.size);
1621
1622         bch2_fs_usage_apply(c, &s.stats, trans->disk_res,
1623                            gc_pos_btree_node(b));
1624
1625         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1626         EBUG_ON(bkey_cmp(iter->pos, s.committed));
1627         EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
1628                 !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
1629
1630         if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
1631                 ret = BTREE_INSERT_NEED_TRAVERSE;
1632
1633         EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
1634
1635         return ret;
1636 }
1637
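/*
 * A minimal, self-contained sketch of the nonzero-size overlap invariant
 * documented above bch2_insert_fixup_extent(): sketch_extents_overlap() is
 * hypothetical (not part of bcachefs) and works on plain sector offsets
 * rather than struct bpos. Two nonzero size extents, each spanning
 * [start, end), violate the invariant iff each one starts before the other
 * ends:
 */
static inline bool sketch_extents_overlap(u64 k_start, u64 k_end,
                                          u64 j_start, u64 j_end)
{
        /* e.g. [0, 8) and [8, 16) don't overlap; [0, 8) and [4, 12) do */
        return k_start < j_end && j_start < k_end;
}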
1638 static const char *bch2_extent_invalid(const struct bch_fs *c,
1639                                        struct bkey_s_c k)
1640 {
1641         if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
1642                 return "value too big";
1643
1644         if (!k.k->size)
1645                 return "zero key size";
1646
1647         switch (k.k->type) {
1648         case BCH_EXTENT:
1649         case BCH_EXTENT_CACHED: {
1650                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
1651                 const union bch_extent_entry *entry;
1652                 const union bch_extent_crc *crc;
1653                 const struct bch_extent_ptr *ptr;
1654                 unsigned size_ondisk = e.k->size;
1655                 const char *reason;
1656
1657                 extent_for_each_entry(e, entry) {
1658                         if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
1659                                 return "invalid extent entry type";
1660
1661                         if (extent_entry_is_crc(entry)) {
1662                                 crc = entry_to_crc(entry);
1663
1664                                 if (crc_offset(crc) + e.k->size >
1665                                     crc_uncompressed_size(e.k, crc))
1666                                         return "checksum offset + key size > uncompressed size";
1667
1668                                 size_ondisk = crc_compressed_size(e.k, crc);
1669
1670                                 if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
1671                                         return "invalid checksum type";
1672
1673                                 if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
1674                                         return "invalid compression type";
1675                         } else {
1676                                 ptr = entry_to_ptr(entry);
1677
1678                                 reason = extent_ptr_invalid(c, e, &entry->ptr,
1679                                                             size_ondisk, false);
1680                                 if (reason)
1681                                         return reason;
1682                         }
1683                 }
1684
1685                 return NULL;
1686         }
1687
1688         case BCH_RESERVATION: {
1689                 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
1690
1691                 if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
1692                         return "incorrect value size";
1693
1694                 if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
1695                         return "invalid nr_replicas";
1696
1697                 return NULL;
1698         }
1699
1700         default:
1701                 return "invalid value type";
1702         }
1703 }
1704
1705 static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
1706                                           struct bkey_s_c_extent e)
1707 {
1708         const struct bch_extent_ptr *ptr;
1709         struct bch_dev *ca;
1710         struct bucket *g;
1711         unsigned seq, stale;
1712         char buf[160];
1713         bool bad;
1714         unsigned ptrs_per_tier[BCH_TIER_MAX];
1715         unsigned replicas = 0;
1716
1717         /*
1718          * XXX: we should be doing most/all of these checks at startup time,
1719          * where we check bch2_bkey_invalid() in btree_node_read_done()
1720          *
1721          * But note that we can't check for stale pointers or incorrect gc marks
1722          * until after journal replay is done (it might be an extent that's
1723          * going to get overwritten during replay)
1724          */
1725
1726         memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
1727
1728         extent_for_each_ptr(e, ptr) {
1729                 ca = c->devs[ptr->dev];
1730                 g = PTR_BUCKET(ca, ptr);
1731                 replicas++;
1732                 ptrs_per_tier[ca->mi.tier]++;
1733
1734                 /*
1735                  * If journal replay hasn't finished, we might be seeing keys
1736                  * that will be overwritten by the time journal replay is done:
1737                  */
1738                 if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
1739                         continue;
1740
1741                 stale = 0;
1742
1743                 do {
1744                         struct bucket_mark mark;
1745
1746                         seq = read_seqcount_begin(&c->gc_pos_lock);
1747                         mark = READ_ONCE(g->mark);
1748
1749                         /* between mark and bucket gen */
1750                         smp_rmb();
1751
1752                         stale = ptr_stale(ca, ptr);
1753
1754                         bch2_fs_bug_on(stale && !ptr->cached, c,
1755                                          "stale dirty pointer");
1756
1757                         bch2_fs_bug_on(stale > 96, c,
1758                                          "key too stale: %i",
1759                                          stale);
1760
1761                         if (stale)
1762                                 break;
1763
1764                         bad = (mark.data_type != BUCKET_DATA ||
1765                                (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
1766                                 !mark.owned_by_allocator &&
1767                                 !(ptr->cached
1768                                   ? mark.cached_sectors
1769                                   : mark.dirty_sectors)));
1770                 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
1771
1772                 if (bad)
1773                         goto bad_ptr;
1774         }
1775
1776         if (replicas > BCH_REPLICAS_MAX) {
1777                 bch2_bkey_val_to_text(c, btree_node_type(b), buf,
1778                                      sizeof(buf), e.s_c);
1779                 bch2_fs_bug(c,
1780                         "extent key bad (too many replicas: %u): %s",
1781                         replicas, buf);
1782                 return;
1783         }
1784
1785         if (!bkey_extent_is_cached(e.k) &&
1786             !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
1787                 bch2_bkey_val_to_text(c, btree_node_type(b),
1788                                      buf, sizeof(buf), e.s_c);
1789                 bch2_fs_bug(c,
1790                         "extent key bad (replicas not marked in superblock):\n%s",
1791                         buf);
1792                 return;
1793         }
1794
1795         return;
1796
1797 bad_ptr:
1798         bch2_bkey_val_to_text(c, btree_node_type(b), buf,
1799                              sizeof(buf), e.s_c);
1800         bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
1801                    "gen %i last_gc %i mark 0x%08x",
1802                    buf, PTR_BUCKET_NR(ca, ptr), PTR_BUCKET(ca, ptr)->mark.gen,
1803                    ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
1804                    (unsigned) g->mark.counter);
1805         return;
1806 }
1807
1808 static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
1809                                    struct bkey_s_c k)
1810 {
1811         switch (k.k->type) {
1812         case BCH_EXTENT:
1813         case BCH_EXTENT_CACHED:
1814                 bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
1815                 break;
1816         case BCH_RESERVATION:
1817                 break;
1818         default:
1819                 BUG();
1820         }
1821 }
1822
1823 static void bch2_extent_to_text(struct bch_fs *c, char *buf,
1824                                 size_t size, struct bkey_s_c k)
1825 {
1826         char *out = buf, *end = buf + size;
1827         const char *invalid;
1828
1829 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
1830
1831         if (bkey_extent_is_data(k.k))
1832                 out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
1833
1834         invalid = bch2_extent_invalid(c, k);
1835         if (invalid)
1836                 p(" invalid: %s", invalid);
1837 #undef p
1838 }
1839
1840 static unsigned PTR_TIER(struct bch_fs *c,
1841                          const struct bch_extent_ptr *ptr)
1842 {
1843         return c->devs[ptr->dev]->mi.tier;
1844 }
1845
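/*
 * Roughly: pack the checksum/compression metadata into the smallest crc entry
 * variant that can represent it, trying crc32, then crc64, then crc128, and
 * BUG()ing if even crc128 can't hold it (which callers are expected never to
 * trigger).
 */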
1846 static void bch2_extent_crc_init(union bch_extent_crc *crc,
1847                                  unsigned compressed_size,
1848                                  unsigned uncompressed_size,
1849                                  unsigned compression_type,
1850                                  unsigned nonce,
1851                                  struct bch_csum csum, unsigned csum_type)
1852 {
1853         if (bch_crc_bytes[csum_type]    <= 4 &&
1854             uncompressed_size           <= CRC32_SIZE_MAX &&
1855             nonce                       <= CRC32_NONCE_MAX) {
1856                 crc->crc32 = (struct bch_extent_crc32) {
1857                         .type = 1 << BCH_EXTENT_ENTRY_crc32,
1858                         ._compressed_size       = compressed_size - 1,
1859                         ._uncompressed_size     = uncompressed_size - 1,
1860                         .offset                 = 0,
1861                         .compression_type       = compression_type,
1862                         .csum_type              = csum_type,
1863                         .csum                   = *((__le32 *) &csum.lo),
1864                 };
1865                 return;
1866         }
1867
1868         if (bch_crc_bytes[csum_type]    <= 10 &&
1869             uncompressed_size           <= CRC64_SIZE_MAX &&
1870             nonce                       <= CRC64_NONCE_MAX) {
1871                 crc->crc64 = (struct bch_extent_crc64) {
1872                         .type = 1 << BCH_EXTENT_ENTRY_crc64,
1873                         ._compressed_size       = compressed_size - 1,
1874                         ._uncompressed_size     = uncompressed_size - 1,
1875                         .offset                 = 0,
1876                         .nonce                  = nonce,
1877                         .compression_type       = compression_type,
1878                         .csum_type              = csum_type,
1879                         .csum_lo                = csum.lo,
1880                         .csum_hi                = *((__le16 *) &csum.hi),
1881                 };
1882                 return;
1883         }
1884
1885         if (bch_crc_bytes[csum_type]    <= 16 &&
1886             uncompressed_size           <= CRC128_SIZE_MAX &&
1887             nonce                       <= CRC128_NONCE_MAX) {
1888                 crc->crc128 = (struct bch_extent_crc128) {
1889                         .type = 1 << BCH_EXTENT_ENTRY_crc128,
1890                         ._compressed_size       = compressed_size - 1,
1891                         ._uncompressed_size     = uncompressed_size - 1,
1892                         .offset                 = 0,
1893                         .nonce                  = nonce,
1894                         .compression_type       = compression_type,
1895                         .csum_type              = csum_type,
1896                         .csum                   = csum,
1897                 };
1898                 return;
1899         }
1900
1901         BUG();
1902 }
1903
1904 void bch2_extent_crc_append(struct bkey_i_extent *e,
1905                             unsigned compressed_size,
1906                             unsigned uncompressed_size,
1907                             unsigned compression_type,
1908                             unsigned nonce,
1909                             struct bch_csum csum, unsigned csum_type)
1910 {
1911         union bch_extent_crc *crc;
1912
1913         BUG_ON(compressed_size > uncompressed_size);
1914         BUG_ON(uncompressed_size != e->k.size);
1915         BUG_ON(!compressed_size || !uncompressed_size);
1916
1917         /*
1918          * Look up the last crc entry, so we can check if we need to add
1919          * another:
1920          */
1921         extent_for_each_crc(extent_i_to_s(e), crc)
1922                 ;
1923
1924         if (!crc && !csum_type && !compression_type)
1925                 return;
1926
1927         if (crc &&
1928             crc_compressed_size(&e->k, crc)     == compressed_size &&
1929             crc_uncompressed_size(&e->k, crc)   == uncompressed_size &&
1930             crc_offset(crc)                     == 0 &&
1931             crc_nonce(crc)                      == nonce &&
1932             crc_csum_type(crc)                  == csum_type &&
1933             crc_compression_type(crc)           == compression_type &&
1934             crc_csum(crc).lo                    == csum.lo &&
1935             crc_csum(crc).hi                    == csum.hi)
1936                 return;
1937
1938         bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
1939                             compressed_size,
1940                             uncompressed_size,
1941                             compression_type,
1942                             nonce, csum, csum_type);
1943         __extent_entry_push(e);
1944 }
1945
1946 /*
1947  * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
1948  *
1949  * Returns true if @k should be dropped entirely
1950  *
1951  * For existing keys, only called when btree nodes are being rewritten, not when
1952  * they're merely being compacted/resorted in memory.
1953  */
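/*
 * A sketch of the effect of the logic below: a cached extent whose pointers
 * have all gone stale ends up with an empty value and is downgraded to a
 * KEY_TYPE_DISCARD whiteout (and dropped entirely if its version is zero),
 * while a non-cached extent in the same situation becomes KEY_TYPE_ERROR so
 * that reads see an error rather than silently missing data.
 */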
1954 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
1955 {
1956         struct bkey_s_extent e;
1957
1958         switch (k.k->type) {
1959         case KEY_TYPE_ERROR:
1960                 return false;
1961
1962         case KEY_TYPE_DELETED:
1963         case KEY_TYPE_COOKIE:
1964                 return true;
1965
1966         case KEY_TYPE_DISCARD:
1967                 return bversion_zero(k.k->version);
1968
1969         case BCH_EXTENT:
1970         case BCH_EXTENT_CACHED:
1971                 e = bkey_s_to_extent(k);
1972
1973                 bch2_extent_drop_stale(c, e);
1974
1975                 if (!bkey_val_u64s(e.k)) {
1976                         if (bkey_extent_is_cached(e.k)) {
1977                                 k.k->type = KEY_TYPE_DISCARD;
1978                                 if (bversion_zero(k.k->version))
1979                                         return true;
1980                         } else {
1981                                 k.k->type = KEY_TYPE_ERROR;
1982                         }
1983                 }
1984
1985                 return false;
1986         case BCH_RESERVATION:
1987                 return false;
1988         default:
1989                 BUG();
1990         }
1991 }
1992
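/*
 * Mark up to @nr_cached pointers cached, lowest tier first: e.g. with
 * nr_cached == 1 and one pointer on a tier 0 device plus one on a tier 1
 * device, only the tier 0 pointer is flipped to cached.
 */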
1993 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
1994                                       struct bkey_s_extent e,
1995                                       unsigned nr_cached)
1996 {
1997         struct bch_extent_ptr *ptr;
1998         bool have_higher_tier;
1999         unsigned tier = 0;
2000
2001         if (!nr_cached)
2002                 return;
2003
2004         do {
2005                 have_higher_tier = false;
2006
2007                 extent_for_each_ptr(e, ptr) {
2008                         if (!ptr->cached &&
2009                             PTR_TIER(c, ptr) == tier) {
2010                                 ptr->cached = true;
2011                                 nr_cached--;
2012                                 if (!nr_cached)
2013                                         return;
2014                         }
2015
2016                         if (PTR_TIER(c, ptr) > tier)
2017                                 have_higher_tier = true;
2018                 }
2019
2020                 tier++;
2021         } while (have_higher_tier);
2022 }
2023
2024 /*
2025  * This picks a non-stale pointer, preferably from a device other than
2026  * avoid.  Avoid can be NULL, meaning pick any.  If there are no non-stale
2027  * pointers to other devices, it will still pick a pointer from avoid.
2028  * Note that it prefers lower-numbered pointers to higher-numbered pointers
2029  * as the pointers are sorted by tier, hence preferring pointers to tier 0
2030  * rather than pointers to tier 1.
2031  */
2032 void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
2033                                    struct bch_dev *avoid,
2034                                    struct extent_pick_ptr *ret)
2035 {
2036         struct bkey_s_c_extent e;
2037         const union bch_extent_crc *crc;
2038         const struct bch_extent_ptr *ptr;
2039
2040         switch (k.k->type) {
2041         case KEY_TYPE_DELETED:
2042         case KEY_TYPE_DISCARD:
2043         case KEY_TYPE_COOKIE:
2044                 ret->ca = NULL;
2045                 return;
2046
2047         case KEY_TYPE_ERROR:
2048                 ret->ca = ERR_PTR(-EIO);
2049                 return;
2050
2051         case BCH_EXTENT:
2052         case BCH_EXTENT_CACHED:
2053                 e = bkey_s_c_to_extent(k);
2054                 ret->ca = NULL;
2055
2056                 extent_for_each_ptr_crc(e, ptr, crc) {
2057                         struct bch_dev *ca = c->devs[ptr->dev];
2058
2059                         if (ptr->cached && ptr_stale(ca, ptr))
2060                                 continue;
2061
2062                         if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
2063                                 continue;
2064
2065                         if (ret->ca &&
2066                             (ca == avoid ||
2067                              ret->ca->mi.tier < ca->mi.tier))
2068                                 continue;
2069
2070                         if (!percpu_ref_tryget(&ca->io_ref))
2071                                 continue;
2072
2073                         if (ret->ca)
2074                                 percpu_ref_put(&ret->ca->io_ref);
2075
2076                         *ret = (struct extent_pick_ptr) {
2077                                 .crc = crc_to_128(e.k, crc),
2078                                 .ptr = *ptr,
2079                                 .ca = ca,
2080                         };
2081                 }
2082
2083                 if (!ret->ca && !bkey_extent_is_cached(e.k))
2084                         ret->ca = ERR_PTR(-EIO);
2085                 return;
2086
2087         case BCH_RESERVATION:
2088                 ret->ca = NULL;
2089                 return;
2090
2091         default:
2092                 BUG();
2093         }
2094 }
2095
2096 static enum merge_result bch2_extent_merge(struct bch_fs *c,
2097                                            struct btree *bk,
2098                                            struct bkey_i *l, struct bkey_i *r)
2099 {
2100         struct bkey_s_extent el, er;
2101         union bch_extent_entry *en_l, *en_r;
2102
2103         if (key_merging_disabled(c))
2104                 return BCH_MERGE_NOMERGE;
2105
2106         /*
2107          * Generic header checks
2108          * Assumes left and right are in order
2109          * Left and right must be exactly aligned
2110          */
2111
2112         if (l->k.u64s           != r->k.u64s ||
2113             l->k.type           != r->k.type ||
2114             bversion_cmp(l->k.version, r->k.version) ||
2115             bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
2116                 return BCH_MERGE_NOMERGE;
2117
2118         switch (l->k.type) {
2119         case KEY_TYPE_DELETED:
2120         case KEY_TYPE_DISCARD:
2121         case KEY_TYPE_ERROR:
2122                 /* These types are mergeable, and no val to check */
2123                 break;
2124
2125         case BCH_EXTENT:
2126         case BCH_EXTENT_CACHED:
2127                 el = bkey_i_to_s_extent(l);
2128                 er = bkey_i_to_s_extent(r);
2129
2130                 extent_for_each_entry(el, en_l) {
2131                         struct bch_extent_ptr *lp, *rp;
2132                         unsigned bucket_size;
2133
2134                         en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
2135
2136                         if ((extent_entry_type(en_l) !=
2137                              extent_entry_type(en_r)) ||
2138                             extent_entry_is_crc(en_l))
2139                                 return BCH_MERGE_NOMERGE;
2140
2141                         lp = &en_l->ptr;
2142                         rp = &en_r->ptr;
2143
2144                         if (lp->offset + el.k->size     != rp->offset ||
2145                             lp->dev                     != rp->dev ||
2146                             lp->gen                     != rp->gen)
2147                                 return BCH_MERGE_NOMERGE;
2148
2149                         /* We don't allow extents to straddle buckets: */
2150                         bucket_size = c->devs[lp->dev]->mi.bucket_size;
2151
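                        /*
                         * A worked example with made-up numbers: given a
                         * power-of-two bucket_size of 128 sectors,
                         * lp->offset == 120 and rp->offset == 128 mask to
                         * bucket offsets 0 and 128 respectively, so the
                         * merged extent would straddle a bucket boundary
                         * and we refuse to merge.
                         */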
2152                         if ((lp->offset & ~((u64) bucket_size - 1)) !=
2153                             (rp->offset & ~((u64) bucket_size - 1)))
2154                                 return BCH_MERGE_NOMERGE;
2155                 }
2156
2157                 break;
2158         case BCH_RESERVATION: {
2159                 struct bkey_i_reservation *li = bkey_i_to_reservation(l);
2160                 struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
2161
2162                 if (li->v.generation != ri->v.generation ||
2163                     li->v.nr_replicas != ri->v.nr_replicas)
2164                         return BCH_MERGE_NOMERGE;
2165                 break;
2166         }
2167         default:
2168                 return BCH_MERGE_NOMERGE;
2169         }
2170
2171         l->k.needs_whiteout |= r->k.needs_whiteout;
2172
2173         /* Keys with no pointers aren't restricted to one bucket and could
2174          * overflow KEY_SIZE
2175          */
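        /*
         * A sketch of the partial merge case with made-up sizes: if l spans
         * [0, KEY_SIZE_MAX - 10) and r spans the adjacent 30 sectors, l is
         * resized to end at offset KEY_SIZE_MAX and r is cut down to the
         * remaining 20 sectors, and the caller sees BCH_MERGE_PARTIAL with
         * both keys still valid.
         */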
2176         if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
2177                 bch2_key_resize(&l->k, KEY_SIZE_MAX);
2178                 bch2_cut_front(l->k.p, r);
2179                 return BCH_MERGE_PARTIAL;
2180         }
2181
2182         bch2_key_resize(&l->k, l->k.size + r->k.size);
2183
2184         return BCH_MERGE_MERGE;
2185 }
2186
2187 static void extent_i_save(struct btree *b, struct bkey_packed *dst,
2188                           struct bkey_i *src)
2189 {
2190         struct bkey_format *f = &b->format;
2191         struct bkey_i *dst_unpacked;
2192
2193         BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
2194
2195         /*
2196          * We don't want the bch2_verify_key_order() call in extent_save(),
2197          * because we may be out of order with deleted keys that are about to be
2198          * removed by extent_bset_insert()
2199          */
2200
2201         if ((dst_unpacked = packed_to_bkey(dst)))
2202                 bkey_copy(dst_unpacked, src);
2203         else
2204                 BUG_ON(!bch2_bkey_pack(dst, src, f));
2205 }
2206
2207 static bool extent_merge_one_overlapping(struct btree_iter *iter,
2208                                          struct bpos new_pos,
2209                                          struct bset_tree *t,
2210                                          struct bkey_packed *k, struct bkey uk,
2211                                          bool check, bool could_pack)
2212 {
2213         struct btree *b = iter->nodes[0];
2214         struct btree_node_iter *node_iter = &iter->node_iters[0];
2215
2216         BUG_ON(!bkey_deleted(k));
2217
2218         if (check) {
2219                 return !bkey_packed(k) || could_pack;
2220         } else {
2221                 uk.p = new_pos;
2222                 extent_save(b, node_iter, k, &uk);
2223                 bch2_bset_fix_invalidated_key(b, t, k);
2224                 bch2_btree_node_iter_fix(iter, b, node_iter, t,
2225                                         k, k->u64s, k->u64s);
2226                 return true;
2227         }
2228 }
2229
2230 static bool extent_merge_do_overlapping(struct btree_iter *iter,
2231                                         struct bkey *m, bool back_merge)
2232 {
2233         struct btree *b = iter->nodes[0];
2234         struct btree_node_iter *node_iter = &iter->node_iters[0];
2235         struct bset_tree *t;
2236         struct bkey_packed *k;
2237         struct bkey uk;
2238         struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
2239         bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
2240         bool check = true;
2241
2242         /*
2243          * @m is the new merged extent:
2244          *
2245          * The merge took place in the last bset; we know there can't be any 0
2246          * size extents overlapping with m there because if so they would have
2247          * been between the two extents we merged.
2248          *
2249          * But in the other bsets, we have to check for and fix such extents:
2250          */
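        /*
         * This runs in two passes (see @check): the first pass only verifies
         * that every colliding 0 size key could actually be moved to @new_pos
         * (it's either unpacked, or @new_pos still packs in this node's
         * format); only if they all can do we loop again and do the fixup for
         * real.
         */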
2251 do_fixup:
2252         for_each_bset(b, t) {
2253                 if (t == bset_tree_last(b))
2254                         break;
2255
2256                 /*
2257                  * If we don't find this bset in the iterator, we already got to
2258                  * the end of that bset, so start searching from the end.
2259                  */
2260                 k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
2261
2262                 if (k == btree_bkey_last(b, t))
2263                         k = bch2_bkey_prev_all(b, t, k);
2264                 if (!k)
2265                         continue;
2266
2267                 if (back_merge) {
2268                         /*
2269                          * Back merge: 0 size extents will be before the key
2270                          * that was just inserted (and thus the iterator
2271                          * position) - walk backwards to find them
2272                          */
2273                         for (;
2274                              k &&
2275                              (uk = bkey_unpack_key(b, k),
2276                               bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
2277                              k = bch2_bkey_prev_all(b, t, k)) {
2278                                 if (bkey_cmp(uk.p, m->p) >= 0)
2279                                         continue;
2280
2281                                 if (!extent_merge_one_overlapping(iter, new_pos,
2282                                                 t, k, uk, check, could_pack))
2283                                         return false;
2284                         }
2285                 } else {
2286                         /* Front merge - walk forwards */
2287                         for (;
2288                              k != btree_bkey_last(b, t) &&
2289                              (uk = bkey_unpack_key(b, k),
2290                               bkey_cmp(uk.p, m->p) < 0);
2291                              k = bkey_next(k)) {
2292                                 if (bkey_cmp(uk.p,
2293                                              bkey_start_pos(m)) <= 0)
2294                                         continue;
2295
2296                                 if (!extent_merge_one_overlapping(iter, new_pos,
2297                                                 t, k, uk, check, could_pack))
2298                                         return false;
2299                         }
2300                 }
2301         }
2302
2303         if (check) {
2304                 check = false;
2305                 goto do_fixup;
2306         }
2307
2308         return true;
2309 }
2310
2311 /*
2312  * When merging an extent that we're inserting into a btree node, the new merged
2313  * extent could overlap with an existing 0 size extent - if we don't fix that,
2314  * it'll break the btree node iterator, so this code finds those 0 size extents
2315  * and shifts them out of the way.
2316  *
2317  * Also unpacks and repacks.
2318  */
2319 static bool bch2_extent_merge_inline(struct bch_fs *c,
2320                                      struct btree_iter *iter,
2321                                      struct bkey_packed *l,
2322                                      struct bkey_packed *r,
2323                                      bool back_merge)
2324 {
2325         struct btree *b = iter->nodes[0];
2326         struct btree_node_iter *node_iter = &iter->node_iters[0];
2327         const struct bkey_format *f = &b->format;
2328         struct bset_tree *t = bset_tree_last(b);
2329         struct bkey_packed *m;
2330         BKEY_PADDED(k) li;
2331         BKEY_PADDED(k) ri;
2332         struct bkey_i *mi;
2333         struct bkey tmp;
2334
2335         /*
2336          * We need to save copies of both l and r, because we might get a
2337          * partial merge (which modifies both) and then fail to repack
2338          */
2339         bch2_bkey_unpack(b, &li.k, l);
2340         bch2_bkey_unpack(b, &ri.k, r);
2341
2342         m = back_merge ? l : r;
2343         mi = back_merge ? &li.k : &ri.k;
2344
2345         /* l & r should be in last bset: */
2346         EBUG_ON(bch2_bkey_to_bset(b, m) != t);
2347
2348         switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
2349         case BCH_MERGE_NOMERGE:
2350                 return false;
2351         case BCH_MERGE_PARTIAL:
2352                 if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
2353                         return false;
2354
2355                 if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
2356                         return false;
2357
2358                 extent_i_save(b, m, mi);
2359                 bch2_bset_fix_invalidated_key(b, t, m);
2360
2361                 /*
2362                  * Update iterator to reflect what we just inserted - otherwise,
2363                  * the iter_fix() call is going to put us _before_ the key we
2364                  * just partially merged with:
2365                  */
2366                 if (back_merge)
2367                         bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
2368
2369                 bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
2370                                         t, m, m->u64s, m->u64s);
2371
2372                 if (!back_merge)
2373                         bkey_copy(packed_to_bkey(l), &li.k);
2374                 else
2375                         bkey_copy(packed_to_bkey(r), &ri.k);
2376                 return false;
2377         case BCH_MERGE_MERGE:
2378                 if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
2379                         return false;
2380
2381                 if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
2382                         return false;
2383
2384                 extent_i_save(b, m, &li.k);
2385                 bch2_bset_fix_invalidated_key(b, t, m);
2386
2387                 bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
2388                                         t, m, m->u64s, m->u64s);
2389                 return true;
2390         default:
2391                 BUG();
2392         }
2393 }
2394
2395 const struct bkey_ops bch2_bkey_extent_ops = {
2396         .key_invalid    = bch2_extent_invalid,
2397         .key_debugcheck = bch2_extent_debugcheck,
2398         .val_to_text    = bch2_extent_to_text,
2399         .swab           = bch2_ptr_swab,
2400         .key_normalize  = bch2_ptr_normalize,
2401         .key_merge      = bch2_extent_merge,
2402         .is_extents     = true,
2403 };