1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  *
4  * Code for managing the extent btree and dynamically updating the writeback
5  * dirty sector count.
6  */
7
8 #include "bcache.h"
9 #include "bkey_methods.h"
10 #include "btree_gc.h"
11 #include "btree_update.h"
12 #include "checksum.h"
13 #include "debug.h"
14 #include "dirent.h"
15 #include "error.h"
16 #include "extents.h"
17 #include "inode.h"
18 #include "journal.h"
19 #include "super-io.h"
20 #include "writeback.h"
21 #include "xattr.h"
22
23 #include <trace/events/bcache.h>
24
25 static enum merge_result bch_extent_merge(struct bch_fs *, struct btree *,
26                                           struct bkey_i *, struct bkey_i *);
27
28 static void sort_key_next(struct btree_node_iter *iter,
29                           struct btree *b,
30                           struct btree_node_iter_set *i)
31 {
32         i->k += __btree_node_offset_to_key(b, i->k)->u64s;
33
34         if (i->k == i->end)
35                 *i = iter->data[--iter->used];
36 }
37
38 /*
39  * Returns true if l > r; if l == r, breaks the tie by age so that the older
40  * key sorts first.
41  *
42  * Necessary for bch_key_sort_fix_overlapping() - when multiple keys compare
43  * equal in different sets, should_drop_next_key() expects the older one first.
44  */
45 #define key_sort_cmp(l, r)                                              \
46 ({                                                                      \
47         int _c = bkey_cmp_packed(b,                                     \
48                                  __btree_node_offset_to_key(b, (l).k),  \
49                                  __btree_node_offset_to_key(b, (r).k)); \
50                                                                         \
51         _c ? _c > 0 : (l).k > (r).k;                                    \
52 })
53
54 static inline bool should_drop_next_key(struct btree_node_iter *iter,
55                                         struct btree *b)
56 {
57         struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
58         struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
59
60         if (bkey_whiteout(k))
61                 return true;
62
63         if (iter->used < 2)
64                 return false;
65
66         if (iter->used > 2 &&
67             key_sort_cmp(r[0], r[1]))
68                 r++;
69
70         /*
71          * key_sort_cmp() ensures that when keys compare equal the older key
72          * comes first; so if l->k compares equal to r->k then l->k is older and
73          * should be dropped.
74          */
75         return !bkey_cmp_packed(b,
76                                 __btree_node_offset_to_key(b, l->k),
77                                 __btree_node_offset_to_key(b, r->k));
78 }
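/*
 * Worked example (illustrative, not from the original source): if bset 0 and
 * bset 2 of a node both contain a key at the same position, the heap ordering
 * above puts the copy from bset 0 - the lower node offset, i.e. the older
 * bset - first, and should_drop_next_key() then sees that the following key
 * compares equal and drops that older copy, so only the newest version of the
 * key reaches the sorted output produced below.
 */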
79
80 struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *dst,
81                                                   struct btree *b,
82                                                   struct btree_node_iter *iter)
83 {
84         struct bkey_packed *out = dst->start;
85         struct btree_nr_keys nr;
86
87         memset(&nr, 0, sizeof(nr));
88
89         heap_resort(iter, key_sort_cmp);
90
91         while (!bch_btree_node_iter_end(iter)) {
92                 if (!should_drop_next_key(iter, b)) {
93                         struct bkey_packed *k =
94                                 __btree_node_offset_to_key(b, iter->data->k);
95
96                         bkey_copy(out, k);
97                         btree_keys_account_key_add(&nr, 0, out);
98                         out = bkey_next(out);
99                 }
100
101                 sort_key_next(iter, b, iter->data);
102                 heap_sift(iter, 0, key_sort_cmp);
103         }
104
105         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
106         return nr;
107 }
108
109 /* Common among btree and extent ptrs */
110
111 const struct bch_extent_ptr *
112 bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
113 {
114         const struct bch_extent_ptr *ptr;
115
116         extent_for_each_ptr(e, ptr)
117                 if (ptr->dev == dev)
118                         return ptr;
119
120         return NULL;
121 }
122
123 unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
124 {
125         const struct bch_extent_ptr *ptr;
126         unsigned nr_ptrs = 0;
127
128         extent_for_each_ptr(e, ptr)
129                 nr_ptrs++;
130
131         return nr_ptrs;
132 }
133
134 unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c k)
135 {
136         struct bkey_s_c_extent e;
137         const struct bch_extent_ptr *ptr;
138         unsigned nr_ptrs = 0;
139
140         switch (k.k->type) {
141         case BCH_EXTENT:
142         case BCH_EXTENT_CACHED:
143                 e = bkey_s_c_to_extent(k);
144
145                 extent_for_each_ptr(e, ptr)
146                         nr_ptrs += !ptr->cached;
147                 break;
148
149         case BCH_RESERVATION:
150                 nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
151                 break;
152         }
153
154         return nr_ptrs;
155 }
156
157 /* returns true if equal */
158 static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
159 {
160         return extent_crc_type(l) == extent_crc_type(r) &&
161                 !memcmp(l, r, extent_entry_bytes(to_entry(l)));
162 }
163
164 /* Increment pointers after @crc by crc's offset until the next crc entry: */
165 void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
166 {
167         union bch_extent_entry *entry;
168
169         extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
170                 if (!extent_entry_is_ptr(entry))
171                         return;
172
173                 entry->ptr.offset += crc_offset(crc);
174         }
175 }
176
177 /*
178  * We're writing another replica for this extent, so while we've got the data in
179  * memory we'll be computing a new checksum for the currently live data.
180  *
181  * If there are other replicas we aren't moving, and they are checksummed but
182  * not compressed, we can modify them to point to only the data that is
183  * currently live (so that readers won't have to bounce) while we've got the
184  * checksum we need:
185  *
186  * XXX: to guard against data being corrupted while in memory, instead of
187  * recomputing the checksum here, it would be better for the read path, instead
188  * of computing the checksum of the entire extent:
189  *
190  * | extent                              |
191  *
192  * compute the checksums of the live and dead data separately
193  * | dead data || live data || dead data |
194  *
195  * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and use
196  * crc_live here (which we verified was correct earlier)
197  *
198  * note: doesn't work with encryption
199  */
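/*
 * Worked example with made-up numbers: say e.k->size is 32 sectors, but an
 * uncompressed, checksummed replica still carries a crc entry covering 128
 * sectors with offset 64 (only sectors 64..95 of the checksummed region are
 * live). Narrowing that entry advances the pointers after it by 64 sectors
 * via bch_extent_crc_narrow_pointers(), sets offset to 0, sets both size
 * fields to e.k->size (the fields below store size - 1), and installs the
 * checksum that was just computed for the 32 live sectors.
 */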
200 void bch_extent_narrow_crcs(struct bkey_s_extent e)
201 {
202         union bch_extent_crc *crc;
203         bool have_wide = false, have_narrow = false;
204         struct bch_csum csum = { 0 };
205         unsigned csum_type = 0;
206
207         extent_for_each_crc(e, crc) {
208                 if (crc_compression_type(crc) ||
209                     bch_csum_type_is_encryption(crc_csum_type(crc)))
210                         continue;
211
212                 if (crc_uncompressed_size(e.k, crc) != e.k->size) {
213                         have_wide = true;
214                 } else {
215                         have_narrow = true;
216                         csum = crc_csum(crc);
217                         csum_type = crc_csum_type(crc);
218                 }
219         }
220
221         if (!have_wide || !have_narrow)
222                 return;
223
224         extent_for_each_crc(e, crc) {
225                 if (crc_compression_type(crc))
226                         continue;
227
228                 if (crc_uncompressed_size(e.k, crc) != e.k->size) {
229                         switch (extent_crc_type(crc)) {
230                         case BCH_EXTENT_CRC_NONE:
231                                 BUG();
232                         case BCH_EXTENT_CRC32:
233                                 if (bch_crc_bytes[csum_type] > 4)
234                                         continue;
235
236                                 bch_extent_crc_narrow_pointers(e, crc);
237                                 crc->crc32._compressed_size     = e.k->size - 1;
238                                 crc->crc32._uncompressed_size   = e.k->size - 1;
239                                 crc->crc32.offset               = 0;
240                                 crc->crc32.csum_type            = csum_type;
241                                 crc->crc32.csum                 = csum.lo;
242                                 break;
243                         case BCH_EXTENT_CRC64:
244                                 if (bch_crc_bytes[csum_type] > 10)
245                                         continue;
246
247                                 bch_extent_crc_narrow_pointers(e, crc);
248                                 crc->crc64._compressed_size     = e.k->size - 1;
249                                 crc->crc64._uncompressed_size   = e.k->size - 1;
250                                 crc->crc64.offset               = 0;
251                                 crc->crc64.csum_type            = csum_type;
252                                 crc->crc64.csum_lo              = csum.lo;
253                                 crc->crc64.csum_hi              = csum.hi;
254                                 break;
255                         case BCH_EXTENT_CRC128:
256                                 if (bch_crc_bytes[csum_type] > 16)
257                                         continue;
258
259                                 bch_extent_crc_narrow_pointers(e, crc);
260                                 crc->crc128._compressed_size    = e.k->size - 1;
261                                 crc->crc128._uncompressed_size  = e.k->size - 1;
262                                 crc->crc128.offset              = 0;
263                                 crc->crc128.csum_type           = csum_type;
264                                 crc->crc128.csum                = csum;
265                                 break;
266                         }
267                 }
268         }
269 }
270
271 void bch_extent_drop_redundant_crcs(struct bkey_s_extent e)
272 {
273         union bch_extent_entry *entry = e.v->start;
274         union bch_extent_crc *crc, *prev = NULL;
275
276         while (entry != extent_entry_last(e)) {
277                 union bch_extent_entry *next = extent_entry_next(entry);
278                 size_t crc_u64s = extent_entry_u64s(entry);
279
280                 if (!extent_entry_is_crc(entry))
281                         goto next;
282
283                 crc = entry_to_crc(entry);
284
285                 if (next == extent_entry_last(e)) {
286                         /* crc entry with no pointers after it: */
287                         goto drop;
288                 }
289
290                 if (extent_entry_is_crc(next)) {
291                         /* no pointers before next crc entry: */
292                         goto drop;
293                 }
294
295                 if (prev && crc_cmp(crc, prev)) {
296                         /* identical to previous crc entry: */
297                         goto drop;
298                 }
299
300                 if (!prev &&
301                     !crc_csum_type(crc) &&
302                     !crc_compression_type(crc)) {
303                         /* null crc entry: */
304                         bch_extent_crc_narrow_pointers(e, crc);
305                         goto drop;
306                 }
307
308                 prev = crc;
309 next:
310                 entry = next;
311                 continue;
312 drop:
313                 memmove_u64s_down(crc, next,
314                                   (u64 *) extent_entry_last(e) - (u64 *) next);
315                 e.k->u64s -= crc_u64s;
316         }
317
318         EBUG_ON(bkey_val_u64s(e.k) && !bch_extent_nr_ptrs(e.c));
319 }
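/*
 * Illustrative layouts handled above: in an entry list [crc A][crc B][ptr],
 * crc A has no pointers before the next crc entry and is dropped; a trailing
 * [crc] with no pointers after it is dropped; a crc entry identical to the
 * previous one is dropped; and a leading crc with neither a checksum type nor
 * a compression type is a "null" entry, so its offset is folded into the
 * following pointers and the entry itself is dropped.
 */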
320
321 static bool should_drop_ptr(const struct bch_fs *c,
322                             struct bkey_s_c_extent e,
323                             const struct bch_extent_ptr *ptr)
324 {
325         struct bch_dev *ca = c->devs[ptr->dev];
326
327         return ptr_stale(ca, ptr);
328 }
329
330 static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
331 {
332         struct bch_extent_ptr *ptr = &e.v->start->ptr;
333         bool dropped = false;
334
335         while ((ptr = extent_ptr_next(e, ptr)))
336                 if (should_drop_ptr(c, e.c, ptr)) {
337                         __bch_extent_drop_ptr(e, ptr);
338                         dropped = true;
339                 } else
340                         ptr++;
341
342         if (dropped)
343                 bch_extent_drop_redundant_crcs(e);
344 }
345
346 static bool bch_ptr_normalize(struct bch_fs *c, struct btree *bk,
347                               struct bkey_s k)
348 {
349         return bch_extent_normalize(c, k);
350 }
351
352 static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
353 {
354         switch (k->type) {
355         case BCH_EXTENT:
356         case BCH_EXTENT_CACHED: {
357                 union bch_extent_entry *entry;
358                 u64 *d = (u64 *) bkeyp_val(f, k);
359                 unsigned i;
360
361                 for (i = 0; i < bkeyp_val_u64s(f, k); i++)
362                         d[i] = swab64(d[i]);
363
364                 for (entry = (union bch_extent_entry *) d;
365                      entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
366                      entry = extent_entry_next(entry)) {
367                         switch (extent_entry_type(entry)) {
368                         case BCH_EXTENT_ENTRY_crc32:
369                                 entry->crc32.csum = swab32(entry->crc32.csum);
370                                 break;
371                         case BCH_EXTENT_ENTRY_crc64:
372                                 entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
373                                 entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
374                                 break;
375                         case BCH_EXTENT_ENTRY_crc128:
376                                 entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
377                                 entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
378                                 break;
379                         case BCH_EXTENT_ENTRY_ptr:
380                                 break;
381                         }
382                 }
383                 break;
384         }
385         }
386 }
387
388 static const char *extent_ptr_invalid(const struct bch_fs *c,
389                                       struct bkey_s_c_extent e,
390                                       const struct bch_extent_ptr *ptr,
391                                       unsigned size_ondisk,
392                                       bool metadata)
393 {
394         const struct bch_extent_ptr *ptr2;
395         struct bch_dev *ca;
396
397         if (ptr->dev >= c->sb.nr_devices)
398                 return "pointer to invalid device";
399
400         ca = c->devs[ptr->dev];
401         if (!ca)
402                 return "pointer to invalid device";
403
404         extent_for_each_ptr(e, ptr2)
405                 if (ptr != ptr2 && ptr->dev == ptr2->dev)
406                         return "multiple pointers to same device";
407
408         if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
409                 return "offset past end of device";
410
411         if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
412                 return "offset before first bucket";
413
414         if ((ptr->offset & (ca->mi.bucket_size - 1)) +
415             size_ondisk > ca->mi.bucket_size)
416                 return "spans multiple buckets";
417
418         if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
419                 return "device not marked as containing data";
420
421         return NULL;
422 }
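/*
 * Worked example for the bucket checks above (made-up numbers): with a
 * power-of-two bucket_size of 1024 sectors, a pointer at offset 2040 with
 * size_ondisk 16 gives (2040 & 1023) + 16 = 1032 > 1024, so the extent would
 * cross a bucket boundary and is rejected as "spans multiple buckets". Note
 * the mask assumes bucket_size is a power of two.
 */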
423
424 static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
425                                 size_t size, struct bkey_s_c_extent e)
426 {
427         char *out = buf, *end = buf + size;
428         const union bch_extent_entry *entry;
429         const union bch_extent_crc *crc;
430         const struct bch_extent_ptr *ptr;
431         struct bch_dev *ca;
432         bool first = true;
433
434 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
435
436         extent_for_each_entry(e, entry) {
437                 if (!first)
438                         p(" ");
439
440                 switch (__extent_entry_type(entry)) {
441                 case BCH_EXTENT_ENTRY_crc32:
442                 case BCH_EXTENT_ENTRY_crc64:
443                 case BCH_EXTENT_ENTRY_crc128:
444                         crc = entry_to_crc(entry);
445
446                         p("crc: c_size %u size %u offset %u csum %u compress %u",
447                           crc_compressed_size(e.k, crc),
448                           crc_uncompressed_size(e.k, crc),
449                           crc_offset(crc), crc_csum_type(crc),
450                           crc_compression_type(crc));
451                         break;
452                 case BCH_EXTENT_ENTRY_ptr:
453                         ptr = entry_to_ptr(entry);
454                         ca = c->devs[ptr->dev];
455
456                         p("ptr: %u:%llu gen %u%s", ptr->dev,
457                           (u64) ptr->offset, ptr->gen,
458                           ca && ptr_stale(ca, ptr)
459                           ? " stale" : "");
460                         break;
461                 default:
462                         p("(invalid extent entry %.16llx)", *((u64 *) entry));
463                         goto out;
464                 }
465
466                 first = false;
467         }
468 out:
469         if (bkey_extent_is_cached(e.k))
470                 p(" cached");
471 #undef p
472         return out - buf;
473 }
474
475 /* Btree ptrs */
476
477 static const char *bch_btree_ptr_invalid(const struct bch_fs *c,
478                                          struct bkey_s_c k)
479 {
480         if (bkey_extent_is_cached(k.k))
481                 return "cached";
482
483         if (k.k->size)
484                 return "nonzero key size";
485
486         if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
487                 return "value too big";
488
489         switch (k.k->type) {
490         case BCH_EXTENT: {
491                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
492                 const union bch_extent_entry *entry;
493                 const struct bch_extent_ptr *ptr;
494                 const union bch_extent_crc *crc;
495                 const char *reason;
496
497                 extent_for_each_entry(e, entry)
498                         if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
499                                 return "invalid extent entry type";
500
501                 extent_for_each_ptr_crc(e, ptr, crc) {
502                         reason = extent_ptr_invalid(c, e, ptr,
503                                                     c->sb.btree_node_size,
504                                                     true);
505                         if (reason)
506                                 return reason;
507                 }
508
509                 if (crc)
510                         return "has crc field";
511
512                 return NULL;
513         }
514
515         default:
516                 return "invalid value type";
517         }
518 }
519
520 static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
521                                  struct bkey_s_c k)
522 {
523         struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
524         const struct bch_extent_ptr *ptr;
525         unsigned seq;
526         const char *err;
527         char buf[160];
528         struct bucket *g;
529         struct bch_dev *ca;
530         unsigned replicas = 0;
531         bool bad;
532
533         extent_for_each_ptr(e, ptr) {
534                 ca = c->devs[ptr->dev];
535                 g = PTR_BUCKET(ca, ptr);
536                 replicas++;
537
538                 err = "stale";
539                 if (ptr_stale(ca, ptr))
540                         goto err;
541
542                 do {
543                         seq = read_seqcount_begin(&c->gc_pos_lock);
544                         bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
545                                 g->mark.data_type != BUCKET_BTREE;
546                 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
547
548                 err = "inconsistent";
549                 if (bad)
550                         goto err;
551         }
552
553         if (replicas < c->sb.meta_replicas_have) {
554                 bch_bkey_val_to_text(c, btree_node_type(b),
555                                      buf, sizeof(buf), k);
556                 bch_fs_bug(c,
557                         "btree key bad (too few replicas, %u < %u): %s",
558                         replicas, c->sb.meta_replicas_have, buf);
559                 return;
560         }
561
562         return;
563 err:
564         bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
565         bch_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i "
566                       "gen %i last_gc %i mark %08x",
567                       err, buf, PTR_BUCKET_NR(ca, ptr),
568                       g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
569                       ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
570                       (unsigned) g->mark.counter);
571 }
572
573 static void bch_btree_ptr_to_text(struct bch_fs *c, char *buf,
574                                   size_t size, struct bkey_s_c k)
575 {
576         char *out = buf, *end = buf + size;
577         const char *invalid;
578
579 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
580
581         if (bkey_extent_is_data(k.k))
582                 out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
583
584         invalid = bch_btree_ptr_invalid(c, k);
585         if (invalid)
586                 p(" invalid: %s", invalid);
587 #undef p
588 }
589
590 struct extent_pick_ptr
591 bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
592 {
593         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
594         const union bch_extent_crc *crc;
595         const struct bch_extent_ptr *ptr;
596         struct extent_pick_ptr pick = { .ca = NULL };
597
598         extent_for_each_ptr_crc(e, ptr, crc) {
599                 struct bch_dev *ca = c->devs[ptr->dev];
600                 struct btree *root = btree_node_root(c, b);
601
602                 if (bch_fs_inconsistent_on(crc, c,
603                                 "btree node pointer with crc at btree %u level %u/%u bucket %zu",
604                                 b->btree_id, b->level, root ? root->level : -1,
605                                 PTR_BUCKET_NR(ca, ptr)))
606                         break;
607
608                 if (bch_dev_inconsistent_on(ptr_stale(ca, ptr), ca,
609                                 "stale btree node pointer at btree %u level %u/%u bucket %zu",
610                                 b->btree_id, b->level, root ? root->level : -1,
611                                 PTR_BUCKET_NR(ca, ptr)))
612                         continue;
613
614                 if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
615                         continue;
616
617                 if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
618                         continue;
619
620                 if (!percpu_ref_tryget(&ca->io_ref))
621                         continue;
622
623                 if (pick.ca)
624                         percpu_ref_put(&pick.ca->io_ref);
625
626                 pick.ca         = ca;
627                 pick.ptr        = *ptr;
628         }
629
630         return pick;
631 }
632
633 const struct bkey_ops bch_bkey_btree_ops = {
634         .key_invalid    = bch_btree_ptr_invalid,
635         .key_debugcheck = btree_ptr_debugcheck,
636         .val_to_text    = bch_btree_ptr_to_text,
637         .swab           = bch_ptr_swab,
638 };
639
640 /* Extents */
641
642 static bool __bch_cut_front(struct bpos where, struct bkey_s k)
643 {
644         u64 len = 0;
645
646         if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
647                 return false;
648
649         EBUG_ON(bkey_cmp(where, k.k->p) > 0);
650
651         len = k.k->p.offset - where.offset;
652
653         BUG_ON(len > k.k->size);
654
655         /*
656          * Don't readjust offset if the key size is now 0, because that could
657          * cause offset to point to the next bucket:
658          */
659         if (!len)
660                 __set_bkey_deleted(k.k);
661         else if (bkey_extent_is_data(k.k)) {
662                 struct bkey_s_extent e = bkey_s_to_extent(k);
663                 struct bch_extent_ptr *ptr;
664                 union bch_extent_crc *crc, *prev_crc = NULL;
665
666                 extent_for_each_ptr_crc(e, ptr, crc) {
667                         switch (extent_crc_type(crc)) {
668                         case BCH_EXTENT_CRC_NONE:
669                                 ptr->offset += e.k->size - len;
670                                 break;
671                         case BCH_EXTENT_CRC32:
672                                 if (prev_crc != crc)
673                                         crc->crc32.offset += e.k->size - len;
674                                 break;
675                         case BCH_EXTENT_CRC64:
676                                 if (prev_crc != crc)
677                                         crc->crc64.offset += e.k->size - len;
678                                 break;
679                         case BCH_EXTENT_CRC128:
680                                 if (prev_crc != crc)
681                                         crc->crc128.offset += e.k->size - len;
682                                 break;
683                         }
684                         prev_crc = crc;
685                 }
686         }
687
688         k.k->size = len;
689
690         return true;
691 }
692
693 bool bch_cut_front(struct bpos where, struct bkey_i *k)
694 {
695         return __bch_cut_front(where, bkey_i_to_s(k));
696 }
697
698 bool bch_cut_back(struct bpos where, struct bkey *k)
699 {
700         u64 len = 0;
701
702         if (bkey_cmp(where, k->p) >= 0)
703                 return false;
704
705         EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
706
707         len = where.offset - bkey_start_offset(k);
708
709         BUG_ON(len > k->size);
710
711         k->p = where;
712         k->size = len;
713
714         if (!len)
715                 __set_bkey_deleted(k);
716
717         return true;
718 }
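/*
 * Illustrative example for the two cut helpers above (made-up numbers): for
 * an extent covering sectors [32, 48) (bkey_start_offset 32, p.offset 48,
 * size 16), cutting the front at offset 40 leaves [40, 48) - size becomes 8
 * and, for data keys, the crc offsets (or the raw pointer offsets, in the
 * BCH_EXTENT_CRC_NONE case) are advanced by the 8 dropped sectors - while
 * cutting the back at offset 40 leaves [32, 40), just moving k->p and
 * shrinking the size. Cutting an extent down to zero size marks it deleted.
 */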
719
720 /**
721  * bch_key_resize - adjust size of @k
722  *
723  * bkey_start_offset(k) will be preserved, modifies where the extent ends
724  */
725 void bch_key_resize(struct bkey *k,
726                     unsigned new_size)
727 {
728         k->p.offset -= k->size;
729         k->p.offset += new_size;
730         k->size = new_size;
731 }
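/*
 * E.g. (made-up numbers): for a key covering [32, 48) (size 16),
 * bch_key_resize(k, 4) subtracts the old size from p.offset (48 - 16 = 32,
 * the start) and adds the new one, leaving [32, 36) with size 4 - the start
 * offset is preserved and only the end moves.
 */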
732
733 /*
734  * In extent_sort_fix_overlapping(), insert_fixup_extent(),
735  * extent_merge_inline() - we're modifying, in place, keys that may be packed. To
736  * do that we have to unpack the key, modify the unpacked copy - then this
737  * repacks (or copies) the unpacked key back into the original as necessary.
738  */
739 static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
740                           struct bkey_packed *dst, struct bkey *src)
741 {
742         struct bkey_format *f = &b->format;
743         struct bkey_i *dst_unpacked;
744         bool ret;
745
746         if ((dst_unpacked = packed_to_bkey(dst))) {
747                 dst_unpacked->k = *src;
748                 ret = true;
749         } else {
750                 ret = bkey_pack_key(dst, src, f);
751         }
752
753         if (ret && iter)
754                 bch_verify_key_order(b, iter, dst);
755
756         return ret;
757 }
758
759 static void extent_save(struct btree *b, struct btree_node_iter *iter,
760                         struct bkey_packed *dst, struct bkey *src)
761 {
762         BUG_ON(!__extent_save(b, iter, dst, src));
763 }
764
765 /*
766  * Returns true if l > r - unless l == r, in which case returns true if l is
767  * older than r.
768  *
769  * Necessary for sort_fix_overlapping() - if there are multiple keys that
770  * compare equal in different sets, we have to process them newest to oldest.
771  */
772 #define extent_sort_cmp(l, r)                                           \
773 ({                                                                      \
774         struct bkey _ul = bkey_unpack_key(b,                            \
775                                 __btree_node_offset_to_key(b, (l).k));  \
776         struct bkey _ur = bkey_unpack_key(b,                            \
777                                 __btree_node_offset_to_key(b, (r).k));  \
778                                                                         \
779         int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur));  \
780         _c ? _c > 0 : (l).k < (r).k;                                    \
781 })
782
783 static inline void extent_sort_sift(struct btree_node_iter *iter,
784                                     struct btree *b, size_t i)
785 {
786         heap_sift(iter, i, extent_sort_cmp);
787 }
788
789 static inline void extent_sort_next(struct btree_node_iter *iter,
790                                     struct btree *b,
791                                     struct btree_node_iter_set *i)
792 {
793         sort_key_next(iter, b, i);
794         heap_sift(iter, i - iter->data, extent_sort_cmp);
795 }
796
797 static void extent_sort_append(struct bch_fs *c,
798                                struct btree *b,
799                                struct btree_nr_keys *nr,
800                                struct bkey_packed *start,
801                                struct bkey_packed **prev,
802                                struct bkey_packed *k)
803 {
804         struct bkey_format *f = &b->format;
805         BKEY_PADDED(k) tmp;
806
807         if (bkey_whiteout(k))
808                 return;
809
810         bkey_unpack(b, &tmp.k, k);
811
812         if (*prev &&
813             bch_extent_merge(c, b, (void *) *prev, &tmp.k))
814                 return;
815
816         if (*prev) {
817                 bkey_pack(*prev, (void *) *prev, f);
818
819                 btree_keys_account_key_add(nr, 0, *prev);
820                 *prev = bkey_next(*prev);
821         } else {
822                 *prev = start;
823         }
824
825         bkey_copy(*prev, &tmp.k);
826 }
827
828 struct btree_nr_keys bch_extent_sort_fix_overlapping(struct bch_fs *c,
829                                         struct bset *dst,
830                                         struct btree *b,
831                                         struct btree_node_iter *iter)
832 {
833         struct bkey_format *f = &b->format;
834         struct btree_node_iter_set *_l = iter->data, *_r;
835         struct bkey_packed *prev = NULL, *out, *lk, *rk;
836         struct bkey l_unpacked, r_unpacked;
837         struct bkey_s l, r;
838         struct btree_nr_keys nr;
839
840         memset(&nr, 0, sizeof(nr));
841
842         heap_resort(iter, extent_sort_cmp);
843
844         while (!bch_btree_node_iter_end(iter)) {
845                 lk = __btree_node_offset_to_key(b, _l->k);
846
847                 if (iter->used == 1) {
848                         extent_sort_append(c, b, &nr, dst->start, &prev, lk);
849                         extent_sort_next(iter, b, _l);
850                         continue;
851                 }
852
853                 _r = iter->data + 1;
854                 if (iter->used > 2 &&
855                     extent_sort_cmp(_r[0], _r[1]))
856                         _r++;
857
858                 rk = __btree_node_offset_to_key(b, _r->k);
859
860                 l = __bkey_disassemble(b, lk, &l_unpacked);
861                 r = __bkey_disassemble(b, rk, &r_unpacked);
862
863                 /* If current key and next key don't overlap, just append */
864                 if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
865                         extent_sort_append(c, b, &nr, dst->start, &prev, lk);
866                         extent_sort_next(iter, b, _l);
867                         continue;
868                 }
869
870                 /* Skip 0 size keys */
871                 if (!r.k->size) {
872                         extent_sort_next(iter, b, _r);
873                         continue;
874                 }
875
876                 /*
877                  * overlap: keep the newer key and trim the older key so they
878                  * don't overlap. comparing pointers tells us which one is
879                  * newer, since the bsets are appended one after the other.
880                  */
881
882                 /* can't happen because of comparison func */
883                 BUG_ON(_l->k < _r->k &&
884                        !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
885
886                 if (_l->k > _r->k) {
887                         /* l wins, trim r */
888                         if (bkey_cmp(l.k->p, r.k->p) >= 0) {
889                                 sort_key_next(iter, b, _r);
890                         } else {
891                                 __bch_cut_front(l.k->p, r);
892                                 extent_save(b, NULL, rk, r.k);
893                         }
894
895                         extent_sort_sift(iter, b, _r - iter->data);
896                 } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
897                         BKEY_PADDED(k) tmp;
898
899                         /*
900                          * r wins, but it overlaps in the middle of l - split l:
901                          */
902                         bkey_reassemble(&tmp.k, l.s_c);
903                         bch_cut_back(bkey_start_pos(r.k), &tmp.k.k);
904
905                         __bch_cut_front(r.k->p, l);
906                         extent_save(b, NULL, lk, l.k);
907
908                         extent_sort_sift(iter, b, 0);
909
910                         extent_sort_append(c, b, &nr, dst->start, &prev,
911                                            bkey_to_packed(&tmp.k));
912                 } else {
913                         bch_cut_back(bkey_start_pos(r.k), l.k);
914                         extent_save(b, NULL, lk, l.k);
915                 }
916         }
917
918         if (prev) {
919                 bkey_pack(prev, (void *) prev, f);
920                 btree_keys_account_key_add(&nr, 0, prev);
921                 out = bkey_next(prev);
922         } else {
923                 out = dst->start;
924         }
925
926         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
927         return nr;
928 }
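/*
 * Worked example for the overlap handling above (made-up keys): say the older
 * bset has l = [0, 16) and a newer bset has r = [4, 8). r wins, and since it
 * ends in the middle of l, l is split: a copy of l trimmed to [0, 4) is
 * appended to the output, and l itself is cut to [8, 16) and sifted back into
 * the heap. Had l been the newer key, r would have been dropped entirely
 * (l covers all of it); if r had instead extended past the end of l, only its
 * front would have been cut, at l's end position.
 */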
929
930 struct extent_insert_state {
931         struct btree_insert             *trans;
932         struct btree_insert_entry       *insert;
933         struct bpos                     committed;
934         struct bch_fs_usage     stats;
935
936         /* for deleting: */
937         struct bkey_i                   whiteout;
938         bool                            do_journal;
939         bool                            deleting;
940 };
941
942 static void bch_add_sectors(struct extent_insert_state *s,
943                             struct bkey_s_c k, u64 offset, s64 sectors)
944 {
945         struct bch_fs *c = s->trans->c;
946         struct btree *b = s->insert->iter->nodes[0];
947
948         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
949
950         if (!sectors)
951                 return;
952
953         bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
954                      &s->stats, s->trans->journal_res.seq);
955
956         if (bkey_extent_is_data(k.k) &&
957             !bkey_extent_is_cached(k.k))
958                 bcache_dev_sectors_dirty_add(c, k.k->p.inode, offset, sectors);
959 }
960
961 static void bch_subtract_sectors(struct extent_insert_state *s,
962                                  struct bkey_s_c k, u64 offset, s64 sectors)
963 {
964         bch_add_sectors(s, k, offset, -sectors);
965 }
966
967 /* These wrappers subtract exactly the sectors that we're removing from @k */
968 static void bch_cut_subtract_back(struct extent_insert_state *s,
969                                   struct bpos where, struct bkey_s k)
970 {
971         bch_subtract_sectors(s, k.s_c, where.offset,
972                              k.k->p.offset - where.offset);
973         bch_cut_back(where, k.k);
974 }
975
976 static void bch_cut_subtract_front(struct extent_insert_state *s,
977                                    struct bpos where, struct bkey_s k)
978 {
979         bch_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
980                              where.offset - bkey_start_offset(k.k));
981         __bch_cut_front(where, k);
982 }
983
984 static void bch_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
985 {
986         if (k.k->size)
987                 bch_subtract_sectors(s, k.s_c,
988                                      bkey_start_offset(k.k), k.k->size);
989         k.k->size = 0;
990         __set_bkey_deleted(k.k);
991 }
992
993 /*
994  * Note: If this returns true because only some pointers matched,
995  * we can lose some caching that had happened in the interim.
996  * Because cache promotion only promotes the part of the extent
997  * actually read, and not the whole extent, and due to the key
998  * splitting done in bch_extent_insert_fixup, preserving such
999  * caching is difficult.
1000  */
1001 static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
1002 {
1003         struct bkey_s_c_extent le, re;
1004         const struct bch_extent_ptr *lp, *rp;
1005         s64 offset;
1006
1007         BUG_ON(!l.k->size || !r.k->size);
1008
1009         if (l.k->type != r.k->type ||
1010             bversion_cmp(l.k->version, r.k->version))
1011                 return false;
1012
1013         switch (l.k->type) {
1014         case KEY_TYPE_COOKIE:
1015                 return !memcmp(bkey_s_c_to_cookie(l).v,
1016                                bkey_s_c_to_cookie(r).v,
1017                                sizeof(struct bch_cookie));
1018
1019         case BCH_EXTENT:
1020         case BCH_EXTENT_CACHED:
1021                 le = bkey_s_c_to_extent(l);
1022                 re = bkey_s_c_to_extent(r);
1023
1024                 /*
1025                  * bkey_cmpxchg() handles partial matches - when either l or r
1026                  * has been trimmed - so we need just to handle l or r not
1027                  * starting at the same place when checking for a match here.
1028                  *
1029                  * If the starts of the keys are different, we just apply that
1030                  * offset to the device pointer offsets when checking those -
1031                  * matching how bch_cut_front() adjusts device pointer offsets
1032                  * when adjusting the start of a key:
1033                  */
1034                 offset = bkey_start_offset(l.k) - bkey_start_offset(r.k);
1035
1036                 /*
1037                  * XXX: perhaps we only raced with copygc or tiering replacing
1038                  * one of the pointers: it should suffice to find _any_ matching
1039                  * pointer
1040                  */
1041
1042                 if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k))
1043                         return false;
1044
1045                 extent_for_each_ptr(le, lp) {
1046                         const union bch_extent_entry *entry =
1047                                 vstruct_idx(re.v, (u64 *) lp - le.v->_data);
1048
1049                         if (!extent_entry_is_ptr(entry))
1050                                 return false;
1051
1052                         rp = &entry->ptr;
1053
1054                         if (lp->offset  != rp->offset + offset ||
1055                             lp->dev     != rp->dev ||
1056                             lp->gen     != rp->gen)
1057                                 return false;
1058                 }
1059
1060                 return true;
1061         default:
1062                 return false;
1063         }
1064
1065 }
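/*
 * Example of the offset adjustment above (made-up numbers): if the original
 * key spanned [0, 16) with a plain (un-checksummed) pointer at device offset
 * 512, and the copy now in the btree has been trimmed to [8, 16), cutting the
 * front will have advanced that pointer to 520. With @l the trimmed key and
 * @r the original, offset is 8 and the pointers still match, since
 * lp->offset (520) == rp->offset (512) + offset (8).
 */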
1066
1067 /*
1068  * Returns BTREE_HOOK_DO_INSERT on success, BTREE_HOOK_NO_INSERT on failure
1069  * (failure meaning @new no longer overlaps with @k)
1070  *
1071  * If BTREE_HOOK_DO_INSERT was returned, we may have inserted up to one key in @b.
1072  * If BTREE_HOOK_NO_INSERT was returned, we may have inserted up to two keys in @b.
1073  *
1074  * On return, there is room in @res for at least one more key of the same size
1075  * as @new.
1076  */
1077 enum extent_insert_hook_ret bch_extent_cmpxchg(struct extent_insert_hook *hook,
1078                                                struct bpos committed_pos,
1079                                                struct bpos next_pos,
1080                                                struct bkey_s_c k,
1081                                                const struct bkey_i *new)
1082 {
1083         struct bch_replace_info *replace = container_of(hook,
1084                                         struct bch_replace_info, hook);
1085         struct bkey_i *old = &replace->key;
1086
1087         EBUG_ON(bkey_cmp(committed_pos, bkey_start_pos(&new->k)) < 0);
1088
1089         /* must have something to compare against */
1090         EBUG_ON(!bkey_val_u64s(&old->k));
1091
1092         /* new must be a subset of old */
1093         EBUG_ON(bkey_cmp(new->k.p, old->k.p) > 0 ||
1094                 bkey_cmp(bkey_start_pos(&new->k), bkey_start_pos(&old->k)) < 0);
1095
1096         if (k.k && bch_extent_cmpxchg_cmp(k, bkey_i_to_s_c(old))) {
1097                 replace->successes++;
1098                 return BTREE_HOOK_DO_INSERT;
1099         } else {
1100                 replace->failures++;
1101                 return BTREE_HOOK_NO_INSERT;
1102         }
1103 }
1104
1105 static bool bch_extent_merge_inline(struct bch_fs *,
1106                                     struct btree_iter *,
1107                                     struct bkey_packed *,
1108                                     struct bkey_packed *,
1109                                     bool);
1110
1111 #define MAX_LOCK_HOLD_TIME      (5 * NSEC_PER_MSEC)
1112
1113 static enum btree_insert_ret
1114 extent_insert_should_stop(struct extent_insert_state *s)
1115 {
1116         struct btree *b = s->insert->iter->nodes[0];
1117
1118         /*
1119          * Check if we have sufficient space in both the btree node and the
1120          * journal reservation:
1121          *
1122          * Each insert checks for room in the journal entry, but we check for
1123          * room in the btree node up-front. In the worst case, bkey_cmpxchg()
1124          * will insert two keys, and one iteration of this room will insert one
1125          * key, so we need room for three keys.
1126          */
1127         if (!bch_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
1128                 return BTREE_INSERT_BTREE_NODE_FULL;
1129         else if (!journal_res_insert_fits(s->trans, s->insert))
1130                 return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
1131         else
1132                 return BTREE_INSERT_OK;
1133 }
1134
1135 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
1136                                struct bkey_i *insert)
1137 {
1138         struct btree *b = iter->nodes[0];
1139         struct btree_node_iter *node_iter = &iter->node_iters[0];
1140         struct bset_tree *t = bset_tree_last(b);
1141         struct bkey_packed *where =
1142                 bch_btree_node_iter_bset_pos(node_iter, b, t);
1143         struct bkey_packed *prev = bkey_prev(b, t, where);
1144         struct bkey_packed *next_live_key = where;
1145         unsigned clobber_u64s;
1146
1147         if (prev)
1148                 where = bkey_next(prev);
1149
1150         while (next_live_key != btree_bkey_last(b, t) &&
1151                bkey_deleted(next_live_key))
1152                 next_live_key = bkey_next(next_live_key);
1153
1154         /*
1155          * Everything between where and next_live_key is now deleted keys, and
1156          * is overwritten:
1157          */
1158         clobber_u64s = (u64 *) next_live_key - (u64 *) where;
1159
1160         if (prev &&
1161             bch_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
1162                 goto drop_deleted_keys;
1163
1164         if (next_live_key != btree_bkey_last(b, t) &&
1165             bch_extent_merge_inline(c, iter, bkey_to_packed(insert),
1166                                     next_live_key, false))
1167                 goto drop_deleted_keys;
1168
1169         bch_bset_insert(b, node_iter, where, insert, clobber_u64s);
1170         bch_btree_node_iter_fix(iter, b, node_iter, t, where,
1171                                 clobber_u64s, where->u64s);
1172         return;
1173 drop_deleted_keys:
1174         bch_bset_delete(b, where, clobber_u64s);
1175         bch_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
1176 }
1177
1178 static void extent_insert_committed(struct extent_insert_state *s)
1179 {
1180         struct bch_fs *c = s->trans->c;
1181         struct btree_iter *iter = s->insert->iter;
1182         struct bkey_i *insert = !s->deleting
1183                 ? s->insert->k
1184                 : &s->whiteout;
1185         BKEY_PADDED(k) split;
1186
1187         EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
1188         EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
1189
1190         if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
1191                 return;
1192
1193         if (s->deleting && !s->do_journal) {
1194                 bch_cut_front(s->committed, insert);
1195                 goto done;
1196         }
1197
1198         EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
1199
1200         bkey_copy(&split.k, insert);
1201
1202         if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
1203             bkey_cmp(s->committed, insert->k.p) &&
1204             bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
1205                 /* XXX: possibly need to increase our reservation? */
1206                 bch_cut_subtract_back(s, s->committed,
1207                                       bkey_i_to_s(&split.k));
1208                 bch_cut_front(s->committed, insert);
1209                 bch_add_sectors(s, bkey_i_to_s_c(insert),
1210                                 bkey_start_offset(&insert->k),
1211                                 insert->k.size);
1212         } else {
1213                 bch_cut_back(s->committed, &split.k.k);
1214                 bch_cut_front(s->committed, insert);
1215         }
1216
1217         if (debug_check_bkeys(c))
1218                 bkey_debugcheck(c, iter->nodes[iter->level],
1219                                 bkey_i_to_s_c(&split.k));
1220
1221         bch_btree_journal_key(s->trans, iter, &split.k);
1222
1223         if (!s->deleting)
1224                 extent_bset_insert(c, iter, &split.k);
1225 done:
1226         bch_btree_iter_set_pos_same_leaf(iter, s->committed);
1227
1228         insert->k.needs_whiteout        = false;
1229         s->do_journal                   = false;
1230         s->trans->did_work              = true;
1231 }
1232
1233 static enum extent_insert_hook_ret
1234 __extent_insert_advance_pos(struct extent_insert_state *s,
1235                             struct bpos next_pos,
1236                             struct bkey_s_c k)
1237 {
1238         struct extent_insert_hook *hook = s->trans->hook;
1239         enum extent_insert_hook_ret ret;
1240 #if 0
1241         /*
1242          * Currently disabled for encryption - broken with fcollapse. Will have
1243          * to reenable when versions are exposed for send/receive - versions
1244          * will have to be monotonic then:
1245          */
1246         if (k.k && k.k->size &&
1247             !bversion_zero(s->insert->k->k.version) &&
1248             bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
1249                 ret = BTREE_HOOK_NO_INSERT;
1250         } else
1251 #endif
1252         if (hook)
1253                 ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
1254         else
1255                 ret = BTREE_HOOK_DO_INSERT;
1256
1257         EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
1258
1259         switch (ret) {
1260         case BTREE_HOOK_DO_INSERT:
1261                 break;
1262         case BTREE_HOOK_NO_INSERT:
1263                 extent_insert_committed(s);
1264                 bch_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
1265
1266                 bch_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
1267                 break;
1268         case BTREE_HOOK_RESTART_TRANS:
1269                 return ret;
1270         }
1271
1272         s->committed = next_pos;
1273         return ret;
1274 }
1275
1276 /*
1277  * Update iter->pos, marking how much of @insert we've processed, and call hook
1278  * fn:
1279  */
1280 static enum extent_insert_hook_ret
1281 extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
1282 {
1283         struct btree *b = s->insert->iter->nodes[0];
1284         struct bpos next_pos = bpos_min(s->insert->k->k.p,
1285                                         k.k ? k.k->p : b->key.k.p);
1286
1287         /* hole? */
1288         if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
1289                 bool have_uncommitted = bkey_cmp(s->committed,
1290                                 bkey_start_pos(&s->insert->k->k)) > 0;
1291
1292                 switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
1293                                                     bkey_s_c_null)) {
1294                 case BTREE_HOOK_DO_INSERT:
1295                         break;
1296                 case BTREE_HOOK_NO_INSERT:
1297                         /*
1298                          * we had to split @insert and insert the committed
1299                          * part - need to bail out and recheck journal
1300                          * reservation/btree node before we advance pos past @k:
1301                          */
1302                         if (have_uncommitted)
1303                                 return BTREE_HOOK_NO_INSERT;
1304                         break;
1305                 case BTREE_HOOK_RESTART_TRANS:
1306                         return BTREE_HOOK_RESTART_TRANS;
1307                 }
1308         }
1309
1310         /* avoid redundant calls to hook fn: */
1311         if (!bkey_cmp(s->committed, next_pos))
1312                 return BTREE_HOOK_DO_INSERT;
1313
1314         return __extent_insert_advance_pos(s, next_pos, k);
1315 }
1316
1317 static enum btree_insert_ret
1318 extent_insert_check_split_compressed(struct extent_insert_state *s,
1319                                      struct bkey_s_c k,
1320                                      enum bch_extent_overlap overlap)
1321 {
1322         struct bch_fs *c = s->trans->c;
1323         unsigned sectors;
1324
1325         if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
1326             (sectors = bkey_extent_is_compressed(k))) {
1327                 int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
1328
1329                 if (s->trans->flags & BTREE_INSERT_NOFAIL)
1330                         flags |= BCH_DISK_RESERVATION_NOFAIL;
1331
1332                 switch (bch_disk_reservation_add(c,
1333                                 s->trans->disk_res,
1334                                 sectors, flags)) {
1335                 case 0:
1336                         break;
1337                 case -ENOSPC:
1338                         return BTREE_INSERT_ENOSPC;
1339                 case -EINTR:
1340                         return BTREE_INSERT_NEED_GC_LOCK;
1341                 default:
1342                         BUG();
1343                 }
1344         }
1345
1346         return BTREE_INSERT_OK;
1347 }
1348
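/*
 * The four overlap cases handled by extent_squash() below, with made-up
 * offsets for an existing key k = [16, 32):
 *
 *   insert = [8, 24)   OVERLAP_FRONT:  k is cut to [24, 32)
 *   insert = [24, 40)  OVERLAP_BACK:   k is cut to [16, 24)
 *   insert = [8, 40)   OVERLAP_ALL:    k is dropped entirely
 *   insert = [20, 28)  OVERLAP_MIDDLE: [16, 20) and [28, 32) survive: k is
 *                                      trimmed in place and a copy covering
 *                                      the other piece is inserted separately
 */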
1349 static enum btree_insert_ret
1350 extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
1351               struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
1352               enum bch_extent_overlap overlap)
1353 {
1354         struct bch_fs *c = s->trans->c;
1355         struct btree_iter *iter = s->insert->iter;
1356         struct btree *b = iter->nodes[0];
1357         struct btree_node_iter *node_iter = &iter->node_iters[0];
1358
1359         switch (overlap) {
1360         case BCH_EXTENT_OVERLAP_FRONT:
1361                 /* insert overlaps with start of k: */
1362                 bch_cut_subtract_front(s, insert->k.p, k);
1363                 BUG_ON(bkey_deleted(k.k));
1364                 extent_save(b, node_iter, _k, k.k);
1365                 break;
1366
1367         case BCH_EXTENT_OVERLAP_BACK:
1368                 /* insert overlaps with end of k: */
1369                 bch_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
1370                 BUG_ON(bkey_deleted(k.k));
1371                 extent_save(b, node_iter, _k, k.k);
1372
1373                 /*
1374                  * As the auxiliary tree is indexed by the end of the
1375                  * key and we've just changed the end, update the
1376                  * auxiliary tree.
1377                  */
1378                 bch_bset_fix_invalidated_key(b, t, _k);
1379                 bch_btree_node_iter_fix(iter, b, node_iter, t,
1380                                         _k, _k->u64s, _k->u64s);
1381                 break;
1382
1383         case BCH_EXTENT_OVERLAP_ALL: {
1384                 struct bpos orig_pos = k.k->p;
1385
1386                 /* The insert key completely covers k, invalidate k */
1387                 if (!bkey_whiteout(k.k))
1388                         btree_keys_account_key_drop(&b->nr,
1389                                                 t - b->set, _k);
1390
1391                 bch_drop_subtract(s, k);
1392                 k.k->p = bkey_start_pos(&insert->k);
1393                 if (!__extent_save(b, node_iter, _k, k.k)) {
1394                         /*
1395                          * Couldn't repack: we aren't necessarily able
1396                          * to repack if the new key is outside the range
1397                          * of the old extent, so we have to split
1398                          * @insert:
1399                          */
1400                         k.k->p = orig_pos;
1401                         extent_save(b, node_iter, _k, k.k);
1402
1403                         if (extent_insert_advance_pos(s, k.s_c) ==
1404                             BTREE_HOOK_RESTART_TRANS)
1405                                 return BTREE_INSERT_NEED_TRAVERSE;
1406
1407                         extent_insert_committed(s);
1408                         /*
1409                          * We split and inserted up to k.k->p - that
1410                          * has to coincide with iter->pos, so that we
1411                          * don't have anything more we have to insert
1412                          * until we recheck our journal reservation:
1413                          */
1414                         EBUG_ON(bkey_cmp(s->committed, k.k->p));
1415                 } else {
1416                         bch_bset_fix_invalidated_key(b, t, _k);
1417                         bch_btree_node_iter_fix(iter, b, node_iter, t,
1418                                                 _k, _k->u64s, _k->u64s);
1419                 }
1420
1421                 break;
1422         }
1423         case BCH_EXTENT_OVERLAP_MIDDLE: {
1424                 BKEY_PADDED(k) split;
1425                 /*
1426                  * The insert key falls 'in the middle' of k:
1427                  * it splits k into 3 parts:
1428                  * - start only in k, preserve
1429                  * - middle common section, invalidate in k
1430                  * - end only in k, preserve
1431                  *
1432                  * We update the old key to preserve the start,
1433                  * insert will be the new common section, and
1434                  * we manually insert the end that we are preserving.
1435                  *
1436                  * Note: modify k _before_ doing the insert (which will
1437                  * move what k points to).
1438                  */
1439                 bkey_reassemble(&split.k, k.s_c);
1440                 split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
1441
1442                 bch_cut_back(bkey_start_pos(&insert->k), &split.k.k);
1443                 BUG_ON(bkey_deleted(&split.k.k));
1444
1445                 bch_cut_subtract_front(s, insert->k.p, k);
1446                 BUG_ON(bkey_deleted(k.k));
1447                 extent_save(b, node_iter, _k, k.k);
1448
1449                 bch_add_sectors(s, bkey_i_to_s_c(&split.k),
1450                                 bkey_start_offset(&split.k.k),
1451                                 split.k.k.size);
1452                 extent_bset_insert(c, iter, &split.k);
1453                 break;
1454         }
1455         }
1456
1457         return BTREE_INSERT_OK;
1458 }
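
/*
 * Editorial sketch, not part of the original source: a worked example of the
 * four overlap cases handled by the switch in extent_squash() above.  It only
 * uses helpers already visible in this file and is compiled out; the example
 * sector ranges are made up.
 */
#if 0
static enum bch_extent_overlap
classify_overlap_example(const struct bkey *insert, const struct bkey *k)
{
	/*
	 * With the existing extent k spanning sectors [100, 200):
	 *
	 *   insert [ 50, 150) -> BCH_EXTENT_OVERLAP_FRONT:  k trimmed to [150, 200)
	 *   insert [150, 250) -> BCH_EXTENT_OVERLAP_BACK:   k trimmed to [100, 150)
	 *   insert [ 50, 250) -> BCH_EXTENT_OVERLAP_ALL:    k dropped/whited out
	 *   insert [125, 175) -> BCH_EXTENT_OVERLAP_MIDDLE: k trimmed to [100, 125),
	 *                        plus a separately inserted copy trimmed to [175, 200)
	 */
	return bch_extent_overlap(insert, k);
}
#endif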
1459
1460 static enum btree_insert_ret
1461 bch_delete_fixup_extent(struct extent_insert_state *s)
1462 {
1463         struct bch_fs *c = s->trans->c;
1464         struct btree_iter *iter = s->insert->iter;
1465         struct btree *b = iter->nodes[0];
1466         struct btree_node_iter *node_iter = &iter->node_iters[0];
1467         struct bkey_packed *_k;
1468         struct bkey unpacked;
1469         struct bkey_i *insert = s->insert->k;
1470         enum btree_insert_ret ret = BTREE_INSERT_OK;
1471
1472         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
1473
1474         s->whiteout     = *insert;
1475         s->do_journal   = false;
1476
1477         while (bkey_cmp(s->committed, insert->k.p) < 0 &&
1478                (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
1479                (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
1480                 struct bset_tree *t = bch_bkey_to_bset(b, _k);
1481                 struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
1482                 enum bch_extent_overlap overlap;
1483
1484                 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
1485                 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1486
1487                 if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
1488                         break;
1489
1490                 if (bkey_whiteout(k.k)) {
1491                         s->committed = bpos_min(insert->k.p, k.k->p);
1492                         goto next;
1493                 }
1494
1495                 overlap = bch_extent_overlap(&insert->k, k.k);
1496
1497                 ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
1498                 if (ret != BTREE_INSERT_OK)
1499                         goto stop;
1500
1501                 switch (extent_insert_advance_pos(s, k.s_c)) {
1502                 case BTREE_HOOK_DO_INSERT:
1503                         break;
1504                 case BTREE_HOOK_NO_INSERT:
1505                         continue;
1506                 case BTREE_HOOK_RESTART_TRANS:
1507                         ret = BTREE_INSERT_NEED_TRAVERSE;
1508                         goto stop;
1509                 }
1510
1511                 s->do_journal = true;
1512
1513                 if (overlap == BCH_EXTENT_OVERLAP_ALL) {
1514                         btree_keys_account_key_drop(&b->nr,
1515                                                 t - b->set, _k);
1516                         bch_subtract_sectors(s, k.s_c,
1517                                              bkey_start_offset(k.k), k.k->size);
1518                         _k->type = KEY_TYPE_DISCARD;
1519                         reserve_whiteout(b, t, _k);
1520                 } else if (k.k->needs_whiteout ||
1521                            bset_written(b, bset(b, t))) {
1522                         struct bkey_i discard = *insert;
1523
1524                         switch (overlap) {
1525                         case BCH_EXTENT_OVERLAP_FRONT:
1526                                 bch_cut_front(bkey_start_pos(k.k), &discard);
1527                                 break;
1528                         case BCH_EXTENT_OVERLAP_BACK:
1529                                 bch_cut_back(k.k->p, &discard.k);
1530                                 break;
1531                         default:
1532                                 break;
1533                         }
1534
1535                         discard.k.needs_whiteout = true;
1536
1537                         ret = extent_squash(s, insert, t, _k, k, overlap);
1538                         BUG_ON(ret != BTREE_INSERT_OK);
1539
1540                         extent_bset_insert(c, iter, &discard);
1541                 } else {
1542                         ret = extent_squash(s, insert, t, _k, k, overlap);
1543                         BUG_ON(ret != BTREE_INSERT_OK);
1544                 }
1545 next:
1546                 bch_cut_front(s->committed, insert);
1547                 bch_btree_iter_set_pos_same_leaf(iter, s->committed);
1548         }
1549
1550         if (bkey_cmp(s->committed, insert->k.p) < 0 &&
1551             ret == BTREE_INSERT_OK &&
1552             extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
1553                 ret = BTREE_INSERT_NEED_TRAVERSE;
1554 stop:
1555         extent_insert_committed(s);
1556
1557         bch_fs_stats_apply(c, &s->stats, s->trans->disk_res,
1558                            gc_pos_btree_node(b));
1559
1560         EBUG_ON(bkey_cmp(iter->pos, s->committed));
1561         EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
1562
1563         bch_cut_front(iter->pos, insert);
1564
1565         if (insert->k.size && iter->at_end_of_leaf)
1566                 ret = BTREE_INSERT_NEED_TRAVERSE;
1567
1568         EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
1569
1570         return ret;
1571 }
1572
1573 /**
1574  * bch_extent_insert_fixup - insert a new extent and deal with overlaps
1575  *
1576  * This may result in not actually doing the insert, or inserting only a subset
1577  * of the insert key. For cmpxchg operations this is where that logic lives.
1578  *
1579  * All subsets of @insert that need to be inserted are inserted using
1580  * bch_btree_insert_and_journal(). If @b or @res fills up, this function stops
1581  * early (returning a status other than BTREE_INSERT_OK), setting @iter->pos to
1582  * the prefix of @insert that actually got inserted.
1583  *
1584  * BSET INVARIANTS: this function is responsible for maintaining all the
1585  * invariants for bsets of extents in memory. Things get really hairy with
1586  * zero size extents.
1587  *
1588  * within one bset:
1589  *
1590  * bkey_start_pos(bkey_next(k)) >= k
1591  * or bkey_start_offset(bkey_next(k)) >= k->offset
1592  *
1593  * i.e. strict ordering, no overlapping extents.
1594  *
1595  * multiple bsets (i.e. full btree node):
1596  *
1597  * ∀ k, j
1598  *   k.size != 0 ∧ j.size != 0 →
1599  *     ¬ (k > bkey_start_pos(j) ∧ k < j)
1600  *
1601  * i.e. no two overlapping keys _of nonzero size_
1602  *
1603  * We can't realistically maintain this invariant for zero size keys because of
1604  * the key merging done in bch_btree_insert_key() - for two mergeable keys k, j
1605  * there may be another 0 size key between them in another bset, and it will
1606  * thus overlap with the merged key.
1607  *
1608  * In addition, iter->pos indicates how much of @insert has been processed.
1609  * If iter->pos is not the same as the end of @insert, then key insertion
1610  * needs to continue/be retried.
1611  */
1612 enum btree_insert_ret
1613 bch_insert_fixup_extent(struct btree_insert *trans,
1614                         struct btree_insert_entry *insert)
1615 {
1616         struct bch_fs *c = trans->c;
1617         struct btree_iter *iter = insert->iter;
1618         struct btree *b = iter->nodes[0];
1619         struct btree_node_iter *node_iter = &iter->node_iters[0];
1620         struct bkey_packed *_k;
1621         struct bkey unpacked;
1622         enum btree_insert_ret ret = BTREE_INSERT_OK;
1623
1624         struct extent_insert_state s = {
1625                 .trans          = trans,
1626                 .insert         = insert,
1627                 .committed      = insert->iter->pos,
1628                 .deleting       = bkey_whiteout(&insert->k->k),
1629         };
1630
1631         EBUG_ON(iter->level);
1632         EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
1633
1634         if (s.deleting)
1635                 return bch_delete_fixup_extent(&s);
1636
1637         /*
1638          * As we process overlapping extents, we advance @iter->pos both to
1639          * signal to our caller (btree_insert_key()) how much of @insert->k has
1640          * been inserted, and also to keep @iter->pos consistent with
1641          * @insert->k and the node iterator that we're advancing:
1642          */
1643         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1644
1645         if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
1646                 bch_add_sectors(&s, bkey_i_to_s_c(insert->k),
1647                                 bkey_start_offset(&insert->k->k),
1648                                 insert->k->k.size);
1649
1650         while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
1651                (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
1652                (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
1653                 struct bset_tree *t = bch_bkey_to_bset(b, _k);
1654                 struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
1655                 enum bch_extent_overlap overlap;
1656
1657                 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1658                 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1659
1660                 if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
1661                         break;
1662
1663                 overlap = bch_extent_overlap(&insert->k->k, k.k);
1664
1665                 ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
1666                 if (ret != BTREE_INSERT_OK)
1667                         goto stop;
1668
1669                 if (!k.k->size)
1670                         goto squash;
1671
1672                 /*
1673                  * Only advance the pos & call the hook for nonzero size extents:
1674                  * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
1675                  * overlaps with @k:
1676                  */
1677                 switch (extent_insert_advance_pos(&s, k.s_c)) {
1678                 case BTREE_HOOK_DO_INSERT:
1679                         break;
1680                 case BTREE_HOOK_NO_INSERT:
1681                         continue;
1682                 case BTREE_HOOK_RESTART_TRANS:
1683                         ret = BTREE_INSERT_NEED_TRAVERSE;
1684                         goto stop;
1685                 }
1686
1687                 if (k.k->size &&
1688                     (k.k->needs_whiteout || bset_written(b, bset(b, t))))
1689                         insert->k->k.needs_whiteout = true;
1690
1691                 if (overlap == BCH_EXTENT_OVERLAP_ALL &&
1692                     bkey_whiteout(k.k) &&
1693                     k.k->needs_whiteout) {
1694                         unreserve_whiteout(b, t, _k);
1695                         _k->needs_whiteout = false;
1696                 }
1697 squash:
1698                 ret = extent_squash(&s, insert->k, t, _k, k, overlap);
1699                 if (ret != BTREE_INSERT_OK)
1700                         goto stop;
1701         }
1702
1703         if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
1704             ret == BTREE_INSERT_OK &&
1705             extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
1706                 ret = BTREE_INSERT_NEED_TRAVERSE;
1707 stop:
1708         extent_insert_committed(&s);
1709         /*
1710          * Subtract any remaining sectors from @insert, if we bailed out early
1711          * and didn't fully insert @insert:
1712          */
1713         if (insert->k->k.size &&
1714             !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
1715                 bch_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
1716                                      bkey_start_offset(&insert->k->k),
1717                                      insert->k->k.size);
1718
1719         bch_fs_stats_apply(c, &s.stats, trans->disk_res,
1720                            gc_pos_btree_node(b));
1721
1722         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1723         EBUG_ON(bkey_cmp(iter->pos, s.committed));
1724         EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
1725
1726         if (insert->k->k.size && iter->at_end_of_leaf)
1727                 ret = BTREE_INSERT_NEED_TRAVERSE;
1728
1729         EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
1730
1731         return ret;
1732 }
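
/*
 * Editorial sketch, not part of the original source: the "strict ordering, no
 * overlapping extents within one bset" invariant documented above
 * bch_insert_fixup_extent(), written out as a check over consecutive keys.
 * It assumes btree_bkey_first()/btree_bkey_last() iteration as used elsewhere
 * in this codebase, and is compiled out.
 */
#if 0
static bool bset_extents_ordered_example(struct btree *b, struct bset_tree *t)
{
	struct bkey_packed *_k;
	struct bkey prev, cur;
	bool have_prev = false;

	for (_k = btree_bkey_first(b, t);
	     _k != btree_bkey_last(b, t);
	     _k = bkey_next(_k)) {
		cur = bkey_unpack_key(b, _k);

		/* start of each key must be >= end of the previous key: */
		if (have_prev &&
		    bkey_cmp(bkey_start_pos(&cur), prev.p) < 0)
			return false;

		prev = cur;
		have_prev = true;
	}

	return true;
}
#endif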
1733
1734 static const char *bch_extent_invalid(const struct bch_fs *c,
1735                                       struct bkey_s_c k)
1736 {
1737         if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
1738                 return "value too big";
1739
1740         if (!k.k->size)
1741                 return "zero key size";
1742
1743         switch (k.k->type) {
1744         case BCH_EXTENT:
1745         case BCH_EXTENT_CACHED: {
1746                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
1747                 const union bch_extent_entry *entry;
1748                 const union bch_extent_crc *crc;
1749                 const struct bch_extent_ptr *ptr;
1750                 unsigned size_ondisk = e.k->size;
1751                 const char *reason;
1752
1753                 extent_for_each_entry(e, entry) {
1754                         if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
1755                                 return "invalid extent entry type";
1756
1757                         if (extent_entry_is_crc(entry)) {
1758                                 crc = entry_to_crc(entry);
1759
1760                                 if (crc_offset(crc) + e.k->size >
1761                                     crc_uncompressed_size(e.k, crc))
1762                                         return "checksum offset + key size > uncompressed size";
1763
1764                                 size_ondisk = crc_compressed_size(e.k, crc);
1765
1766                                 if (!bch_checksum_type_valid(c, crc_csum_type(crc)))
1767                                         return "invalid checksum type";
1768
1769                                 if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
1770                                         return "invalid compression type";
1771                         } else {
1772                                 ptr = entry_to_ptr(entry);
1773
1774                                 reason = extent_ptr_invalid(c, e, &entry->ptr,
1775                                                             size_ondisk, false);
1776                                 if (reason)
1777                                         return reason;
1778                         }
1779                 }
1780
1781                 return NULL;
1782         }
1783
1784         case BCH_RESERVATION: {
1785                 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
1786
1787                 if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
1788                         return "incorrect value size";
1789
1790                 if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
1791                         return "invalid nr_replicas";
1792
1793                 return NULL;
1794         }
1795
1796         default:
1797                 return "invalid value type";
1798         }
1799 }
1800
1801 static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
1802                                          struct bkey_s_c_extent e)
1803 {
1804         const struct bch_extent_ptr *ptr;
1805         struct bch_dev *ca;
1806         struct bucket *g;
1807         unsigned seq, stale;
1808         char buf[160];
1809         bool bad;
1810         unsigned ptrs_per_tier[BCH_TIER_MAX];
1811         unsigned replicas = 0;
1812
1813         /*
1814          * XXX: we should be doing most/all of these checks at startup time,
1815          * where we check bkey_invalid() in btree_node_read_done()
1816          *
1817          * But note that we can't check for stale pointers or incorrect gc marks
1818          * until after journal replay is done (it might be an extent that's
1819          * going to get overwritten during replay)
1820          */
1821
1822         memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
1823
1824         extent_for_each_ptr(e, ptr) {
1825                 ca = c->devs[ptr->dev];
1826                 g = PTR_BUCKET(ca, ptr);
1827                 replicas++;
1828                 ptrs_per_tier[ca->mi.tier]++;
1829
1830                 /*
1831                  * If journal replay hasn't finished, we might be seeing keys
1832                  * that will be overwritten by the time journal replay is done:
1833                  */
1834                 if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
1835                         continue;
1836
1837                 stale = 0;
1838
1839                 do {
1840                         struct bucket_mark mark;
1841
1842                         seq = read_seqcount_begin(&c->gc_pos_lock);
1843                         mark = READ_ONCE(g->mark);
1844
1845                         /* between mark and bucket gen */
1846                         smp_rmb();
1847
1848                         stale = ptr_stale(ca, ptr);
1849
1850                         bch_fs_bug_on(stale && !ptr->cached, c,
1851                                          "stale dirty pointer");
1852
1853                         bch_fs_bug_on(stale > 96, c,
1854                                          "key too stale: %i",
1855                                          stale);
1856
1857                         if (stale)
1858                                 break;
1859
1860                         bad = (mark.data_type != BUCKET_DATA ||
1861                                (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
1862                                 !mark.owned_by_allocator &&
1863                                 !(ptr->cached
1864                                   ? mark.cached_sectors
1865                                   : mark.dirty_sectors)));
1866                 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
1867
1868                 if (bad)
1869                         goto bad_ptr;
1870         }
1871
1872         if (replicas > BCH_REPLICAS_MAX) {
1873                 bch_bkey_val_to_text(c, btree_node_type(b), buf,
1874                                      sizeof(buf), e.s_c);
1875                 bch_fs_bug(c,
1876                         "extent key bad (too many replicas: %u): %s",
1877                         replicas, buf);
1878                 return;
1879         }
1880
1881         if (!bkey_extent_is_cached(e.k) &&
1882             replicas < c->sb.data_replicas_have) {
1883                 bch_bkey_val_to_text(c, btree_node_type(b), buf,
1884                                      sizeof(buf), e.s_c);
1885                 bch_fs_bug(c,
1886                         "extent key bad (too few replicas, %u < %u): %s",
1887                         replicas, c->sb.data_replicas_have, buf);
1888                 return;
1889         }
1890
1891         return;
1892
1893 bad_ptr:
1894         bch_bkey_val_to_text(c, btree_node_type(b), buf,
1895                              sizeof(buf), e.s_c);
1896         bch_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
1897                    "gen %i last_gc %i mark 0x%08x",
1898                    buf, PTR_BUCKET_NR(ca, ptr),
1899                    g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
1900                    ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
1901                    (unsigned) g->mark.counter);
1902         return;
1903 }
1904
1905 static void bch_extent_debugcheck(struct bch_fs *c, struct btree *b,
1906                                   struct bkey_s_c k)
1907 {
1908         switch (k.k->type) {
1909         case BCH_EXTENT:
1910         case BCH_EXTENT_CACHED:
1911                 bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
1912                 break;
1913         case BCH_RESERVATION:
1914                 break;
1915         default:
1916                 BUG();
1917         }
1918 }
1919
1920 static void bch_extent_to_text(struct bch_fs *c, char *buf,
1921                                size_t size, struct bkey_s_c k)
1922 {
1923         char *out = buf, *end = buf + size;
1924         const char *invalid;
1925
1926 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
1927
1928         if (bkey_extent_is_data(k.k))
1929                 out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
1930
1931         invalid = bch_extent_invalid(c, k);
1932         if (invalid)
1933                 p(" invalid: %s", invalid);
1934 #undef p
1935 }
1936
1937 static unsigned PTR_TIER(struct bch_fs *c,
1938                          const struct bch_extent_ptr *ptr)
1939 {
1940         return c->devs[ptr->dev]->mi.tier;
1941 }
1942
1943 static void bch_extent_crc_init(union bch_extent_crc *crc,
1944                                 unsigned compressed_size,
1945                                 unsigned uncompressed_size,
1946                                 unsigned compression_type,
1947                                 unsigned nonce,
1948                                 struct bch_csum csum, unsigned csum_type)
1949 {
1950         if (bch_crc_bytes[csum_type]    <= 4 &&
1951             uncompressed_size           <= CRC32_SIZE_MAX &&
1952             nonce                       <= CRC32_NONCE_MAX) {
1953                 crc->crc32 = (struct bch_extent_crc32) {
1954                         .type = 1 << BCH_EXTENT_ENTRY_crc32,
1955                         ._compressed_size       = compressed_size - 1,
1956                         ._uncompressed_size     = uncompressed_size - 1,
1957                         .offset                 = 0,
1958                         .compression_type       = compression_type,
1959                         .csum_type              = csum_type,
1960                         .csum                   = *((__le32 *) &csum.lo),
1961                 };
1962                 return;
1963         }
1964
1965         if (bch_crc_bytes[csum_type]    <= 10 &&
1966             uncompressed_size           <= CRC64_SIZE_MAX &&
1967             nonce                       <= CRC64_NONCE_MAX) {
1968                 crc->crc64 = (struct bch_extent_crc64) {
1969                         .type = 1 << BCH_EXTENT_ENTRY_crc64,
1970                         ._compressed_size       = compressed_size - 1,
1971                         ._uncompressed_size     = uncompressed_size - 1,
1972                         .offset                 = 0,
1973                         .nonce                  = nonce,
1974                         .compression_type       = compression_type,
1975                         .csum_type              = csum_type,
1976                         .csum_lo                = csum.lo,
1977                         .csum_hi                = *((__le16 *) &csum.hi),
1978                 };
1979                 return;
1980         }
1981
1982         if (bch_crc_bytes[csum_type]    <= 16 &&
1983             uncompressed_size           <= CRC128_SIZE_MAX &&
1984             nonce                       <= CRC128_NONCE_MAX) {
1985                 crc->crc128 = (struct bch_extent_crc128) {
1986                         .type = 1 << BCH_EXTENT_ENTRY_crc128,
1987                         ._compressed_size       = compressed_size - 1,
1988                         ._uncompressed_size     = uncompressed_size - 1,
1989                         .offset                 = 0,
1990                         .nonce                  = nonce,
1991                         .compression_type       = compression_type,
1992                         .csum_type              = csum_type,
1993                         .csum                   = csum,
1994                 };
1995                 return;
1996         }
1997
1998         BUG();
1999 }
2000
2001 void bch_extent_crc_append(struct bkey_i_extent *e,
2002                            unsigned compressed_size,
2003                            unsigned uncompressed_size,
2004                            unsigned compression_type,
2005                            unsigned nonce,
2006                            struct bch_csum csum, unsigned csum_type)
2007 {
2008         union bch_extent_crc *crc;
2009
2010         BUG_ON(compressed_size > uncompressed_size);
2011         BUG_ON(uncompressed_size != e->k.size);
2012         BUG_ON(!compressed_size || !uncompressed_size);
2013
2014         /*
2015          * Look up the last crc entry, so we can check if we need to add
2016          * another:
2017          */
2018         extent_for_each_crc(extent_i_to_s(e), crc)
2019                 ;
2020
2021         if (!crc && !csum_type && !compression_type)
2022                 return;
2023
2024         if (crc &&
2025             crc_compressed_size(&e->k, crc)     == compressed_size &&
2026             crc_uncompressed_size(&e->k, crc)   == uncompressed_size &&
2027             crc_offset(crc)                     == 0 &&
2028             crc_nonce(crc)                      == nonce &&
2029             crc_csum_type(crc)                  == csum_type &&
2030             crc_compression_type(crc)           == compression_type &&
2031             crc_csum(crc).lo                    == csum.lo &&
2032             crc_csum(crc).hi                    == csum.hi)
2033                 return;
2034
2035         bch_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
2036                             compressed_size,
2037                             uncompressed_size,
2038                             compression_type,
2039                             nonce, csum, csum_type);
2040         __extent_entry_push(e);
2041 }
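
/*
 * Editorial note, not part of the original source: bch_extent_crc_init() picks
 * the smallest crc entry format that can describe the data - crc32 when the
 * checksum fits in 4 bytes, the size fits in CRC32_SIZE_MAX and the nonce is
 * within CRC32_NONCE_MAX, otherwise falling through to crc64 and then crc128.
 * The compiled-out sketch below shows a typical append for uncompressed,
 * crc32c-checksummed data; the enum names BCH_COMPRESSION_NONE and
 * BCH_CSUM_CRC32C are assumed from the on-disk format definitions.
 */
#if 0
static void crc_append_example(struct bkey_i_extent *e, struct bch_csum csum)
{
	bch_extent_crc_append(e,
			      e->k.size,	/* compressed_size */
			      e->k.size,	/* uncompressed_size */
			      BCH_COMPRESSION_NONE,
			      0,		/* nonce */
			      csum, BCH_CSUM_CRC32C);
}
#endif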
2042
2043 /*
2044  * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
2045  *
2046  * Returns true if @k should be dropped entirely
2047  *
2048  * For existing keys, only called when btree nodes are being rewritten, not when
2049  * they're merely being compacted/resorted in memory.
2050  */
2051 bool bch_extent_normalize(struct bch_fs *c, struct bkey_s k)
2052 {
2053         struct bkey_s_extent e;
2054
2055         switch (k.k->type) {
2056         case KEY_TYPE_ERROR:
2057                 return false;
2058
2059         case KEY_TYPE_DELETED:
2060         case KEY_TYPE_COOKIE:
2061                 return true;
2062
2063         case KEY_TYPE_DISCARD:
2064                 return bversion_zero(k.k->version);
2065
2066         case BCH_EXTENT:
2067         case BCH_EXTENT_CACHED:
2068                 e = bkey_s_to_extent(k);
2069
2070                 bch_extent_drop_stale(c, e);
2071
2072                 if (!bkey_val_u64s(e.k)) {
2073                         if (bkey_extent_is_cached(e.k)) {
2074                                 k.k->type = KEY_TYPE_DISCARD;
2075                                 if (bversion_zero(k.k->version))
2076                                         return true;
2077                         } else {
2078                                 k.k->type = KEY_TYPE_ERROR;
2079                         }
2080                 }
2081
2082                 return false;
2083         case BCH_RESERVATION:
2084                 return false;
2085         default:
2086                 BUG();
2087         }
2088 }
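
/*
 * Editorial sketch, not part of the original source: typical use of
 * bch_extent_normalize() - a true return means the key carries no useful
 * information and can be dropped; otherwise the (possibly retyped) key is
 * kept.  bkey_i_to_s() is assumed to be the usual bkey_i -> bkey_s
 * conversion; the wrapper itself is hypothetical and compiled out.
 */
#if 0
static bool keep_key_example(struct bch_fs *c, struct bkey_i *k)
{
	/* May drop stale pointers and change k->k.type: */
	if (bch_extent_normalize(c, bkey_i_to_s(k)))
		return false;	/* drop the key entirely */

	return true;		/* keep it (possibly now DISCARD/ERROR) */
}
#endif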
2089
2090 void bch_extent_mark_replicas_cached(struct bch_fs *c,
2091                                      struct bkey_s_extent e,
2092                                      unsigned nr_cached)
2093 {
2094         struct bch_extent_ptr *ptr;
2095         bool have_higher_tier;
2096         unsigned tier = 0;
2097
2098         if (!nr_cached)
2099                 return;
2100
2101         do {
2102                 have_higher_tier = false;
2103
2104                 extent_for_each_ptr(e, ptr) {
2105                         if (!ptr->cached &&
2106                             PTR_TIER(c, ptr) == tier) {
2107                                 ptr->cached = true;
2108                                 nr_cached--;
2109                                 if (!nr_cached)
2110                                         return;
2111                         }
2112
2113                         if (PTR_TIER(c, ptr) > tier)
2114                                 have_higher_tier = true;
2115                 }
2116
2117                 tier++;
2118         } while (have_higher_tier);
2119 }
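
/*
 * Editorial example, not part of the original source: with three pointers on
 * tiers 0, 0 and 1 and nr_cached == 2, the loop above marks both tier 0
 * pointers cached on its first pass and returns before touching the tier 1
 * pointer - lower tiers are always demoted to cached first.
 */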
2120
2121 /*
2122  * This picks a non-stale pointer, preferably from a device other than
2123  * @avoid.  @avoid can be NULL, meaning pick any.  If there are no non-stale
2124  * pointers to other devices, it will still pick a pointer from @avoid.
2125  * Note that it prefers lower-numbered pointers to higher-numbered pointers,
2126  * as the pointers are sorted by tier, hence preferring pointers to tier 0
2127  * rather than pointers to tier 1.
2128  */
2129 void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
2130                                   struct bch_dev *avoid,
2131                                   struct extent_pick_ptr *ret)
2132 {
2133         struct bkey_s_c_extent e;
2134         const union bch_extent_crc *crc;
2135         const struct bch_extent_ptr *ptr;
2136
2137         switch (k.k->type) {
2138         case KEY_TYPE_DELETED:
2139         case KEY_TYPE_DISCARD:
2140         case KEY_TYPE_COOKIE:
2141                 ret->ca = NULL;
2142                 return;
2143
2144         case KEY_TYPE_ERROR:
2145                 ret->ca = ERR_PTR(-EIO);
2146                 return;
2147
2148         case BCH_EXTENT:
2149         case BCH_EXTENT_CACHED:
2150                 e = bkey_s_c_to_extent(k);
2151                 ret->ca = NULL;
2152
2153                 extent_for_each_ptr_crc(e, ptr, crc) {
2154                         struct bch_dev *ca = c->devs[ptr->dev];
2155
2156                         if (ptr_stale(ca, ptr))
2157                                 continue;
2158
2159                         if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
2160                                 continue;
2161
2162                         if (ret->ca &&
2163                             (ca == avoid ||
2164                              ret->ca->mi.tier < ca->mi.tier))
2165                                 continue;
2166
2167                         if (!percpu_ref_tryget(&ca->io_ref))
2168                                 continue;
2169
2170                         if (ret->ca)
2171                                 percpu_ref_put(&ret->ca->io_ref);
2172
2173                         *ret = (struct extent_pick_ptr) {
2174                                 .crc = crc_to_128(e.k, crc),
2175                                 .ptr = *ptr,
2176                                 .ca = ca,
2177                         };
2178                 }
2179
2180                 if (!ret->ca && !bkey_extent_is_cached(e.k))
2181                         ret->ca = ERR_PTR(-EIO);
2182                 return;
2183
2184         case BCH_RESERVATION:
2185                 ret->ca = NULL;
2186                 return;
2187
2188         default:
2189                 BUG();
2190         }
2191 }
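
/*
 * Editorial sketch, not part of the original source: how a caller might
 * consume the result of bch_extent_pick_ptr_avoiding().  ret->ca is either
 * NULL (nothing to read), an ERR_PTR (e.g. -EIO for KEY_TYPE_ERROR), or a
 * device whose io_ref was elevated above and must be dropped; the read path
 * shown is hypothetical and compiled out.
 */
#if 0
static int pick_ptr_example(struct bch_fs *c, struct bkey_s_c k)
{
	struct extent_pick_ptr pick;

	bch_extent_pick_ptr_avoiding(c, k, NULL, &pick);

	if (IS_ERR(pick.ca))
		return PTR_ERR(pick.ca);
	if (!pick.ca)
		return 0;

	/* ... issue the read using pick.ptr and pick.crc ... */

	percpu_ref_put(&pick.ca->io_ref);
	return 0;
}
#endif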
2192
2193 static enum merge_result bch_extent_merge(struct bch_fs *c,
2194                                           struct btree *bk,
2195                                           struct bkey_i *l, struct bkey_i *r)
2196 {
2197         struct bkey_s_extent el, er;
2198         union bch_extent_entry *en_l, *en_r;
2199
2200         if (key_merging_disabled(c))
2201                 return BCH_MERGE_NOMERGE;
2202
2203         /*
2204          * Generic header checks
2205          * Assumes left and right are in order
2206          * Left and right must be exactly aligned
2207          */
2208
2209         if (l->k.u64s           != r->k.u64s ||
2210             l->k.type           != r->k.type ||
2211             bversion_cmp(l->k.version, r->k.version) ||
2212             bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
2213                 return BCH_MERGE_NOMERGE;
2214
2215         switch (l->k.type) {
2216         case KEY_TYPE_DELETED:
2217         case KEY_TYPE_DISCARD:
2218         case KEY_TYPE_ERROR:
2219                 /* These types are mergeable, and no val to check */
2220                 break;
2221
2222         case BCH_EXTENT:
2223         case BCH_EXTENT_CACHED:
2224                 el = bkey_i_to_s_extent(l);
2225                 er = bkey_i_to_s_extent(r);
2226
2227                 extent_for_each_entry(el, en_l) {
2228                         struct bch_extent_ptr *lp, *rp;
2229                         unsigned bucket_size;
2230
2231                         en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
2232
2233                         if ((extent_entry_type(en_l) !=
2234                              extent_entry_type(en_r)) ||
2235                             extent_entry_is_crc(en_l))
2236                                 return BCH_MERGE_NOMERGE;
2237
2238                         lp = &en_l->ptr;
2239                         rp = &en_r->ptr;
2240
2241                         if (lp->offset + el.k->size     != rp->offset ||
2242                             lp->dev                     != rp->dev ||
2243                             lp->gen                     != rp->gen)
2244                                 return BCH_MERGE_NOMERGE;
2245
2246                         /* We don't allow extents to straddle buckets: */
2247                         bucket_size = c->devs[lp->dev]->mi.bucket_size;
2248
2249                         if ((lp->offset & ~((u64) bucket_size - 1)) !=
2250                             (rp->offset & ~((u64) bucket_size - 1)))
2251                                 return BCH_MERGE_NOMERGE;
2252                 }
2253
2254                 break;
2255         case BCH_RESERVATION: {
2256                 struct bkey_i_reservation *li = bkey_i_to_reservation(l);
2257                 struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
2258
2259                 if (li->v.generation != ri->v.generation ||
2260                     li->v.nr_replicas != ri->v.nr_replicas)
2261                         return BCH_MERGE_NOMERGE;
2262                 break;
2263         }
2264         default:
2265                 return BCH_MERGE_NOMERGE;
2266         }
2267
2268         l->k.needs_whiteout |= r->k.needs_whiteout;
2269
2270         /* Keys with no pointers aren't restricted to one bucket and could
2271          * overflow KEY_SIZE
2272          */
2273         if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
2274                 bch_key_resize(&l->k, KEY_SIZE_MAX);
2275                 bch_cut_front(l->k.p, r);
2276                 return BCH_MERGE_PARTIAL;
2277         }
2278
2279         bch_key_resize(&l->k, l->k.size + r->k.size);
2280
2281         return BCH_MERGE_MERGE;
2282 }
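
/*
 * Editorial example, not part of the original source: bch_extent_merge() above
 * only merges extents that abut exactly - e.g. l covering sectors [100, 150)
 * and r covering [150, 200) with the same type and version - and whose
 * pointers are contiguous on disk: for each pointer, r's offset must equal l's
 * offset plus l's size, on the same device and generation, within the same
 * bucket, with no crc entries present.  If the combined size would exceed
 * KEY_SIZE_MAX, only a partial merge (BCH_MERGE_PARTIAL) is done.
 */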
2283
2284 static void extent_i_save(struct btree *b, struct bkey_packed *dst,
2285                           struct bkey_i *src)
2286 {
2287         struct bkey_format *f = &b->format;
2288         struct bkey_i *dst_unpacked;
2289
2290         BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
2291
2292         /*
2293          * We don't want the bch_verify_key_order() call in extent_save(),
2294          * because we may be out of order with deleted keys that are about to be
2295          * removed by extent_bset_insert()
2296          */
2297
2298         if ((dst_unpacked = packed_to_bkey(dst)))
2299                 bkey_copy(dst_unpacked, src);
2300         else
2301                 BUG_ON(!bkey_pack(dst, src, f));
2302 }
2303
2304 static bool extent_merge_one_overlapping(struct btree_iter *iter,
2305                                          struct bpos new_pos,
2306                                          struct bset_tree *t,
2307                                          struct bkey_packed *k, struct bkey uk,
2308                                          bool check, bool could_pack)
2309 {
2310         struct btree *b = iter->nodes[0];
2311         struct btree_node_iter *node_iter = &iter->node_iters[0];
2312
2313         BUG_ON(!bkey_deleted(k));
2314
2315         if (check) {
2316                 return !bkey_packed(k) || could_pack;
2317         } else {
2318                 uk.p = new_pos;
2319                 extent_save(b, node_iter, k, &uk);
2320                 bch_bset_fix_invalidated_key(b, t, k);
2321                 bch_btree_node_iter_fix(iter, b, node_iter, t,
2322                                         k, k->u64s, k->u64s);
2323                 return true;
2324         }
2325 }
2326
2327 static bool extent_merge_do_overlapping(struct btree_iter *iter,
2328                                         struct bkey *m, bool back_merge)
2329 {
2330         struct btree *b = iter->nodes[0];
2331         struct btree_node_iter *node_iter = &iter->node_iters[0];
2332         struct bset_tree *t;
2333         struct bkey_packed *k;
2334         struct bkey uk;
2335         struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
2336         bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
2337         bool check = true;
2338
2339         /*
2340          * @m is the new merged extent:
2341          *
2342          * The merge took place in the last bset; we know there can't be any 0
2343          * size extents overlapping with m there because if so they would have
2344          * been between the two extents we merged.
2345          *
2346          * But in the other bsets, we have to check for and fix such extents:
2347          */
2348 do_fixup:
2349         for_each_bset(b, t) {
2350                 if (t == bset_tree_last(b))
2351                         break;
2352
2353                 /*
2354                  * If we don't find this bset in the iterator, we already got to
2355                  * the end of that bset, so start searching from the end.
2356                  */
2357                 k = bch_btree_node_iter_bset_pos(node_iter, b, t);
2358
2359                 if (k == btree_bkey_last(b, t))
2360                         k = bkey_prev_all(b, t, k);
2361                 if (!k)
2362                         continue;
2363
2364                 if (back_merge) {
2365                         /*
2366                          * Back merge: 0 size extents will be before the key
2367                          * that was just inserted (and thus the iterator
2368                          * position) - walk backwards to find them
2369                          */
2370                         for (;
2371                              k &&
2372                              (uk = bkey_unpack_key(b, k),
2373                               bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
2374                              k = bkey_prev_all(b, t, k)) {
2375                                 if (bkey_cmp(uk.p, m->p) >= 0)
2376                                         continue;
2377
2378                                 if (!extent_merge_one_overlapping(iter, new_pos,
2379                                                 t, k, uk, check, could_pack))
2380                                         return false;
2381                         }
2382                 } else {
2383                         /* Front merge - walk forwards */
2384                         for (;
2385                              k != btree_bkey_last(b, t) &&
2386                              (uk = bkey_unpack_key(b, k),
2387                               bkey_cmp(uk.p, m->p) < 0);
2388                              k = bkey_next(k)) {
2389                                 if (bkey_cmp(uk.p,
2390                                              bkey_start_pos(m)) <= 0)
2391                                         continue;
2392
2393                                 if (!extent_merge_one_overlapping(iter, new_pos,
2394                                                 t, k, uk, check, could_pack))
2395                                         return false;
2396                         }
2397                 }
2398         }
2399
2400         if (check) {
2401                 check = false;
2402                 goto do_fixup;
2403         }
2404
2405         return true;
2406 }
2407
2408 /*
2409  * When merging an extent that we're inserting into a btree node, the new merged
2410  * extent could overlap with an existing 0 size extent - if we don't fix that,
2411  * it'll break the btree node iterator so this code finds those 0 size extents
2412  * it'll break the btree node iterator, so this code finds those 0 size extents
2413  *
2414  * Also unpacks and repacks.
2415  */
2416 static bool bch_extent_merge_inline(struct bch_fs *c,
2417                                     struct btree_iter *iter,
2418                                     struct bkey_packed *l,
2419                                     struct bkey_packed *r,
2420                                     bool back_merge)
2421 {
2422         struct btree *b = iter->nodes[0];
2423         struct btree_node_iter *node_iter = &iter->node_iters[0];
2424         const struct bkey_format *f = &b->format;
2425         struct bset_tree *t = bset_tree_last(b);
2426         struct bkey_packed *m;
2427         BKEY_PADDED(k) li;
2428         BKEY_PADDED(k) ri;
2429         struct bkey_i *mi;
2430         struct bkey tmp;
2431
2432         /*
2433          * We need to save copies of both l and r, because we might get a
2434          * partial merge (which modifies both) and then fail to repack:
2435          */
2436         bkey_unpack(b, &li.k, l);
2437         bkey_unpack(b, &ri.k, r);
2438
2439         m = back_merge ? l : r;
2440         mi = back_merge ? &li.k : &ri.k;
2441
2442         /* l & r should be in last bset: */
2443         EBUG_ON(bch_bkey_to_bset(b, m) != t);
2444
2445         switch (bch_extent_merge(c, b, &li.k, &ri.k)) {
2446         case BCH_MERGE_NOMERGE:
2447                 return false;
2448         case BCH_MERGE_PARTIAL:
2449                 if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &mi->k, f))
2450                         return false;
2451
2452                 if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
2453                         return false;
2454
2455                 extent_i_save(b, m, mi);
2456                 bch_bset_fix_invalidated_key(b, t, m);
2457
2458                 /*
2459                  * Update iterator to reflect what we just inserted - otherwise,
2460                  * the iter_fix() call is going to put us _before_ the key we
2461                  * just partially merged with:
2462                  */
2463                 if (back_merge)
2464                         bch_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
2465
2466                 bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
2467                                         t, m, m->u64s, m->u64s);
2468
2469                 if (!back_merge)
2470                         bkey_copy(packed_to_bkey(l), &li.k);
2471                 else
2472                         bkey_copy(packed_to_bkey(r), &ri.k);
2473                 return false;
2474         case BCH_MERGE_MERGE:
2475                 if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &li.k.k, f))
2476                         return false;
2477
2478                 if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
2479                         return false;
2480
2481                 extent_i_save(b, m, &li.k);
2482                 bch_bset_fix_invalidated_key(b, t, m);
2483
2484                 bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
2485                                         t, m, m->u64s, m->u64s);
2486                 return true;
2487         default:
2488                 BUG();
2489         }
2490 }
2491
2492 const struct bkey_ops bch_bkey_extent_ops = {
2493         .key_invalid    = bch_extent_invalid,
2494         .key_debugcheck = bch_extent_debugcheck,
2495         .val_to_text    = bch_extent_to_text,
2496         .swab           = bch_ptr_swab,
2497         .key_normalize  = bch_ptr_normalize,
2498         .key_merge      = bch_extent_merge,
2499         .is_extents     = true,
2500 };