1 /*
2  * Code for working with individual keys, and sorted sets of keys within a
3  * btree node
4  *
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "btree_cache.h"
10 #include "bset.h"
11 #include "eytzinger.h"
12 #include "util.h"
13
14 #include <asm/unaligned.h>
15 #include <linux/dynamic_fault.h>
16 #include <linux/console.h>
17 #include <linux/random.h>
18 #include <linux/prefetch.h>
19
20 /* hack.. */
21 #include "alloc_types.h"
22 #include <trace/events/bcachefs.h>
23
24 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
25 {
26         struct bset_tree *t;
27
28         for_each_bset(b, t)
29                 if (k >= btree_bkey_first(b, t) &&
30                     k < btree_bkey_last(b, t))
31                         return t;
32
33         BUG();
34 }
35
36 /*
37  * There are never duplicate live keys in the btree - but once we include
38  * keys that have been flagged as deleted (and will be cleaned up later) we
39  * _will_ see duplicates.
40  *
41  * Thus the sort order is: usual key comparison first, but for keys that compare
42  * equal the deleted key(s) come first, and the (at most one) live version comes
43  * last.
44  *
45  * The main reason for this is insertion: to handle overwrites, we first iterate
46  * over keys that compare equal to our insert key, and then insert immediately
47  * prior to the first key greater than the key we're inserting - our insert
48  * position will be after all keys that compare equal to our insert key, which
49  * by the time we actually do the insert will all be deleted.
50  */
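
/*
 * Illustrative sketch (a toy comparator over a simplified key with just a
 * position and a deleted flag - not the real bkey comparison machinery; the
 * names below are hypothetical): the ordering described above amounts to
 *
 *	static int toy_key_cmp(const struct toy_key *l, const struct toy_key *r)
 *	{
 *		int c = cmp_pos(l->pos, r->pos);	// usual key comparison first
 *
 *		if (c)
 *			return c;
 *		// equal positions: deleted key(s) sort before the live key
 *		return (int) !l->deleted - (int) !r->deleted;
 *	}
 *
 * so an insert position chosen after all keys that compare equal to the insert
 * key lands after (only) deleted keys.
 */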
51
52 void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
53 {
54         struct bkey_packed *_k, *_n;
55         struct bkey k, n;
56         char buf[120];
57
58         if (!i->u64s)
59                 return;
60
61         for (_k = i->start, k = bkey_unpack_key(b, _k);
62              _k < vstruct_last(i);
63              _k = _n, k = n) {
64                 _n = bkey_next(_k);
65
66                 bch2_bkey_to_text(buf, sizeof(buf), &k);
67                 printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
68                        _k->_data - i->_data, i->u64s, buf);
69
70                 if (_n == vstruct_last(i))
71                         continue;
72
73                 n = bkey_unpack_key(b, _n);
74
75                 if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
76                         printk(KERN_ERR "Key skipped backwards\n");
77                         continue;
78                 }
79
80                 /*
81                  * Weird check for duplicate non extent keys: extents are
82                  * deleted iff they have 0 size, so if a key has zero size and
83                  * isn't deleted, these aren't extents:
84                  */
85                 if (((!k.size && !bkey_deleted(&k)) ||
86                      (!n.size && !bkey_deleted(&n))) &&
87                     !bkey_deleted(&k) &&
88                     !bkey_cmp(n.p, k.p))
89                         printk(KERN_ERR "Duplicate keys\n");
90         }
91 }
92
93 void bch2_dump_btree_node(struct btree *b)
94 {
95         struct bset_tree *t;
96
97         console_lock();
98         for_each_bset(b, t)
99                 bch2_dump_bset(b, bset(b, t), t - b->set);
100         console_unlock();
101 }
102
103 void bch2_dump_btree_node_iter(struct btree *b,
104                               struct btree_node_iter *iter)
105 {
106         struct btree_node_iter_set *set;
107
108         printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
109
110         btree_node_iter_for_each(iter, set) {
111                 struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
112                 struct bset_tree *t = bch2_bkey_to_bset(b, k);
113                 struct bkey uk = bkey_unpack_key(b, k);
114                 char buf[100];
115
116                 bch2_bkey_to_text(buf, sizeof(buf), &uk);
117                 printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
118                        k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
119         }
120 }
121
122 #ifdef CONFIG_BCACHEFS_DEBUG
123
124 static bool keys_out_of_order(struct btree *b,
125                               const struct bkey_packed *prev,
126                               const struct bkey_packed *next,
127                               bool is_extents)
128 {
129         struct bkey nextu = bkey_unpack_key(b, next);
130
131         return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
132                 ((is_extents
133                   ? !bkey_deleted(next)
134                   : !bkey_deleted(prev)) &&
135                  !bkey_cmp_packed(b, prev, next));
136 }
137
138 void __bch2_verify_btree_nr_keys(struct btree *b)
139 {
140         struct bset_tree *t;
141         struct bkey_packed *k;
142         struct btree_nr_keys nr = { 0 };
143
144         for_each_bset(b, t)
145                 for (k = btree_bkey_first(b, t);
146                      k != btree_bkey_last(b, t);
147                      k = bkey_next(k))
148                         if (!bkey_whiteout(k))
149                                 btree_keys_account_key_add(&nr, t - b->set, k);
150
151         BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
152 }
153
154 static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
155                                            struct btree *b,
156                                            struct bkey_packed *k)
157 {
158         const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);
159
160         bkey_unpack_key(b, k);
161
162         if (n &&
163             keys_out_of_order(b, k, n, iter->is_extents)) {
164                 struct bkey ku = bkey_unpack_key(b, k);
165                 struct bkey nu = bkey_unpack_key(b, n);
166                 char buf1[80], buf2[80];
167
168                 bch2_dump_btree_node(b);
169                 bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
170                 bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
171                 panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
172         }
173 }
174
175 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
176                                 struct btree *b)
177 {
178         struct btree_node_iter_set *set, *prev = NULL;
179         struct bset_tree *t;
180         struct bkey_packed *k, *first;
181
182         if (bch2_btree_node_iter_end(iter))
183                 return;
184
185         btree_node_iter_for_each(iter, set) {
186                 k = __btree_node_offset_to_key(b, set->k);
187                 t = bch2_bkey_to_bset(b, k);
188
189                 BUG_ON(__btree_node_offset_to_key(b, set->end) !=
190                        btree_bkey_last(b, t));
191
192                 BUG_ON(prev &&
193                        btree_node_iter_cmp(iter, b, *prev, *set) > 0);
194
195                 prev = set;
196         }
197
198         first = __btree_node_offset_to_key(b, iter->data[0].k);
199
200         for_each_bset(b, t)
201                 if (bch2_btree_node_iter_bset_pos(iter, b, t) ==
202                     btree_bkey_last(b, t) &&
203                     (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))))
204                         BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
205                                                      k, first) > 0);
206 }
207
208 void bch2_verify_key_order(struct btree *b,
209                           struct btree_node_iter *iter,
210                           struct bkey_packed *where)
211 {
212         struct bset_tree *t = bch2_bkey_to_bset(b, where);
213         struct bkey_packed *k, *prev;
214         struct bkey uk, uw = bkey_unpack_key(b, where);
215
216         k = bch2_bkey_prev_all(b, t, where);
217         if (k &&
218             keys_out_of_order(b, k, where, iter->is_extents)) {
219                 char buf1[100], buf2[100];
220
221                 bch2_dump_btree_node(b);
222                 uk = bkey_unpack_key(b, k);
223                 bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
224                 bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
225                 panic("out of order with prev:\n%s\n%s\n",
226                       buf1, buf2);
227         }
228
229         k = bkey_next(where);
230         BUG_ON(k != btree_bkey_last(b, t) &&
231                keys_out_of_order(b, where, k, iter->is_extents));
232
233         for_each_bset(b, t) {
234                 if (where >= btree_bkey_first(b, t) ||
235                     where < btree_bkey_last(b, t))
236                         continue;
237
238                 k = bch2_btree_node_iter_bset_pos(iter, b, t);
239
240                 if (k == btree_bkey_last(b, t))
241                         k = bch2_bkey_prev_all(b, t, k);
242
243                 while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
244                        (prev = bch2_bkey_prev_all(b, t, k)))
245                         k = prev;
246
247                 for (;
248                      k != btree_bkey_last(b, t);
249                      k = bkey_next(k)) {
250                         uk = bkey_unpack_key(b, k);
251
252                         if (iter->is_extents) {
253                                 BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
254                                          bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
255                         } else {
256                                 BUG_ON(!bkey_cmp(uw.p, uk.p) &&
257                                        !bkey_deleted(&uk));
258                         }
259
260                         if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
261                                 break;
262                 }
263         }
264 }
265
266 #else
267
268 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
269                                                    struct btree *b,
270                                                    struct bkey_packed *k) {}
271
272 #endif
273
274 /* Auxiliary search trees */
275
276 #define BFLOAT_FAILED_UNPACKED  (U8_MAX - 0)
277 #define BFLOAT_FAILED_PREV      (U8_MAX - 1)
278 #define BFLOAT_FAILED_OVERFLOW  (U8_MAX - 2)
279 #define BFLOAT_FAILED           (U8_MAX - 2)
280
281 #define KEY_WORDS               BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
282
283 struct bkey_float {
284         u8              exponent;
285         u8              key_offset;
286         union {
287                 u32     mantissa32;
288         struct {
289                 u16     mantissa16;
290                 u16     _pad;
291         };
292         };
293 } __packed;
294
295 #define BFLOAT_32BIT_NR         32U
296
297 static unsigned bkey_float_byte_offset(unsigned idx)
298 {
299         int d = (idx - BFLOAT_32BIT_NR) << 1;
300
301         d &= ~(d >> 31);
302
303         return idx * 6 - d;
304 }
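
/*
 * Worked example (following the layout above, nothing extra): the first
 * BFLOAT_32BIT_NR entries use the full 6 byte bkey_float (32 bit mantissa),
 * later entries only need 4 bytes (16 bit mantissa), so e.g.
 *
 *	bkey_float_byte_offset(10) = 10 * 6		= 60
 *	bkey_float_byte_offset(40) = 32 * 6 + 8 * 4	= 224
 *
 * which is what idx * 6 - max(0, 2 * (idx - BFLOAT_32BIT_NR)) computes, without
 * a branch.
 */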
305
306 struct ro_aux_tree {
307         struct bkey_float       _d[0];
308 };
309
310 struct rw_aux_tree {
311         u16             offset;
312         struct bpos     k;
313 };
314
315 /*
316  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
317  * it used to be 64, but I realized the lookup code would touch slightly less
318  * memory if it was 128.
319  *
320  * It defines the number of bytes (in struct bset) per struct bkey_float in
321  * the auxiliary search tree - when we're done searching the bset_float tree we
322  * have this many bytes left that we do a linear search over.
323  *
324  * Since (after level 5) every level of the bset_tree is on a new cacheline,
325  * we're touching one fewer cacheline in the bset tree in exchange for one more
326  * cacheline in the linear search - but the linear search might stop before it
327  * gets to the second cacheline.
328  */
329
330 #define BSET_CACHELINE          128
331
332 /* Space required for the btree node keys */
333 static inline size_t btree_keys_bytes(struct btree *b)
334 {
335         return PAGE_SIZE << b->page_order;
336 }
337
338 static inline size_t btree_keys_cachelines(struct btree *b)
339 {
340         return btree_keys_bytes(b) / BSET_CACHELINE;
341 }
342
343 static inline size_t btree_aux_data_bytes(struct btree *b)
344 {
345         return btree_keys_cachelines(b) * 8;
346 }
347
348 static inline size_t btree_aux_data_u64s(struct btree *b)
349 {
350         return btree_aux_data_bytes(b) / sizeof(u64);
351 }
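
/*
 * Worked example (assuming 4k pages and page_order 0, i.e. a 4k btree node):
 * btree_keys_bytes() = 4096, so btree_keys_cachelines() = 4096 / 128 = 32,
 * btree_aux_data_bytes() = 32 * 8 = 256 and btree_aux_data_u64s() = 32 - i.e.
 * we reserve 8 bytes of auxiliary search tree space per BSET_CACHELINE of keys.
 */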
352
353 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
354 {
355         BUG_ON(t->aux_data_offset == U16_MAX);
356
357         switch (bset_aux_tree_type(t)) {
358         case BSET_NO_AUX_TREE:
359                 return t->aux_data_offset;
360         case BSET_RO_AUX_TREE:
361                 return t->aux_data_offset +
362                         DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
363                                      sizeof(u8) * t->size, 8);
364         case BSET_RW_AUX_TREE:
365                 return t->aux_data_offset +
366                         DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
367         default:
368                 BUG();
369         }
370 }
371
372 static unsigned bset_aux_tree_buf_start(const struct btree *b,
373                                         const struct bset_tree *t)
374 {
375         return t == b->set
376                 ? DIV_ROUND_UP(b->unpack_fn_len, 8)
377                 : bset_aux_tree_buf_end(t - 1);
378 }
379
380 static void *__aux_tree_base(const struct btree *b,
381                              const struct bset_tree *t)
382 {
383         return b->aux_data + t->aux_data_offset * 8;
384 }
385
386 static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
387                                             const struct bset_tree *t)
388 {
389         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
390
391         return __aux_tree_base(b, t);
392 }
393
394 static u8 *ro_aux_tree_prev(const struct btree *b,
395                             const struct bset_tree *t)
396 {
397         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
398
399         return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
400 }
401
402 static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
403                                          unsigned idx)
404 {
405         return (void *) b + bkey_float_byte_offset(idx);
406 }
407
408 static struct bkey_float *bkey_float(const struct btree *b,
409                                      const struct bset_tree *t,
410                                      unsigned idx)
411 {
412         return bkey_float_get(ro_aux_tree_base(b, t), idx);
413 }
414
415 static void bset_aux_tree_verify(struct btree *b)
416 {
417 #ifdef CONFIG_BCACHEFS_DEBUG
418         struct bset_tree *t;
419
420         for_each_bset(b, t) {
421                 if (t->aux_data_offset == U16_MAX)
422                         continue;
423
424                 BUG_ON(t != b->set &&
425                        t[-1].aux_data_offset == U16_MAX);
426
427                 BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
428                 BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
429                 BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
430         }
431 #endif
432 }
433
434 /* Memory allocation */
435
436 void bch2_btree_keys_free(struct btree *b)
437 {
438         vfree(b->aux_data);
439         b->aux_data = NULL;
440 }
441
442 #ifndef PAGE_KERNEL_EXEC
443 # define PAGE_KERNEL_EXEC PAGE_KERNEL
444 #endif
445
446 int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
447 {
448         b->page_order   = page_order;
449         b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
450                                     PAGE_KERNEL_EXEC);
451         if (!b->aux_data)
452                 return -ENOMEM;
453
454         return 0;
455 }
456
457 void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
458 {
459         unsigned i;
460
461         b->nsets                = 0;
462         memset(&b->nr, 0, sizeof(b->nr));
463 #ifdef CONFIG_BCACHEFS_DEBUG
464         b->expensive_debug_checks = expensive_debug_checks;
465 #endif
466         for (i = 0; i < MAX_BSETS; i++)
467                 b->set[i].data_offset = U16_MAX;
468
469         bch2_bset_set_no_aux_tree(b, b->set);
470 }
471
472 /* Binary tree stuff for auxiliary search trees */
473
474 /*
475  * Cacheline/offset <-> bkey pointer arithmetic:
476  *
477  * t->tree is a binary search tree in an array; each node corresponds to a key
478  * in one cacheline in t->set (BSET_CACHELINE bytes).
479  *
480  * This means we don't have to store the full index of the key that a node in
481  * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline,
482  * and then bkey_float->key_offset gives us the offset within that cacheline,
483  * in units of 8 bytes.
484  *
485  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
486  * make this work.
487  *
488  * To construct the bfloat for an arbitrary key we need to know what the key
489  * immediately preceding it is: we have to check if the two keys differ in the
490  * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
491  * of the previous key so we can walk backwards to it from t->tree[j]'s key.
492  */
493  */
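
/*
 * Illustrative sketch of the eytzinger1 layout (a complete tree with 7 nodes,
 * ignoring the 'extra' adjustment needed when the bottom level is incomplete):
 *
 *	tree index j:		1  2  3  4  5  6  7
 *	in-order position:	4  2  6  1  3  5  7
 *
 * i.e. the root (j == 1) points into the middle cacheline of the bset and each
 * level down halves the remaining range - an implicit binary search tree laid
 * out breadth first, which is what lets bset_search_tree() prefetch the
 * bkey_floats several levels below the current node.
 */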
494 static inline void *bset_cacheline(const struct btree *b,
495                                    const struct bset_tree *t,
496                                    unsigned cacheline)
497 {
498         return (void *) round_down((unsigned long) btree_bkey_first(b, t),
499                                    L1_CACHE_BYTES) +
500                 cacheline * BSET_CACHELINE;
501 }
502
503 static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
504                                              const struct bset_tree *t,
505                                              unsigned cacheline,
506                                              unsigned offset)
507 {
508         return bset_cacheline(b, t, cacheline) + offset * 8;
509 }
510
511 static unsigned bkey_to_cacheline(const struct btree *b,
512                                   const struct bset_tree *t,
513                                   const struct bkey_packed *k)
514 {
515         return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
516 }
517
518 static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
519                                           const struct bset_tree *t,
520                                           unsigned cacheline,
521                                           const struct bkey_packed *k)
522 {
523         return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
524 }
525
526 static unsigned bkey_to_cacheline_offset(const struct btree *b,
527                                          const struct bset_tree *t,
528                                          unsigned cacheline,
529                                          const struct bkey_packed *k)
530 {
531         size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
532
533         EBUG_ON(m > U8_MAX);
534         return m;
535 }
536
537 static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
538                                                const struct bset_tree *t,
539                                                unsigned j)
540 {
541         return cacheline_to_bkey(b, t,
542                         __eytzinger1_to_inorder(j, t->size, t->extra),
543                         bkey_float(b, t, j)->key_offset);
544 }
545
546 static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
547                                              const struct bset_tree *t,
548                                              unsigned j)
549 {
550         unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
551
552         return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
553 }
554
555 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
556                                        const struct bset_tree *t)
557 {
558         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
559
560         return __aux_tree_base(b, t);
561 }
562
563 /*
564  * For the write set - the one we're currently inserting keys into - we don't
565  * maintain a full search tree, we just keep a simple lookup table (the rw aux tree).
566  */
567 static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
568                                           struct bset_tree *t,
569                                           unsigned j)
570 {
571         return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
572 }
573
574 static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
575                             unsigned j, struct bkey_packed *k)
576 {
577         EBUG_ON(k >= btree_bkey_last(b, t));
578
579         rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
580                 .offset = __btree_node_key_to_offset(b, k),
581                 .k      = bkey_unpack_pos(b, k),
582         };
583 }
584
585 static void bch2_bset_verify_rw_aux_tree(struct btree *b,
586                                         struct bset_tree *t)
587 {
588         struct bkey_packed *k = btree_bkey_first(b, t);
589         unsigned j = 0;
590
591         if (!btree_keys_expensive_checks(b))
592                 return;
593
594         BUG_ON(bset_has_ro_aux_tree(t));
595
596         if (!bset_has_rw_aux_tree(t))
597                 return;
598
599         BUG_ON(t->size < 1);
600         BUG_ON(rw_aux_to_bkey(b, t, j) != k);
601
602         goto start;
603         while (1) {
604                 if (rw_aux_to_bkey(b, t, j) == k) {
605                         BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
606                                         bkey_unpack_pos(b, k)));
607 start:
608                         if (++j == t->size)
609                                 break;
610
611                         BUG_ON(rw_aux_tree(b, t)[j].offset <=
612                                rw_aux_tree(b, t)[j - 1].offset);
613                 }
614
615                 k = bkey_next(k);
616                 BUG_ON(k >= btree_bkey_last(b, t));
617         }
618 }
619
620 /* returns idx of first entry >= offset: */
621 static unsigned rw_aux_tree_bsearch(struct btree *b,
622                                     struct bset_tree *t,
623                                     unsigned offset)
624 {
625         unsigned l = 0, r = t->size;
626
627         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
628
629         while (l < r) {
630                 unsigned m = (l + r) >> 1;
631
632                 if (rw_aux_tree(b, t)[m].offset < offset)
633                         l = m + 1;
634                 else
635                         r = m;
636         }
637
638         EBUG_ON(l < t->size &&
639                 rw_aux_tree(b, t)[l].offset < offset);
640         EBUG_ON(l &&
641                 rw_aux_tree(b, t)[l - 1].offset >= offset);
642
643         EBUG_ON(l > r);
644         EBUG_ON(l > t->size);
645
646         return l;
647 }
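
/*
 * Example (offsets invented for illustration): with entry offsets
 * [0, 16, 40, 64], rw_aux_tree_bsearch() returns 2 for offset 40 and 3 for
 * offset 41 - i.e. a standard lower bound binary search on the offset field.
 */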
648
649 static inline unsigned bfloat_mantissa(const struct bkey_float *f,
650                                        unsigned idx)
651 {
652         return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
653 }
654
655 static inline void bfloat_mantissa_set(struct bkey_float *f,
656                                        unsigned idx, unsigned mantissa)
657 {
658         if (idx < BFLOAT_32BIT_NR)
659                 f->mantissa32 = mantissa;
660         else
661                 f->mantissa16 = mantissa;
662 }
663
664 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
665                                      const struct bkey_float *f,
666                                      unsigned idx)
667 {
668         u64 v;
669
670         EBUG_ON(!bkey_packed(k));
671
672         v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
673
674         /*
675          * In little endian, we're shifting off low bits (and then the bits we
676          * want are at the low end), in big endian we're shifting off high bits
677          * (and then the bits we want are at the high end, so we shift them
678          * back down):
679          */
680 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
681         v >>= f->exponent & 7;
682 #else
683         v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
684 #endif
685         return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
686 }
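
/*
 * Worked example (little endian, values invented for illustration): with
 * f->exponent == 20 we read the u64 at byte offset 20 >> 3 == 2 into the
 * packed key and shift right by 20 & 7 == 4, so the mantissa ends up holding
 * bits [20, 20 + 32) or [20, 20 + 16) of the packed key, depending on whether
 * this is one of the first BFLOAT_32BIT_NR nodes.
 */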
687
688 static void make_bfloat(struct btree *b, struct bset_tree *t,
689                         unsigned j,
690                         struct bkey_packed *min_key,
691                         struct bkey_packed *max_key)
692 {
693         struct bkey_float *f = bkey_float(b, t, j);
694         struct bkey_packed *m = tree_to_bkey(b, t, j);
695         struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
696         struct bkey_packed *l, *r;
697         unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
698         unsigned mantissa;
699         int shift, exponent, high_bit;
700
701         EBUG_ON(bkey_next(p) != m);
702
703         if (is_power_of_2(j)) {
704                 l = min_key;
705
706                 if (!l->u64s) {
707                         if (!bkey_pack_pos(l, b->data->min_key, b)) {
708                                 struct bkey_i tmp;
709
710                                 bkey_init(&tmp.k);
711                                 tmp.k.p = b->data->min_key;
712                                 bkey_copy(l, &tmp);
713                         }
714                 }
715         } else {
716                 l = tree_to_prev_bkey(b, t, j >> ffs(j));
717
718                 EBUG_ON(m < l);
719         }
720
721         if (is_power_of_2(j + 1)) {
722                 r = max_key;
723
724                 if (!r->u64s) {
725                         if (!bkey_pack_pos(r, t->max_key, b)) {
726                                 struct bkey_i tmp;
727
728                                 bkey_init(&tmp.k);
729                                 tmp.k.p = t->max_key;
730                                 bkey_copy(r, &tmp);
731                         }
732                 }
733         } else {
734                 r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
735
736                 EBUG_ON(m > r);
737         }
738
739         /*
740          * for failed bfloats, the lookup code falls back to comparing against
741          * the original key.
742          */
743
744         if (!bkey_packed(l) || !bkey_packed(r) ||
745             !bkey_packed(p) || !bkey_packed(m) ||
746             !b->nr_key_bits) {
747                 f->exponent = BFLOAT_FAILED_UNPACKED;
748                 return;
749         }
750
751         /*
752          * The greatest differing bit of l and r is the first bit we must
753          * include in the bfloat mantissa we're creating in order to do
754          * comparisons - that bit always becomes the high bit of
755          * bfloat->mantissa, and thus the exponent we're calculating here is
756          * the position of what will become the low bit in bfloat->mantissa:
757          *
758          * Note that this may be negative - we may be running off the low end
759          * of the key: we handle this later:
760          */
761         high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
762                        min_t(unsigned, bits, b->nr_key_bits) - 1);
763         exponent = high_bit - (bits - 1);
764
765         /*
766          * Then we calculate the actual shift value, from the start of the key
767          * (k->_data), to get the key bits starting at exponent:
768          */
769 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
770         shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
771
772         EBUG_ON(shift + bits > b->format.key_u64s * 64);
773 #else
774         shift = high_bit_offset +
775                 b->nr_key_bits -
776                 exponent -
777                 bits;
778
779         EBUG_ON(shift < KEY_PACKED_BITS_START);
780 #endif
781         EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
782
783         f->exponent = shift;
784         mantissa = bkey_mantissa(m, f, j);
785
786         /*
787          * If we've got garbage bits, set them to all 1s - it's legal for the
788          * bfloat to compare larger than the original key, but not smaller:
789          */
790         if (exponent < 0)
791                 mantissa |= ~(~0U << -exponent);
792
793         bfloat_mantissa_set(f, j, mantissa);
794
795         /*
796          * The bfloat must be able to tell its key apart from the previous key -
797          * if its key and the previous key don't differ in the required bits,
798          * flag as failed - unless the keys are actually equal, in which case
799          * we aren't required to return a specific one:
800          */
801         if (exponent > 0 &&
802             bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
803             bkey_cmp_packed(b, p, m)) {
804                 f->exponent = BFLOAT_FAILED_PREV;
805                 return;
806         }
807
808         /*
809          * f->mantissa must compare >= the original key - for transitivity with
810          * the comparison in bset_search_tree. If we're dropping set bits,
811          * increment it:
812          */
813         if (exponent > (int) bch2_bkey_ffs(b, m)) {
814                 if (j < BFLOAT_32BIT_NR
815                     ? f->mantissa32 == U32_MAX
816                     : f->mantissa16 == U16_MAX)
817                         f->exponent = BFLOAT_FAILED_OVERFLOW;
818
819                 if (j < BFLOAT_32BIT_NR)
820                         f->mantissa32++;
821                 else
822                         f->mantissa16++;
823         }
824 }
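
/*
 * Worked example (numbers invented for illustration): with a 16 bit mantissa
 * (j >= BFLOAT_32BIT_NR), b->nr_key_bits == 160 and the greatest differing bit
 * of l and r being bit 72, we get high_bit == 72 and exponent == 72 - 15 == 57:
 * the mantissa caches key bits [57, 73) with the differing bit as its top bit,
 * which is enough to order a search key against this node on the fast path
 * without unpacking the full key.
 */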
825
826 /* bytes remaining - only valid for last bset: */
827 static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
828 {
829         bset_aux_tree_verify(b);
830
831         return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
832 }
833
834 static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
835 {
836         unsigned bytes = __bset_tree_capacity(b, t);
837
838         if (bytes < 7 * BFLOAT_32BIT_NR)
839                 return bytes / 7;
840
841         bytes -= 7 * BFLOAT_32BIT_NR;
842
843         return BFLOAT_32BIT_NR + bytes / 5;
844 }
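
/*
 * The 7 and 5 above are the per node cost of the ro aux tree: a 32 bit
 * bkey_float is 6 bytes plus one byte in the prev table, a 16 bit bkey_float
 * is 4 bytes plus one byte in the prev table.
 */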
845
846 static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
847 {
848         return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
849 }
850
851 static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
852 {
853         struct bkey_packed *k;
854
855         t->size = 1;
856         t->extra = BSET_RW_AUX_TREE_VAL;
857         rw_aux_tree(b, t)[0].offset =
858                 __btree_node_key_to_offset(b, btree_bkey_first(b, t));
859
860         for (k = btree_bkey_first(b, t);
861              k != btree_bkey_last(b, t);
862              k = bkey_next(k)) {
863                 if (t->size == bset_rw_tree_capacity(b, t))
864                         break;
865
866                 if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
867                     L1_CACHE_BYTES)
868                         rw_aux_tree_set(b, t, t->size++, k);
869         }
870 }
871
872 static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
873 {
874         struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
875         struct bkey_packed min_key, max_key;
876         unsigned j, cacheline = 1;
877
878         /* signal to make_bfloat() that they're uninitialized: */
879         min_key.u64s = max_key.u64s = 0;
880
881         t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
882                       bset_ro_tree_capacity(b, t));
883 retry:
884         if (t->size < 2) {
885                 t->size = 0;
886                 t->extra = BSET_NO_AUX_TREE_VAL;
887                 return;
888         }
889
890         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
891
892         /* First we figure out where the first key in each cacheline is */
893         eytzinger1_for_each(j, t->size) {
894                 while (bkey_to_cacheline(b, t, k) < cacheline)
895                         prev = k, k = bkey_next(k);
896
897                 if (k >= btree_bkey_last(b, t)) {
898                         /* XXX: this path sucks */
899                         t->size--;
900                         goto retry;
901                 }
902
903                 ro_aux_tree_prev(b, t)[j] = prev->u64s;
904                 bkey_float(b, t, j)->key_offset =
905                         bkey_to_cacheline_offset(b, t, cacheline++, k);
906
907                 EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
908                 EBUG_ON(tree_to_bkey(b, t, j) != k);
909         }
910
911         while (bkey_next(k) != btree_bkey_last(b, t))
912                 k = bkey_next(k);
913
914         t->max_key = bkey_unpack_pos(b, k);
915
916         /* Then we build the tree */
917         eytzinger1_for_each(j, t->size)
918                 make_bfloat(b, t, j, &min_key, &max_key);
919 }
920
921 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
922 {
923         struct bset_tree *i;
924
925         for (i = b->set; i != t; i++)
926                 BUG_ON(bset_has_rw_aux_tree(i));
927
928         bch2_bset_set_no_aux_tree(b, t);
929
930         /* round up to next cacheline: */
931         t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
932                                       SMP_CACHE_BYTES / sizeof(u64));
933
934         bset_aux_tree_verify(b);
935 }
936
937 void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
938                              bool writeable)
939 {
940         if (writeable
941             ? bset_has_rw_aux_tree(t)
942             : bset_has_ro_aux_tree(t))
943                 return;
944
945         bset_alloc_tree(b, t);
946
947         if (!__bset_tree_capacity(b, t))
948                 return;
949
950         if (writeable)
951                 __build_rw_aux_tree(b, t);
952         else
953                 __build_ro_aux_tree(b, t);
954
955         bset_aux_tree_verify(b);
956 }
957
958 void bch2_bset_init_first(struct btree *b, struct bset *i)
959 {
960         struct bset_tree *t;
961
962         BUG_ON(b->nsets);
963
964         memset(i, 0, sizeof(*i));
965         get_random_bytes(&i->seq, sizeof(i->seq));
966         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
967
968         t = &b->set[b->nsets++];
969         set_btree_bset(b, t, i);
970 }
971
972 void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
973                          struct btree_node_entry *bne)
974 {
975         struct bset *i = &bne->keys;
976         struct bset_tree *t;
977
978         BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
979         BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
980         BUG_ON(b->nsets >= MAX_BSETS);
981
982         memset(i, 0, sizeof(*i));
983         i->seq = btree_bset_first(b)->seq;
984         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
985
986         t = &b->set[b->nsets++];
987         set_btree_bset(b, t, i);
988 }
989
990 /*
991  * find _some_ key in the same bset as @k that precedes @k - not necessarily the
992  * immediate predecessor:
993  */
994 static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
995                                        struct bkey_packed *k)
996 {
997         struct bkey_packed *p;
998         unsigned offset;
999         int j;
1000
1001         EBUG_ON(k < btree_bkey_first(b, t) ||
1002                 k > btree_bkey_last(b, t));
1003
1004         if (k == btree_bkey_first(b, t))
1005                 return NULL;
1006
1007         switch (bset_aux_tree_type(t)) {
1008         case BSET_NO_AUX_TREE:
1009                 p = btree_bkey_first(b, t);
1010                 break;
1011         case BSET_RO_AUX_TREE:
1012                 j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
1013
1014                 do {
1015                         p = j ? tree_to_bkey(b, t,
1016                                         __inorder_to_eytzinger1(j--,
1017                                                         t->size, t->extra))
1018                               : btree_bkey_first(b, t);
1019                 } while (p >= k);
1020                 break;
1021         case BSET_RW_AUX_TREE:
1022                 offset = __btree_node_key_to_offset(b, k);
1023                 j = rw_aux_tree_bsearch(b, t, offset);
1024                 p = j ? rw_aux_to_bkey(b, t, j - 1)
1025                       : btree_bkey_first(b, t);
1026                 break;
1027         }
1028
1029         return p;
1030 }
1031
1032 struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
1033                                           struct bset_tree *t,
1034                                           struct bkey_packed *k,
1035                                           unsigned min_key_type)
1036 {
1037         struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
1038
1039         while ((p = __bkey_prev(b, t, k)) && !ret) {
1040                 for (i = p; i != k; i = bkey_next(i))
1041                         if (i->type >= min_key_type)
1042                                 ret = i;
1043
1044                 k = p;
1045         }
1046
1047         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1048                 BUG_ON(ret >= orig_k);
1049
1050                 for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
1051                      i != orig_k;
1052                      i = bkey_next(i))
1053                         BUG_ON(i->type >= min_key_type);
1054         }
1055
1056         return ret;
1057 }
1058
1059 /* Insert */
1060
1061 static void rw_aux_tree_fix_invalidated_key(struct btree *b,
1062                                             struct bset_tree *t,
1063                                             struct bkey_packed *k)
1064 {
1065         unsigned offset = __btree_node_key_to_offset(b, k);
1066         unsigned j = rw_aux_tree_bsearch(b, t, offset);
1067
1068         if (j < t->size &&
1069             rw_aux_tree(b, t)[j].offset == offset)
1070                 rw_aux_tree_set(b, t, j, k);
1071
1072         bch2_bset_verify_rw_aux_tree(b, t);
1073 }
1074
1075 static void ro_aux_tree_fix_invalidated_key(struct btree *b,
1076                                             struct bset_tree *t,
1077                                             struct bkey_packed *k)
1078 {
1079         struct bkey_packed min_key, max_key;
1080         unsigned inorder, j;
1081
1082         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
1083
1084         /* signal to make_bfloat() that they're uninitialized: */
1085         min_key.u64s = max_key.u64s = 0;
1086
1087         if (bkey_next(k) == btree_bkey_last(b, t)) {
1088                 t->max_key = bkey_unpack_pos(b, k);
1089
1090                 for (j = 1; j < t->size; j = j * 2 + 1)
1091                         make_bfloat(b, t, j, &min_key, &max_key);
1092         }
1093
1094         inorder = bkey_to_cacheline(b, t, k);
1095
1096         if (inorder &&
1097             inorder < t->size) {
1098                 j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1099
1100                 if (k == tree_to_bkey(b, t, j)) {
1101                         /* Fix the node this key corresponds to */
1102                         make_bfloat(b, t, j, &min_key, &max_key);
1103
1104                         /* Children for which this key is the right boundary */
1105                         for (j = eytzinger1_left_child(j);
1106                              j < t->size;
1107                              j = eytzinger1_right_child(j))
1108                                 make_bfloat(b, t, j, &min_key, &max_key);
1109                 }
1110         }
1111
1112         if (inorder + 1 < t->size) {
1113                 j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
1114
1115                 if (k == tree_to_prev_bkey(b, t, j)) {
1116                         make_bfloat(b, t, j, &min_key, &max_key);
1117
1118                         /* Children for which this key is the left boundary */
1119                         for (j = eytzinger1_right_child(j);
1120                              j < t->size;
1121                              j = eytzinger1_left_child(j))
1122                                 make_bfloat(b, t, j, &min_key, &max_key);
1123                 }
1124         }
1125 }
1126
1127 /**
1128  * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
1129  * modified, fix any auxiliary search tree by remaking all the nodes in the
1130  * auxiliary search tree that @k corresponds to
1131  */
1132 void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
1133                                    struct bkey_packed *k)
1134 {
1135         switch (bset_aux_tree_type(t)) {
1136         case BSET_NO_AUX_TREE:
1137                 break;
1138         case BSET_RO_AUX_TREE:
1139                 ro_aux_tree_fix_invalidated_key(b, t, k);
1140                 break;
1141         case BSET_RW_AUX_TREE:
1142                 rw_aux_tree_fix_invalidated_key(b, t, k);
1143                 break;
1144         }
1145 }
1146
1147 static void bch2_bset_fix_lookup_table(struct btree *b,
1148                                        struct bset_tree *t,
1149                                        struct bkey_packed *_where,
1150                                        unsigned clobber_u64s,
1151                                        unsigned new_u64s)
1152 {
1153         int shift = new_u64s - clobber_u64s;
1154         unsigned l, j, where = __btree_node_key_to_offset(b, _where);
1155
1156         EBUG_ON(bset_has_ro_aux_tree(t));
1157
1158         if (!bset_has_rw_aux_tree(t))
1159                 return;
1160
1161         l = rw_aux_tree_bsearch(b, t, where);
1162
1163         /* l is the first entry >= @where */
1164
1165         EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
1166         EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
1167
1168         if (!l) /* never delete first entry */
1169                 l++;
1170         else if (l < t->size &&
1171                  where < t->end_offset &&
1172                  rw_aux_tree(b, t)[l].offset == where)
1173                 rw_aux_tree_set(b, t, l++, _where);
1174
1175         /* l now > where */
1176
1177         for (j = l;
1178              j < t->size &&
1179              rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
1180              j++)
1181                 ;
1182
1183         if (j < t->size &&
1184             rw_aux_tree(b, t)[j].offset + shift ==
1185             rw_aux_tree(b, t)[l - 1].offset)
1186                 j++;
1187
1188         memmove(&rw_aux_tree(b, t)[l],
1189                 &rw_aux_tree(b, t)[j],
1190                 (void *) &rw_aux_tree(b, t)[t->size] -
1191                 (void *) &rw_aux_tree(b, t)[j]);
1192         t->size -= j - l;
1193
1194         for (j = l; j < t->size; j++)
1195                rw_aux_tree(b, t)[j].offset += shift;
1196
1197         EBUG_ON(l < t->size &&
1198                 rw_aux_tree(b, t)[l].offset ==
1199                 rw_aux_tree(b, t)[l - 1].offset);
1200
1201         if (t->size < bset_rw_tree_capacity(b, t) &&
1202             (l < t->size
1203              ? rw_aux_tree(b, t)[l].offset
1204              : t->end_offset) -
1205             rw_aux_tree(b, t)[l - 1].offset >
1206             L1_CACHE_BYTES / sizeof(u64)) {
1207                 struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
1208                 struct bkey_packed *end = l < t->size
1209                         ? rw_aux_to_bkey(b, t, l)
1210                         : btree_bkey_last(b, t);
1211                 struct bkey_packed *k = start;
1212
1213                 while (1) {
1214                         k = bkey_next(k);
1215                         if (k == end)
1216                                 break;
1217
1218                         if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
1219                                 memmove(&rw_aux_tree(b, t)[l + 1],
1220                                         &rw_aux_tree(b, t)[l],
1221                                         (void *) &rw_aux_tree(b, t)[t->size] -
1222                                         (void *) &rw_aux_tree(b, t)[l]);
1223                                 t->size++;
1224                                 rw_aux_tree_set(b, t, l, k);
1225                                 break;
1226                         }
1227                 }
1228         }
1229
1230         bch2_bset_verify_rw_aux_tree(b, t);
1231         bset_aux_tree_verify(b);
1232 }
1233
1234 void bch2_bset_insert(struct btree *b,
1235                       struct btree_node_iter *iter,
1236                       struct bkey_packed *where,
1237                       struct bkey_i *insert,
1238                       unsigned clobber_u64s)
1239 {
1240         struct bkey_format *f = &b->format;
1241         struct bset_tree *t = bset_tree_last(b);
1242         struct bkey_packed packed, *src = bkey_to_packed(insert);
1243
1244         bch2_bset_verify_rw_aux_tree(b, t);
1245
1246         if (bch2_bkey_pack_key(&packed, &insert->k, f))
1247                 src = &packed;
1248
1249         if (!bkey_whiteout(&insert->k))
1250                 btree_keys_account_key_add(&b->nr, t - b->set, src);
1251
1252         if (src->u64s != clobber_u64s) {
1253                 u64 *src_p = where->_data + clobber_u64s;
1254                 u64 *dst_p = where->_data + src->u64s;
1255
1256                 EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
1257                         (int) clobber_u64s - src->u64s);
1258
1259                 memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1260                 le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
1261                 set_btree_bset_end(b, t);
1262         }
1263
1264         memcpy_u64s(where, src,
1265                     bkeyp_key_u64s(f, src));
1266         memcpy_u64s(bkeyp_val(f, where), &insert->v,
1267                     bkeyp_val_u64s(f, src));
1268
1269         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
1270
1271         bch2_verify_key_order(b, iter, where);
1272         bch2_verify_btree_nr_keys(b);
1273 }
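
/*
 * Note on @clobber_u64s (describing the code above, nothing extra): the insert
 * overwrites @clobber_u64s worth of existing data at @where - 0 opens a gap the
 * size of the (possibly repacked) new key, while passing the u64 count of an
 * existing key replaces that key in place, shifting the rest of the bset only
 * by the size difference.
 */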
1274
1275 void bch2_bset_delete(struct btree *b,
1276                       struct bkey_packed *where,
1277                       unsigned clobber_u64s)
1278 {
1279         struct bset_tree *t = bset_tree_last(b);
1280         u64 *src_p = where->_data + clobber_u64s;
1281         u64 *dst_p = where->_data;
1282
1283         bch2_bset_verify_rw_aux_tree(b, t);
1284
1285         EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
1286
1287         memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1288         le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
1289         set_btree_bset_end(b, t);
1290
1291         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
1292 }
1293
1294 /* Lookup */
1295
1296 __flatten
1297 static struct bkey_packed *bset_search_write_set(const struct btree *b,
1298                                 struct bset_tree *t,
1299                                 struct bpos search,
1300                                 const struct bkey_packed *packed_search)
1301 {
1302         unsigned l = 0, r = t->size;
1303
1304         while (l + 1 != r) {
1305                 unsigned m = (l + r) >> 1;
1306
1307                 if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
1308                         l = m;
1309                 else
1310                         r = m;
1311         }
1312
1313         return rw_aux_to_bkey(b, t, l);
1314 }
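
/*
 * The binary search above returns the last lookup table entry whose cached
 * position compares < @search (or entry 0), so the linear search that
 * bch2_bset_search() does afterwards starts at most roughly one cacheline's
 * worth of keys before the target.
 */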
1315
1316 noinline
1317 static int bset_search_tree_slowpath(const struct btree *b,
1318                                 struct bset_tree *t, struct bpos *search,
1319                                 const struct bkey_packed *packed_search,
1320                                 unsigned n)
1321 {
1322         return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
1323                                  packed_search, search) < 0;
1324 }
1325
1326 __flatten
1327 static struct bkey_packed *bset_search_tree(const struct btree *b,
1328                                 struct bset_tree *t,
1329                                 struct bpos search,
1330                                 const struct bkey_packed *packed_search)
1331 {
1332         struct ro_aux_tree *base = ro_aux_tree_base(b, t);
1333         struct bkey_float *f = bkey_float_get(base, 1);
1334         void *p;
1335         unsigned inorder, n = 1;
1336
1337         while (1) {
1338                 if (likely(n << 4 < t->size)) {
1339                         p = bkey_float_get(base, n << 4);
1340                         prefetch(p);
1341                 } else if (n << 3 < t->size) {
1342                         inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
1343                         p = bset_cacheline(b, t, inorder);
1344 #ifdef CONFIG_X86_64
1345                         asm(".intel_syntax noprefix;"
1346                             "prefetcht0 [%0 - 127 + 64 * 0];"
1347                             "prefetcht0 [%0 - 127 + 64 * 1];"
1348                             "prefetcht0 [%0 - 127 + 64 * 2];"
1349                             "prefetcht0 [%0 - 127 + 64 * 3];"
1350                             ".att_syntax prefix;"
1351                             :
1352                             : "r" (p + 127));
1353 #else
1354                         prefetch(p + L1_CACHE_BYTES * 0);
1355                         prefetch(p + L1_CACHE_BYTES * 1);
1356                         prefetch(p + L1_CACHE_BYTES * 2);
1357                         prefetch(p + L1_CACHE_BYTES * 3);
1358 #endif
1359                 } else if (n >= t->size)
1360                         break;
1361
1362                 f = bkey_float_get(base, n);
1363
1364                 if (packed_search &&
1365                     likely(f->exponent < BFLOAT_FAILED))
1366                         n = n * 2 + (bfloat_mantissa(f, n) <
1367                                      bkey_mantissa(packed_search, f, n));
1368                 else
1369                         n = n * 2 + bset_search_tree_slowpath(b, t,
1370                                                 &search, packed_search, n);
1371         } while (n < t->size);
1372
1373         inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
1374
1375         /*
1376          * n would have been the node we recursed to - the low bit tells us if
1377          * we recursed left or recursed right.
1378          */
1379         if (n & 1) {
1380                 return cacheline_to_bkey(b, t, inorder, f->key_offset);
1381         } else {
1382                 if (--inorder) {
1383                         n = eytzinger1_prev(n >> 1, t->size);
1384                         f = bkey_float_get(base, n);
1385                         return cacheline_to_bkey(b, t, inorder, f->key_offset);
1386                 } else
1387                         return btree_bkey_first(b, t);
1388         }
1389 }
1390
1391 /*
1392  * Returns the first key greater than or equal to @search
1393  */
1394 __always_inline __flatten
1395 static struct bkey_packed *bch2_bset_search(struct btree *b,
1396                                 struct bset_tree *t,
1397                                 struct bpos search,
1398                                 struct bkey_packed *packed_search,
1399                                 const struct bkey_packed *lossy_packed_search,
1400                                 bool strictly_greater)
1401 {
1402         struct bkey_packed *m;
1403
1404         /*
1405          * First, we search for a cacheline, and then we do a linear search
1406          * within that cacheline.
1407          *
1408          * To search for the cacheline, there are three different possibilities:
1409          *  * The set is too small to have a search tree, so we just do a linear
1410          *    search over the whole set.
1411          *  * The set is the one we're currently inserting into; keeping a full
1412          *    auxiliary search tree up to date would be too expensive, so we
1413          *    use a much simpler lookup table to do a binary search -
1414          *    bset_search_write_set().
1415          *  * Or we use the auxiliary search tree we constructed earlier -
1416          *    bset_search_tree()
1417          */
1418
1419         switch (bset_aux_tree_type(t)) {
1420         case BSET_NO_AUX_TREE:
1421                 m = btree_bkey_first(b, t);
1422                 break;
1423         case BSET_RW_AUX_TREE:
1424                 m = bset_search_write_set(b, t, search, lossy_packed_search);
1425                 break;
1426         case BSET_RO_AUX_TREE:
1427                 /*
1428                  * Each node in the auxiliary search tree covers a certain range
1429                  * of bits, and keys above and below the set it covers might
1430                  * differ outside those bits - so we have to special case the
1431                  * start and end - handle that here:
1432                  */
1433
1434                 if (bkey_cmp(search, t->max_key) > 0)
1435                         return btree_bkey_last(b, t);
1436
1437                 m = bset_search_tree(b, t, search, lossy_packed_search);
1438                 break;
1439         }
1440
1441         if (lossy_packed_search)
1442                 while (m != btree_bkey_last(b, t) &&
1443                        !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
1444                                                     m, strictly_greater))
1445                         m = bkey_next(m);
1446
1447         if (!packed_search)
1448                 while (m != btree_bkey_last(b, t) &&
1449                        !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
1450                         m = bkey_next(m);
1451
1452         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1453                 struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
1454
1455                 BUG_ON(prev &&
1456                        btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
1457                                                    prev, strictly_greater));
1458         }
1459
1460         return m;
1461 }
1462
1463 /* Btree node iterator */
1464
1465 void bch2_btree_node_iter_push(struct btree_node_iter *iter,
1466                                struct btree *b,
1467                                const struct bkey_packed *k,
1468                                const struct bkey_packed *end)
1469 {
1470         __bch2_btree_node_iter_push(iter, b, k, end);
1471         bch2_btree_node_iter_sort(iter, b);
1472 }
1473
1474 noinline __flatten __attribute__((cold))
1475 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
1476                               struct btree *b, struct bpos search,
1477                               bool strictly_greater, bool is_extents)
1478 {
1479         struct bset_tree *t;
1480
1481         trace_bkey_pack_pos_fail(search);
1482
1483         for_each_bset(b, t)
1484                 __bch2_btree_node_iter_push(iter, b,
1485                         bch2_bset_search(b, t, search, NULL, NULL,
1486                                         strictly_greater),
1487                         btree_bkey_last(b, t));
1488
1489         bch2_btree_node_iter_sort(iter, b);
1490 }
1491
1492 /**
1493  * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
1494  * given position
1495  *
1496  * Main entry point to the lookup code for individual btree nodes:
1497  *
1498  * NOTE:
1499  *
1500  * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
1501  * keys. This doesn't matter for most code, but it does matter for lookups.
1502  *
1503  * Consider a sequence of adjacent keys containing a run of equal keys:
1504  *      i j k k k k l m
1505  *
1506  * If you search for k, the lookup code isn't guaranteed to return you any
1507  * specific k. The lookup code is conceptually doing a binary search and
1508  * iterating backwards is very expensive so if the pivot happens to land at the
1509  * last k that's what you'll get.
1510  *
1511  * This works out ok, but it's something to be aware of:
1512  *
1513  *  - For non extents, we guarantee that the live key comes last - see
1514  *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
1515  *    see will only be deleted keys you don't care about.
1516  *
1517  *  - For extents, deleted keys sort last (see the comment at the top of this
1518  *    file). But when you're searching for extents, you actually want the first
1519  *    key strictly greater than your search key - an extent that compares equal
1520  *    to the search key is going to have 0 sectors after the search key.
1521  *
1522  *    But this does mean that we can't just search for
1523  *    bkey_successor(start_of_range) to get the first extent that overlaps with
1524  *    the range we want - if we're unlucky and there's an extent that ends
1525  *    exactly where we searched, then there could be a deleted key at the same
1526  *    position and we'd get that when we search instead of the preceding extent
1527  *    we needed.
1528  *
1529  *    So we've got to search for start_of_range, then after the lookup iterate
1530  *    past any extents that compare equal to the position we searched for.
1531  */
1532 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
1533                                struct btree *b, struct bpos search,
1534                                bool strictly_greater, bool is_extents)
1535 {
1536         struct bset_tree *t;
1537         struct bkey_packed p, *packed_search = NULL;
1538
1539         EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
1540         bset_aux_tree_verify(b);
1541
1542         __bch2_btree_node_iter_init(iter, is_extents);
1543
1544         switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
1545         case BKEY_PACK_POS_EXACT:
1546                 packed_search = &p;
1547                 break;
1548         case BKEY_PACK_POS_SMALLER:
1549                 packed_search = NULL;
1550                 break;
1551         case BKEY_PACK_POS_FAIL:
1552                 btree_node_iter_init_pack_failed(iter, b, search,
1553                                         strictly_greater, is_extents);
1554                 return;
1555         }
1556
1557         for_each_bset(b, t)
1558                 __bch2_btree_node_iter_push(iter, b,
1559                                            bch2_bset_search(b, t, search,
1560                                                            packed_search, &p,
1561                                                            strictly_greater),
1562                                            btree_bkey_last(b, t));
1563
1564         bch2_btree_node_iter_sort(iter, b);
1565 }
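/*
 * A standalone sketch (not bcachefs code) of the lookup rule described in the
 * comment above bch2_btree_node_iter_init(): when duplicates are present,
 * search for the position itself and then step past everything that compares
 * equal to it, rather than searching for the successor of the position.
 */
#include <stddef.h>

/* Return the index of the first element strictly greater than search. */
size_t sketch_first_greater(const int *keys, size_t nr, int search)
{
	size_t i = 0;

	/* conceptually a binary search; linear here to keep the sketch short */
	while (i < nr && keys[i] < search)
		i++;

	/* step past the run of duplicates that compare equal to search */
	while (i < nr && keys[i] == search)
		i++;

	return i;
}

/*
 * E.g. with keys = { 1, 3, 3, 3, 5 } and search = 3, this returns 4 (the
 * index of 5), no matter which of the equal 3s a binary search would have
 * landed on.
 */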
1566
1567 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1568                                           struct btree *b,
1569                                           bool is_extents)
1570 {
1571         struct bset_tree *t;
1572
1573         __bch2_btree_node_iter_init(iter, is_extents);
1574
1575         for_each_bset(b, t)
1576                 __bch2_btree_node_iter_push(iter, b,
1577                                            btree_bkey_first(b, t),
1578                                            btree_bkey_last(b, t));
1579         bch2_btree_node_iter_sort(iter, b);
1580 }
1581
1582 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1583                                                   struct btree *b,
1584                                                   struct bset_tree *t)
1585 {
1586         struct btree_node_iter_set *set;
1587
1588         btree_node_iter_for_each(iter, set)
1589                 if (set->end == t->end_offset)
1590                         return __btree_node_offset_to_key(b, set->k);
1591
1592         return btree_bkey_last(b, t);
1593 }
1594
1595 static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
1596                                             struct btree *b,
1597                                             unsigned first)
1598 {
1599         bool ret;
1600
1601         if ((ret = (btree_node_iter_cmp(iter, b,
1602                                         iter->data[first],
1603                                         iter->data[first + 1]) > 0)))
1604                 swap(iter->data[first], iter->data[first + 1]);
1605         return ret;
1606 }
1607
1608 void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
1609                                struct btree *b)
1610 {
1611         /* unrolled bubble sort over the (at most three) iterator sets: */
1612
1613         if (!__btree_node_iter_set_end(iter, 2)) {
1614                 btree_node_iter_sort_two(iter, b, 0);
1615                 btree_node_iter_sort_two(iter, b, 1);
1616         }
1617
1618         if (!__btree_node_iter_set_end(iter, 1))
1619                 btree_node_iter_sort_two(iter, b, 0);
1620 }
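/*
 * A standalone sketch (not bcachefs code) of the unrolled bubble sort above:
 * with at most three entries, adjacent compare-and-swaps on (0,1), (1,2) and
 * then (0,1) again are enough, which is why bch2_btree_node_iter_sort()
 * never needs a loop.
 */
void sketch_sort3(int v[3])
{
	int tmp;

	/* first pass: bubble the largest value into slot 2 */
	if (v[0] > v[1]) { tmp = v[0]; v[0] = v[1]; v[1] = tmp; }
	if (v[1] > v[2]) { tmp = v[1]; v[1] = v[2]; v[2] = tmp; }

	/* second pass: order the remaining two values */
	if (v[0] > v[1]) { tmp = v[0]; v[0] = v[1]; v[1] = tmp; }
}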
1621
1622 void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
1623                                    struct btree_node_iter_set *set)
1624 {
1625         struct btree_node_iter_set *last =
1626                 iter->data + ARRAY_SIZE(iter->data) - 1;
1627
1628         memmove(&set[0], &set[1], (void *) last - (void *) set);
1629         *last = (struct btree_node_iter_set) { 0, 0 };
1630 }
1631
1632 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1633                                                   struct btree *b)
1634 {
1635         iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
1636
1637         EBUG_ON(iter->data->k > iter->data->end);
1638
1639         if (unlikely(__btree_node_iter_set_end(iter, 0))) {
1640                 bch2_btree_node_iter_set_drop(iter, iter->data);
1641                 return;
1642         }
1643
1644         if (__btree_node_iter_set_end(iter, 1))
1645                 return;
1646
1647         if (!btree_node_iter_sort_two(iter, b, 0))
1648                 return;
1649
1650         if (__btree_node_iter_set_end(iter, 2))
1651                 return;
1652
1653         btree_node_iter_sort_two(iter, b, 1);
1654 }
1655
1656 /**
1657  * bch2_btree_node_iter_advance - advance @iter by one key
1658  *
1659  * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a
1660  * bset might momentarily have out of order extents.
1661  */
1662 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1663                                   struct btree *b)
1664 {
1665 #ifdef CONFIG_BCACHEFS_DEBUG
1666         struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
1667
1668         __bch2_btree_node_iter_advance(iter, b);
1669         bch2_btree_node_iter_next_check(iter, b, k);
1670 #else
1671         __bch2_btree_node_iter_advance(iter, b);
1672 #endif
1673 }
1674
1675 static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
1676 {
1677         unsigned n = ARRAY_SIZE(iter->data);
1678
1679         while (n && __btree_node_iter_set_end(iter, n - 1))
1680                 --n;
1681
1682         return n;
1683 }
1684
1685 /*
1686  * Expensive: walks every bset in the node to find the previous key
1687  */
1688 struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
1689                                                      struct btree *b,
1690                                                      unsigned min_key_type)
1691 {
1692         struct bkey_packed *k, *prev = NULL;
1693         struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
1694         struct btree_node_iter_set *set;
1695         struct bset_tree *t;
1696         unsigned end;
1697
1698         bch2_btree_node_iter_verify(iter, b);
1699
1700         for_each_bset(b, t) {
1701                 k = bch2_bkey_prev_filter(b, t,
1702                         bch2_btree_node_iter_bset_pos(iter, b, t),
1703                         min_key_type);
1704                 if (k &&
1705                     (!prev || __btree_node_iter_cmp(iter->is_extents, b,
1706                                                     k, prev) > 0)) {
1707                         prev = k;
1708                         end = t->end_offset;
1709                 }
1710         }
1711
1712         if (!prev)
1713                 goto out;
1714
1715         /*
1716          * We're manually memmoving instead of just calling sort() to ensure the
1717          * prev we picked ends up in slot 0 - sort won't necessarily put it
1718          * there because of duplicate deleted keys:
1719          */
1720         btree_node_iter_for_each(iter, set)
1721                 if (set->end == end)
1722                         goto found;
1723
1724         BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
1725 found:
1726         BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
1727
1728         memmove(&iter->data[1],
1729                 &iter->data[0],
1730                 (void *) set - (void *) &iter->data[0]);
1731
1732         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1733         iter->data[0].end = end;
1734 out:
1735         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1736                 struct btree_node_iter iter2 = *iter;
1737
1738                 if (prev)
1739                         bch2_btree_node_iter_advance(&iter2, b);
1740
1741                 while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
1742                         BUG_ON(k->type >= min_key_type);
1743                         bch2_btree_node_iter_advance(&iter2, b);
1744                 }
1745         }
1746
1747         return prev;
1748 }
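/*
 * A standalone sketch (not bcachefs code) of the move-to-front step in
 * bch2_btree_node_iter_prev_filter() above: slide the entries in front of the
 * chosen slot up by one and put the chosen entry in slot 0.  A comparison
 * sort couldn't guarantee slot 0 here, because other entries may compare
 * equal to the one we picked.
 */
#include <stddef.h>
#include <string.h>

struct sketch_set { unsigned k, end; };

void sketch_move_to_front(struct sketch_set *data, size_t chosen)
{
	struct sketch_set tmp = data[chosen];

	/* data[0..chosen-1] slides up into data[1..chosen] */
	memmove(&data[1], &data[0], chosen * sizeof(*data));
	data[0] = tmp;
}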
1749
1750 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1751                                                  struct btree *b,
1752                                                  struct bkey *u)
1753 {
1754         struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
1755
1756         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1757 }
1758
1759 /* Mergesort */
1760
1761 void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1762 {
1763         struct bset_tree *t;
1764
1765         for_each_bset(b, t) {
1766                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1767                 size_t j;
1768
1769                 stats->sets[type].nr++;
1770                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1771                         sizeof(u64);
1772
1773                 if (bset_has_ro_aux_tree(t)) {
1774                         stats->floats += t->size - 1;
1775
1776                         for (j = 1; j < t->size; j++)
1777                                 switch (bkey_float(b, t, j)->exponent) {
1778                                 case BFLOAT_FAILED_UNPACKED:
1779                                         stats->failed_unpacked++;
1780                                         break;
1781                                 case BFLOAT_FAILED_PREV:
1782                                         stats->failed_prev++;
1783                                         break;
1784                                 case BFLOAT_FAILED_OVERFLOW:
1785                                         stats->failed_overflow++;
1786                                         break;
1787                                 }
1788                 }
1789         }
1790 }
1791
1792 int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
1793                            char *buf, size_t size)
1794 {
1795         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1796         struct bkey_packed *l, *r, *p;
1797         struct bkey uk, up;
1798         char buf1[200], buf2[200];
1799         unsigned j;
1800
1801         if (!size)
1802                 return 0;
1803
1804         if (!bset_has_ro_aux_tree(t))
1805                 goto out;
1806
1807         j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra);
1808         if (j &&
1809             j < t->size &&
1810             k == tree_to_bkey(b, t, j))
1811                 switch (bkey_float(b, t, j)->exponent) {
1812                 case BFLOAT_FAILED_UNPACKED:
1813                         uk = bkey_unpack_key(b, k);
1814                         return scnprintf(buf, size,
1815                                          "    failed unpacked at depth %u\n"
1816                                          "\t%llu:%llu\n",
1817                                          ilog2(j),
1818                                          uk.p.inode, uk.p.offset);
1819                 case BFLOAT_FAILED_PREV:
1820                         p = tree_to_prev_bkey(b, t, j);
1821                         l = is_power_of_2(j)
1822                                 ? btree_bkey_first(b, t)
1823                                 : tree_to_prev_bkey(b, t, j >> ffs(j));
1824                         r = is_power_of_2(j + 1)
1825                                 ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
1826                                 : tree_to_bkey(b, t, j >> (ffz(j) + 1));
1827
1828                         up = bkey_unpack_key(b, p);
1829                         uk = bkey_unpack_key(b, k);
1830                         bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
1831                         bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
1832
1833                         return scnprintf(buf, size,
1834                                          "    failed prev at depth %u\n"
1835                                          "\tkey starts at bit %u but first differing bit at %u\n"
1836                                          "\t%llu:%llu\n"
1837                                          "\t%llu:%llu\n"
1838                                          "\t%s\n"
1839                                          "\t%s\n",
1840                                          ilog2(j),
1841                                          bch2_bkey_greatest_differing_bit(b, l, r),
1842                                          bch2_bkey_greatest_differing_bit(b, p, k),
1843                                          uk.p.inode, uk.p.offset,
1844                                          up.p.inode, up.p.offset,
1845                                          buf1, buf2);
1846                 case BFLOAT_FAILED_OVERFLOW:
1847                         uk = bkey_unpack_key(b, k);
1848                         return scnprintf(buf, size,
1849                                          "    failed overflow at depth %u\n"
1850                                          "\t%llu:%llu\n",
1851                                          ilog2(j),
1852                                          uk.p.inode, uk.p.offset);
1853                 }
1854 out:
1855         *buf = '\0';
1856         return 0;
1857 }