/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "util.h"

#include <asm/unaligned.h>
#include <linux/dynamic_fault.h>
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>

/* hack.. */
#include "alloc_types.h"
#include <trace/events/bcachefs.h>

struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
{
        struct bset_tree *t;

        for_each_bset(b, t)
                if (k >= btree_bkey_first(b, t) &&
                    k < btree_bkey_last(b, t))
                        return t;

        BUG();
}

/*
 * There are never duplicate live keys in the btree - but including keys that
 * have been flagged as deleted (and will be cleaned up later) we _will_ see
 * duplicates.
 *
 * Thus the sort order is: usual key comparison first, but for keys that compare
 * equal the deleted key(s) come first, and the (at most one) live version comes
 * last.
 *
 * The main reason for this is insertion: to handle overwrites, we first iterate
 * over keys that compare equal to our insert key, and then insert immediately
 * prior to the first key greater than the key we're inserting - our insert
 * position will be after all keys that compare equal to our insert key, which
 * by the time we actually do the insert will all be deleted.
 */
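
/*
 * Illustrative sketch (not from this file's original code; compiled out
 * with #if 0): a toy comparator demonstrating the ordering described above -
 * among keys at the same position, deleted duplicates sort first and the
 * (at most one) live key sorts last. struct toy_key and toy_key_cmp() are
 * hypothetical stand-ins for struct bkey and the real comparison functions.
 */
#if 0
#include <stdbool.h>
#include <stdlib.h>

struct toy_key {
        int     pos;
        bool    deleted;
};

static int toy_key_cmp(const void *_l, const void *_r)
{
        const struct toy_key *l = _l, *r = _r;

        if (l->pos != r->pos)
                return l->pos < r->pos ? -1 : 1;

        /* equal positions: deleted keys sort first, the live key last */
        return (int) r->deleted - (int) l->deleted;
}

static void toy_sort_example(void)
{
        struct toy_key keys[] = {
                { .pos = 7, .deleted = false },
                { .pos = 7, .deleted = true },
                { .pos = 3, .deleted = false },
        };

        qsort(keys, 3, sizeof(keys[0]), toy_key_cmp);
        /*
         * keys is now {3, live}, {7, deleted}, {7, live} - an insert at
         * pos 7 lands after the deleted duplicate, just before {7, live}
         */
}
#endif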

void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
{
        struct bkey_packed *_k, *_n;
        struct bkey k, n;
        char buf[120];

        if (!i->u64s)
                return;

        for (_k = i->start, k = bkey_unpack_key(b, _k);
             _k < vstruct_last(i);
             _k = _n, k = n) {
                _n = bkey_next(_k);

                bch2_bkey_to_text(buf, sizeof(buf), &k);
                printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
                       _k->_data - i->_data, i->u64s, buf);

                if (_n == vstruct_last(i))
                        continue;

                n = bkey_unpack_key(b, _n);

                if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
                        printk(KERN_ERR "Key skipped backwards\n");
                        continue;
                }

                /*
                 * Weird check for duplicate non extent keys: extents are
                 * deleted iff they have 0 size, so if it has zero size and it's
                 * not deleted these aren't extents:
                 */
                if (((!k.size && !bkey_deleted(&k)) ||
                     (!n.size && !bkey_deleted(&n))) &&
                    !bkey_deleted(&k) &&
                    !bkey_cmp(n.p, k.p))
                        printk(KERN_ERR "Duplicate keys\n");
        }
}

void bch2_dump_btree_node(struct btree *b)
{
        struct bset_tree *t;

        console_lock();
        for_each_bset(b, t)
                bch2_dump_bset(b, bset(b, t), t - b->set);
        console_unlock();
}

void bch2_dump_btree_node_iter(struct btree *b,
                               struct btree_node_iter *iter)
{
        struct btree_node_iter_set *set;

        printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);

        btree_node_iter_for_each(iter, set) {
                struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
                struct bset_tree *t = bch2_bkey_to_bset(b, k);
                struct bkey uk = bkey_unpack_key(b, k);
                char buf[100];

                bch2_bkey_to_text(buf, sizeof(buf), &uk);
                printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
                       k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
        }
}

#ifdef CONFIG_BCACHEFS_DEBUG

static bool keys_out_of_order(struct btree *b,
                              const struct bkey_packed *prev,
                              const struct bkey_packed *next,
                              bool is_extents)
{
        struct bkey nextu = bkey_unpack_key(b, next);

        return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
                ((is_extents
                  ? !bkey_deleted(next)
                  : !bkey_deleted(prev)) &&
                 !bkey_cmp_packed(b, prev, next));
}

void __bch2_verify_btree_nr_keys(struct btree *b)
{
        struct bset_tree *t;
        struct bkey_packed *k;
        struct btree_nr_keys nr = { 0 };

        for_each_bset(b, t)
                for (k = btree_bkey_first(b, t);
                     k != btree_bkey_last(b, t);
                     k = bkey_next(k))
                        if (!bkey_whiteout(k))
                                btree_keys_account_key_add(&nr, t - b->set, k);

        BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}

static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
                                            struct btree *b,
                                            struct bkey_packed *k)
{
        const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);

        bkey_unpack_key(b, k);

        if (n &&
            keys_out_of_order(b, k, n, iter->is_extents)) {
                struct bkey ku = bkey_unpack_key(b, k);
                struct bkey nu = bkey_unpack_key(b, n);
                char buf1[80], buf2[80];

                bch2_dump_btree_node(b);
                bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
                bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
                panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
        }
}

void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
                                 struct btree *b)
{
        struct btree_node_iter_set *set, *prev = NULL;
        struct bset_tree *t;
        struct bkey_packed *k, *first;

        if (bch2_btree_node_iter_end(iter))
                return;

        btree_node_iter_for_each(iter, set) {
                k = __btree_node_offset_to_key(b, set->k);
                t = bch2_bkey_to_bset(b, k);

                BUG_ON(__btree_node_offset_to_key(b, set->end) !=
                       btree_bkey_last(b, t));

                BUG_ON(prev &&
                       btree_node_iter_cmp(iter, b, *prev, *set) > 0);

                prev = set;
        }

        first = __btree_node_offset_to_key(b, iter->data[0].k);

        for_each_bset(b, t)
                if (bch2_btree_node_iter_bset_pos(iter, b, t) ==
                    btree_bkey_last(b, t) &&
                    (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))))
                        BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
                                                     k, first) > 0);
}

void bch2_verify_key_order(struct btree *b,
                           struct btree_node_iter *iter,
                           struct bkey_packed *where)
{
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
        struct bkey_packed *k, *prev;
        struct bkey uk, uw = bkey_unpack_key(b, where);

        k = bch2_bkey_prev_all(b, t, where);
        if (k &&
            keys_out_of_order(b, k, where, iter->is_extents)) {
                char buf1[100], buf2[100];

                bch2_dump_btree_node(b);
                uk = bkey_unpack_key(b, k);
                bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
                bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
                panic("out of order with prev:\n%s\n%s\n",
                      buf1, buf2);
        }

        k = bkey_next(where);
        BUG_ON(k != btree_bkey_last(b, t) &&
               keys_out_of_order(b, where, k, iter->is_extents));

        for_each_bset(b, t) {
                /* skip the bset @where lives in: */
                if (where >= btree_bkey_first(b, t) &&
                    where < btree_bkey_last(b, t))
                        continue;

                k = bch2_btree_node_iter_bset_pos(iter, b, t);

                if (k == btree_bkey_last(b, t))
                        k = bch2_bkey_prev_all(b, t, k);

                while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
                       (prev = bch2_bkey_prev_all(b, t, k)))
                        k = prev;

                for (;
                     k != btree_bkey_last(b, t);
                     k = bkey_next(k)) {
                        uk = bkey_unpack_key(b, k);

                        if (iter->is_extents) {
                                BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
                                         bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
                        } else {
                                BUG_ON(!bkey_cmp(uw.p, uk.p) &&
                                       !bkey_deleted(&uk));
                        }

                        if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
                                break;
                }
        }
}

#else

static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
                                                   struct btree *b,
                                                   struct bkey_packed *k) {}

#endif

/* Auxiliary search trees */

#define BFLOAT_FAILED_UNPACKED  (U8_MAX - 0)
#define BFLOAT_FAILED_PREV      (U8_MAX - 1)
#define BFLOAT_FAILED_OVERFLOW  (U8_MAX - 2)
#define BFLOAT_FAILED           (U8_MAX - 2)

#define KEY_WORDS               BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)

struct bkey_float {
        u8              exponent;
        u8              key_offset;
        union {
                u32     mantissa32;
                struct {
                        u16     mantissa16;
                        u16     _pad;
                };
        };
} __packed;

#define BFLOAT_32BIT_NR         32U

static unsigned bkey_float_byte_offset(unsigned idx)
{
        int d = (idx - BFLOAT_32BIT_NR) << 1;

        d &= ~(d >> 31);

        return idx * 6 - d;
}

struct ro_aux_tree {
        struct bkey_float       _d[0];
};

struct rw_aux_tree {
        u16             offset;
        struct bpos     k;
};

/*
 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
 * it used to be 64, but I realized the lookup code would touch slightly less
 * memory if it was 128.
 *
 * It defines the number of bytes (in struct bset) per struct bkey_float in
 * the auxiliary search tree - when we're done searching the bkey_float tree we
 * have this many bytes left that we do a linear search over.
 *
 * Since (after level 5) every level of the bset_tree is on a new cacheline,
 * we're touching one fewer cacheline in the bset tree in exchange for one more
 * cacheline in the linear search - but the linear search might stop before it
 * gets to the second cacheline.
 */

#define BSET_CACHELINE          128

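/*
 * Worked example (illustrative, compiled out with #if 0; assumes
 * PAGE_SIZE == 4096 and page_order == 0): a one-page btree node is carved
 * into 4096 / BSET_CACHELINE == 32 "cachelines" in the lookup-table sense,
 * and btree_aux_data_bytes() below reserves 8 bytes of auxiliary search
 * tree space per cacheline, i.e. 256 bytes for the whole node.
 */
#if 0
#include <assert.h>

static void bset_cacheline_example(void)
{
        unsigned node_bytes = 4096;             /* PAGE_SIZE << 0 */
        unsigned cachelines = node_bytes / 128; /* BSET_CACHELINE */

        assert(cachelines == 32);
        assert(cachelines * 8 == 256);          /* aux search tree space */
}
#endif
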
/* Space required for the btree node keys */
static inline size_t btree_keys_bytes(struct btree *b)
{
        return PAGE_SIZE << b->page_order;
}

static inline size_t btree_keys_cachelines(struct btree *b)
{
        return btree_keys_bytes(b) / BSET_CACHELINE;
}

static inline size_t btree_aux_data_bytes(struct btree *b)
{
        return btree_keys_cachelines(b) * 8;
}

static inline size_t btree_aux_data_u64s(struct btree *b)
{
        return btree_aux_data_bytes(b) / sizeof(u64);
}

static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
{
        BUG_ON(t->aux_data_offset == U16_MAX);

        switch (bset_aux_tree_type(t)) {
        case BSET_NO_AUX_TREE:
                return t->aux_data_offset;
        case BSET_RO_AUX_TREE:
                return t->aux_data_offset +
                        DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
                                     sizeof(u8) * t->size, 8);
        case BSET_RW_AUX_TREE:
                return t->aux_data_offset +
                        DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
        default:
                BUG();
        }
}

static unsigned bset_aux_tree_buf_start(const struct btree *b,
                                        const struct bset_tree *t)
{
        return t == b->set
                ? DIV_ROUND_UP(b->unpack_fn_len, 8)
                : bset_aux_tree_buf_end(t - 1);
}

static void *__aux_tree_base(const struct btree *b,
                             const struct bset_tree *t)
{
        return b->aux_data + t->aux_data_offset * 8;
}

static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
                                            const struct bset_tree *t)
{
        EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

        return __aux_tree_base(b, t);
}

static u8 *ro_aux_tree_prev(const struct btree *b,
                            const struct bset_tree *t)
{
        EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

        return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
}

static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
                                         unsigned idx)
{
        return (void *) b + bkey_float_byte_offset(idx);
}

static struct bkey_float *bkey_float(const struct btree *b,
                                     const struct bset_tree *t,
                                     unsigned idx)
{
        return bkey_float_get(ro_aux_tree_base(b, t), idx);
}

static void bset_aux_tree_verify(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
        struct bset_tree *t;

        for_each_bset(b, t) {
                if (t->aux_data_offset == U16_MAX)
                        continue;

                BUG_ON(t != b->set &&
                       t[-1].aux_data_offset == U16_MAX);

                BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
                BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
                BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
        }
#endif
}

/* Memory allocation */

void bch2_btree_keys_free(struct btree *b)
{
        vfree(b->aux_data);
        b->aux_data = NULL;
}

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
{
        b->page_order   = page_order;
        b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
                                    PAGE_KERNEL_EXEC);
        if (!b->aux_data)
                return -ENOMEM;

        return 0;
}

void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
{
        unsigned i;

        b->nsets                = 0;
        memset(&b->nr, 0, sizeof(b->nr));
#ifdef CONFIG_BCACHEFS_DEBUG
        b->expensive_debug_checks = expensive_debug_checks;
#endif
        for (i = 0; i < MAX_BSETS; i++)
                b->set[i].data_offset = U16_MAX;

        bch2_bset_set_no_aux_tree(b, b->set);
}

/* Binary tree stuff for auxiliary search trees */

/*
 * Cacheline/offset <-> bkey pointer arithmetic:
 *
 * t->tree is a binary search tree in an array; each node corresponds to a key
 * in one cacheline in t->set (BSET_CACHELINE bytes).
 *
 * This means we don't have to store the full index of the key that a node in
 * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline,
 * and then bkey_float->key_offset gives us the offset within that cacheline,
 * in units of 8 bytes.
 *
 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
 * make this work.
 *
 * To construct the bfloat for an arbitrary key we need to know what the key
 * immediately preceding it is: we have to check if the two keys differ in the
 * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
 * of the previous key so we can walk backwards to it from t->tree[j]'s key.
 */
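
/*
 * Illustrative sketch of the arithmetic described above (hypothetical
 * numbers, compiled out with #if 0): turning a (cacheline, offset-in-u64s)
 * pair back into a key pointer. The real version is cacheline_to_bkey()
 * below; this toy standalone copy hardcodes L1_CACHE_BYTES == 64 and
 * BSET_CACHELINE == 128.
 */
#if 0
static void *toy_cacheline_to_bkey(void *bset_start, unsigned cacheline,
                                   unsigned offset)
{
        /* round the start of the bset down to its hardware cacheline: */
        void *base = (void *) ((unsigned long) bset_start & ~63UL);

        /* cacheline 2, offset 3 names the key at base + 2 * 128 + 3 * 8: */
        return base + cacheline * 128 + offset * 8;
}
#endif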

static inline void *bset_cacheline(const struct btree *b,
                                   const struct bset_tree *t,
                                   unsigned cacheline)
{
        return (void *) round_down((unsigned long) btree_bkey_first(b, t),
                                   L1_CACHE_BYTES) +
                cacheline * BSET_CACHELINE;
}

static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
                                             const struct bset_tree *t,
                                             unsigned cacheline,
                                             unsigned offset)
{
        return bset_cacheline(b, t, cacheline) + offset * 8;
}

static unsigned bkey_to_cacheline(const struct btree *b,
                                  const struct bset_tree *t,
                                  const struct bkey_packed *k)
{
        return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
}

static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
                                          const struct bset_tree *t,
                                          unsigned cacheline,
                                          const struct bkey_packed *k)
{
        return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
}

static unsigned bkey_to_cacheline_offset(const struct btree *b,
                                         const struct bset_tree *t,
                                         unsigned cacheline,
                                         const struct bkey_packed *k)
{
        size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);

        EBUG_ON(m > U8_MAX);
        return m;
}

static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
                                               const struct bset_tree *t,
                                               unsigned j)
{
        return cacheline_to_bkey(b, t,
                        __eytzinger1_to_inorder(j, t->size, t->extra),
                        bkey_float(b, t, j)->key_offset);
}

static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
                                             const struct bset_tree *t,
                                             unsigned j)
{
        unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];

        return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
}

static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
                                       const struct bset_tree *t)
{
        EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);

        return __aux_tree_base(b, t);
}

/*
 * For the write set - the one we're currently inserting keys into - we don't
 * maintain a full search tree, we just keep a simple lookup table of offsets
 * into the bset (the rw aux tree).
 */
static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
                                          struct bset_tree *t,
                                          unsigned j)
{
        return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
}

static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
                            unsigned j, struct bkey_packed *k)
{
        EBUG_ON(k >= btree_bkey_last(b, t));

        rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
                .offset = __btree_node_key_to_offset(b, k),
                .k      = bkey_unpack_pos(b, k),
        };
}

static void bch2_bset_verify_rw_aux_tree(struct btree *b,
                                         struct bset_tree *t)
{
        struct bkey_packed *k = btree_bkey_first(b, t);
        unsigned j = 0;

        if (!btree_keys_expensive_checks(b))
                return;

        BUG_ON(bset_has_ro_aux_tree(t));

        if (!bset_has_rw_aux_tree(t))
                return;

        BUG_ON(t->size < 1);
        BUG_ON(rw_aux_to_bkey(b, t, j) != k);

        goto start;
        while (1) {
                if (rw_aux_to_bkey(b, t, j) == k) {
                        BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
                                        bkey_unpack_pos(b, k)));
start:
                        if (++j == t->size)
                                break;

                        BUG_ON(rw_aux_tree(b, t)[j].offset <=
                               rw_aux_tree(b, t)[j - 1].offset);
                }

                k = bkey_next(k);
                BUG_ON(k >= btree_bkey_last(b, t));
        }
}

/* returns idx of first entry >= offset: */
static unsigned rw_aux_tree_bsearch(struct btree *b,
                                    struct bset_tree *t,
                                    unsigned offset)
{
        unsigned l = 0, r = t->size;

        EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);

        while (l < r) {
                unsigned m = (l + r) >> 1;

                if (rw_aux_tree(b, t)[m].offset < offset)
                        l = m + 1;
                else
                        r = m;
        }

        EBUG_ON(l < t->size &&
                rw_aux_tree(b, t)[l].offset < offset);
        EBUG_ON(l &&
                rw_aux_tree(b, t)[l - 1].offset >= offset);

        EBUG_ON(l > r);
        EBUG_ON(l > t->size);

        return l;
}
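
/*
 * Standalone sketch of the invariant above (illustrative, compiled out):
 * rw_aux_tree_bsearch() is a textbook lower bound - the loop maintains
 * that everything below l is < offset and everything at or above r is
 * >= offset, so when l == r it's the index of the first entry >= offset
 * (or the array size if there is none).
 */
#if 0
static unsigned toy_lower_bound(const unsigned *a, unsigned n, unsigned key)
{
        unsigned l = 0, r = n;

        while (l < r) {
                unsigned m = (l + r) >> 1;

                if (a[m] < key)
                        l = m + 1;
                else
                        r = m;
        }

        return l;       /* first index with a[l] >= key, or n */
}
#endif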

static inline unsigned bfloat_mantissa(const struct bkey_float *f,
                                       unsigned idx)
{
        return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
}

static inline void bfloat_mantissa_set(struct bkey_float *f,
                                       unsigned idx, unsigned mantissa)
{
        if (idx < BFLOAT_32BIT_NR)
                f->mantissa32 = mantissa;
        else
                f->mantissa16 = mantissa;
}

static inline unsigned bkey_mantissa(const struct bkey_packed *k,
                                     const struct bkey_float *f,
                                     unsigned idx)
{
        u64 v;

        EBUG_ON(!bkey_packed(k));

        v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));

        /*
         * In little endian, we're shifting off low bits (and then the bits we
         * want are at the low end), in big endian we're shifting off high bits
         * (and then the bits we want are at the high end, so we shift them
         * back down):
         */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        v >>= f->exponent & 7;
#else
        v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
#endif
        return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
}
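
/*
 * Worked example for the little endian path above (hypothetical values,
 * compiled out, assumes a little endian host): with f->exponent == 13 we
 * load the u64 at byte offset 13 >> 3 == 1 into the key, shift off the low
 * 13 & 7 == 5 bits, and truncate - so a 16 bit mantissa is bits [13..28]
 * of the packed key.
 */
#if 0
#include <stdint.h>
#include <string.h>

static uint16_t toy_mantissa16(const uint8_t *key, unsigned exponent)
{
        uint64_t v;

        /* unaligned load, as get_unaligned() does: */
        memcpy(&v, key + (exponent >> 3), sizeof(v));
        v >>= exponent & 7;

        return (uint16_t) v;    /* bits [exponent..exponent + 15] */
}
#endif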

static void make_bfloat(struct btree *b, struct bset_tree *t,
                        unsigned j,
                        struct bkey_packed *min_key,
                        struct bkey_packed *max_key)
{
        struct bkey_float *f = bkey_float(b, t, j);
        struct bkey_packed *m = tree_to_bkey(b, t, j);
        struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
        struct bkey_packed *l, *r;
        unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
        unsigned mantissa;
        int shift, exponent, high_bit;

        EBUG_ON(bkey_next(p) != m);

        if (is_power_of_2(j)) {
                l = min_key;

                if (!l->u64s) {
                        if (!bkey_pack_pos(l, b->data->min_key, b)) {
                                struct bkey_i tmp;

                                bkey_init(&tmp.k);
                                tmp.k.p = b->data->min_key;
                                bkey_copy(l, &tmp);
                        }
                }
        } else {
                l = tree_to_prev_bkey(b, t, j >> ffs(j));

                EBUG_ON(m < l);
        }

        if (is_power_of_2(j + 1)) {
                r = max_key;

                if (!r->u64s) {
                        if (!bkey_pack_pos(r, t->max_key, b)) {
                                struct bkey_i tmp;

                                bkey_init(&tmp.k);
                                tmp.k.p = t->max_key;
                                bkey_copy(r, &tmp);
                        }
                }
        } else {
                r = tree_to_bkey(b, t, j >> (ffz(j) + 1));

                EBUG_ON(m > r);
        }

        /*
         * for failed bfloats, the lookup code falls back to comparing against
         * the original key.
         */

        if (!bkey_packed(l) || !bkey_packed(r) ||
            !bkey_packed(p) || !bkey_packed(m) ||
            !b->nr_key_bits) {
                f->exponent = BFLOAT_FAILED_UNPACKED;
                return;
        }

        /*
         * The greatest differing bit of l and r is the first bit we must
         * include in the bfloat mantissa we're creating in order to do
         * comparisons - that bit always becomes the high bit of
         * bfloat->mantissa, and thus the exponent we're calculating here is
         * the position of what will become the low bit in bfloat->mantissa:
         *
         * Note that this may be negative - we may be running off the low end
         * of the key: we handle this later:
         */
        high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
                       min_t(unsigned, bits, b->nr_key_bits) - 1);
        exponent = high_bit - (bits - 1);

        /*
         * Then we calculate the actual shift value, from the start of the key
         * (k->_data), to get the key bits starting at exponent:
         */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;

        EBUG_ON(shift + bits > b->format.key_u64s * 64);
#else
        shift = high_bit_offset +
                b->nr_key_bits -
                exponent -
                bits;

        EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
        EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);

        f->exponent = shift;
        mantissa = bkey_mantissa(m, f, j);

        /*
         * If we've got garbage bits, set them to all 1s - it's legal for the
         * bfloat to compare larger than the original key, but not smaller:
         */
        if (exponent < 0)
                mantissa |= ~(~0U << -exponent);

        bfloat_mantissa_set(f, j, mantissa);

        /*
         * The bfloat must be able to tell its key apart from the previous key -
         * if its key and the previous key don't differ in the required bits,
         * flag as failed - unless the keys are actually equal, in which case
         * we aren't required to return a specific one:
         */
        if (exponent > 0 &&
            bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
            bkey_cmp_packed(b, p, m)) {
                f->exponent = BFLOAT_FAILED_PREV;
                return;
        }

        /*
         * f->mantissa must compare >= the original key - for transitivity with
         * the comparison in bset_search_tree. If we're dropping set bits,
         * increment it:
         */
        if (exponent > (int) bch2_bkey_ffs(b, m)) {
                if (j < BFLOAT_32BIT_NR
                    ? f->mantissa32 == U32_MAX
                    : f->mantissa16 == U16_MAX)
                        f->exponent = BFLOAT_FAILED_OVERFLOW;

                if (j < BFLOAT_32BIT_NR)
                        f->mantissa32++;
                else
                        f->mantissa16++;
        }
}

/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
{
        bset_aux_tree_verify(b);

        return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}

static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
{
        unsigned bytes = __bset_tree_capacity(b, t);

        if (bytes < 7 * BFLOAT_32BIT_NR)
                return bytes / 7;

        bytes -= 7 * BFLOAT_32BIT_NR;

        return BFLOAT_32BIT_NR + bytes / 5;
}

static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
{
        return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}

static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
        struct bkey_packed *k;

        t->size = 1;
        t->extra = BSET_RW_AUX_TREE_VAL;
        rw_aux_tree(b, t)[0].offset =
                __btree_node_key_to_offset(b, btree_bkey_first(b, t));

        for (k = btree_bkey_first(b, t);
             k != btree_bkey_last(b, t);
             k = bkey_next(k)) {
                if (t->size == bset_rw_tree_capacity(b, t))
                        break;

                if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
                    L1_CACHE_BYTES)
                        rw_aux_tree_set(b, t, t->size++, k);
        }
}

static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
        struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
        struct bkey_packed min_key, max_key;
        unsigned j, cacheline = 1;

        /* signal to make_bfloat() that they're uninitialized: */
        min_key.u64s = max_key.u64s = 0;

        t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
                      bset_ro_tree_capacity(b, t));
retry:
        if (t->size < 2) {
                t->size = 0;
                t->extra = BSET_NO_AUX_TREE_VAL;
                return;
        }

        t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

        /* First we figure out where the first key in each cacheline is */
        eytzinger1_for_each(j, t->size) {
                while (bkey_to_cacheline(b, t, k) < cacheline)
                        prev = k, k = bkey_next(k);

                if (k >= btree_bkey_last(b, t)) {
                        /* XXX: this path sucks */
                        t->size--;
                        goto retry;
                }

                ro_aux_tree_prev(b, t)[j] = prev->u64s;
                bkey_float(b, t, j)->key_offset =
                        bkey_to_cacheline_offset(b, t, cacheline++, k);

                EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
                EBUG_ON(tree_to_bkey(b, t, j) != k);
        }

        while (bkey_next(k) != btree_bkey_last(b, t))
                k = bkey_next(k);

        t->max_key = bkey_unpack_pos(b, k);

        /* Then we build the tree */
        eytzinger1_for_each(j, t->size)
                make_bfloat(b, t, j, &min_key, &max_key);
}

static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
{
        struct bset_tree *i;

        for (i = b->set; i != t; i++)
                BUG_ON(bset_has_rw_aux_tree(i));

        bch2_bset_set_no_aux_tree(b, t);

        /* round up to next cacheline: */
        t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
                                      SMP_CACHE_BYTES / sizeof(u64));

        bset_aux_tree_verify(b);
}

void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
                              bool writeable)
{
        if (writeable
            ? bset_has_rw_aux_tree(t)
            : bset_has_ro_aux_tree(t))
                return;

        bset_alloc_tree(b, t);

        if (!__bset_tree_capacity(b, t))
                return;

        if (writeable)
                __build_rw_aux_tree(b, t);
        else
                __build_ro_aux_tree(b, t);

        bset_aux_tree_verify(b);
}

void bch2_bset_init_first(struct btree *b, struct bset *i)
{
        struct bset_tree *t;

        BUG_ON(b->nsets);

        memset(i, 0, sizeof(*i));
        get_random_bytes(&i->seq, sizeof(i->seq));
        SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

        t = &b->set[b->nsets++];
        set_btree_bset(b, t, i);
}

void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
                         struct btree_node_entry *bne)
{
        struct bset *i = &bne->keys;
        struct bset_tree *t;

        BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
        BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
        BUG_ON(b->nsets >= MAX_BSETS);

        memset(i, 0, sizeof(*i));
        i->seq = btree_bset_first(b)->seq;
        SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

        t = &b->set[b->nsets++];
        set_btree_bset(b, t, i);
}

static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
                                       struct bkey_packed *k)
{
        struct bkey_packed *p;
        unsigned offset;
        int j;

        EBUG_ON(k < btree_bkey_first(b, t) ||
                k > btree_bkey_last(b, t));

        if (k == btree_bkey_first(b, t))
                return NULL;

        switch (bset_aux_tree_type(t)) {
        case BSET_NO_AUX_TREE:
                p = btree_bkey_first(b, t);
                break;
        case BSET_RO_AUX_TREE:
                j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));

                do {
                        p = j ? tree_to_bkey(b, t,
                                        __inorder_to_eytzinger1(j--,
                                                        t->size, t->extra))
                              : btree_bkey_first(b, t);
                } while (p >= k);
                break;
        case BSET_RW_AUX_TREE:
                offset = __btree_node_key_to_offset(b, k);
                j = rw_aux_tree_bsearch(b, t, offset);
                p = j ? rw_aux_to_bkey(b, t, j - 1)
                      : btree_bkey_first(b, t);
                break;
        }

        return p;
}

struct bkey_packed *bch2_bkey_prev_all(struct btree *b, struct bset_tree *t,
                                       struct bkey_packed *k)
{
        struct bkey_packed *p;

        p = __bkey_prev(b, t, k);
        if (!p)
                return NULL;

        while (bkey_next(p) != k)
                p = bkey_next(p);

        return p;
}

struct bkey_packed *bch2_bkey_prev(struct btree *b, struct bset_tree *t,
                                   struct bkey_packed *k)
{
        while (1) {
                struct bkey_packed *p, *i, *ret = NULL;

                p = __bkey_prev(b, t, k);
                if (!p)
                        return NULL;

                for (i = p; i != k; i = bkey_next(i))
                        if (!bkey_deleted(i))
                                ret = i;

                if (ret)
                        return ret;

                k = p;
        }
}

/* Insert */

static void rw_aux_tree_fix_invalidated_key(struct btree *b,
                                            struct bset_tree *t,
                                            struct bkey_packed *k)
{
        unsigned offset = __btree_node_key_to_offset(b, k);
        unsigned j = rw_aux_tree_bsearch(b, t, offset);

        if (j < t->size &&
            rw_aux_tree(b, t)[j].offset == offset)
                rw_aux_tree_set(b, t, j, k);

        bch2_bset_verify_rw_aux_tree(b, t);
}

static void ro_aux_tree_fix_invalidated_key(struct btree *b,
                                            struct bset_tree *t,
                                            struct bkey_packed *k)
{
        struct bkey_packed min_key, max_key;
        unsigned inorder, j;

        EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

        /* signal to make_bfloat() that they're uninitialized: */
        min_key.u64s = max_key.u64s = 0;

        if (bkey_next(k) == btree_bkey_last(b, t)) {
                t->max_key = bkey_unpack_pos(b, k);

                for (j = 1; j < t->size; j = j * 2 + 1)
                        make_bfloat(b, t, j, &min_key, &max_key);
        }

        inorder = bkey_to_cacheline(b, t, k);

        if (inorder &&
            inorder < t->size) {
                j = __inorder_to_eytzinger1(inorder, t->size, t->extra);

                if (k == tree_to_bkey(b, t, j)) {
                        /* Fix the node this key corresponds to */
                        make_bfloat(b, t, j, &min_key, &max_key);

                        /* Children for which this key is the right boundary */
                        for (j = eytzinger1_left_child(j);
                             j < t->size;
                             j = eytzinger1_right_child(j))
                                make_bfloat(b, t, j, &min_key, &max_key);
                }
        }

        if (inorder + 1 < t->size) {
                j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);

                if (k == tree_to_prev_bkey(b, t, j)) {
                        make_bfloat(b, t, j, &min_key, &max_key);

                        /* Children for which this key is the left boundary */
                        for (j = eytzinger1_right_child(j);
                             j < t->size;
                             j = eytzinger1_left_child(j))
                                make_bfloat(b, t, j, &min_key, &max_key);
                }
        }
}

/**
 * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
 * modified, fix any auxiliary search tree by remaking all the nodes in the
 * auxiliary search tree that @k corresponds to
 */
void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
                                   struct bkey_packed *k)
{
        switch (bset_aux_tree_type(t)) {
        case BSET_NO_AUX_TREE:
                break;
        case BSET_RO_AUX_TREE:
                ro_aux_tree_fix_invalidated_key(b, t, k);
                break;
        case BSET_RW_AUX_TREE:
                rw_aux_tree_fix_invalidated_key(b, t, k);
                break;
        }
}

static void bch2_bset_fix_lookup_table(struct btree *b,
                                       struct bset_tree *t,
                                       struct bkey_packed *_where,
                                       unsigned clobber_u64s,
                                       unsigned new_u64s)
{
        int shift = new_u64s - clobber_u64s;
        unsigned l, j, where = __btree_node_key_to_offset(b, _where);

        EBUG_ON(bset_has_ro_aux_tree(t));

        if (!bset_has_rw_aux_tree(t))
                return;

        l = rw_aux_tree_bsearch(b, t, where);

        /* l is first >= than @where */

        EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
        EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);

        if (!l) /* never delete first entry */
                l++;
        else if (l < t->size &&
                 where < t->end_offset &&
                 rw_aux_tree(b, t)[l].offset == where)
                rw_aux_tree_set(b, t, l++, _where);

        /* l now > where */

        for (j = l;
             j < t->size &&
             rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
             j++)
                ;

        if (j < t->size &&
            rw_aux_tree(b, t)[j].offset + shift ==
            rw_aux_tree(b, t)[l - 1].offset)
                j++;

        memmove(&rw_aux_tree(b, t)[l],
                &rw_aux_tree(b, t)[j],
                (void *) &rw_aux_tree(b, t)[t->size] -
                (void *) &rw_aux_tree(b, t)[j]);
        t->size -= j - l;

        for (j = l; j < t->size; j++)
                rw_aux_tree(b, t)[j].offset += shift;

        EBUG_ON(l < t->size &&
                rw_aux_tree(b, t)[l].offset ==
                rw_aux_tree(b, t)[l - 1].offset);

        if (t->size < bset_rw_tree_capacity(b, t) &&
            (l < t->size
             ? rw_aux_tree(b, t)[l].offset
             : t->end_offset) -
            rw_aux_tree(b, t)[l - 1].offset >
            L1_CACHE_BYTES / sizeof(u64)) {
                struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
                struct bkey_packed *end = l < t->size
                        ? rw_aux_to_bkey(b, t, l)
                        : btree_bkey_last(b, t);
                struct bkey_packed *k = start;

                while (1) {
                        k = bkey_next(k);
                        if (k == end)
                                break;

                        if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
                                memmove(&rw_aux_tree(b, t)[l + 1],
                                        &rw_aux_tree(b, t)[l],
                                        (void *) &rw_aux_tree(b, t)[t->size] -
                                        (void *) &rw_aux_tree(b, t)[l]);
                                t->size++;
                                rw_aux_tree_set(b, t, l, k);
                                break;
                        }
                }
        }

        bch2_bset_verify_rw_aux_tree(b, t);
        bset_aux_tree_verify(b);
}

void bch2_bset_insert(struct btree *b,
                      struct btree_node_iter *iter,
                      struct bkey_packed *where,
                      struct bkey_i *insert,
                      unsigned clobber_u64s)
{
        struct bkey_format *f = &b->format;
        struct bset_tree *t = bset_tree_last(b);
        struct bkey_packed packed, *src = bkey_to_packed(insert);

        bch2_bset_verify_rw_aux_tree(b, t);

        if (bch2_bkey_pack_key(&packed, &insert->k, f))
                src = &packed;

        if (!bkey_whiteout(&insert->k))
                btree_keys_account_key_add(&b->nr, t - b->set, src);

        if (src->u64s != clobber_u64s) {
                u64 *src_p = where->_data + clobber_u64s;
                u64 *dst_p = where->_data + src->u64s;

                EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
                        (int) clobber_u64s - src->u64s);

                memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
                le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
                set_btree_bset_end(b, t);
        }

        memcpy_u64s(where, src,
                    bkeyp_key_u64s(f, src));
        memcpy_u64s(bkeyp_val(f, where), &insert->v,
                    bkeyp_val_u64s(f, src));

        bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);

        bch2_verify_key_order(b, iter, where);
        bch2_verify_btree_nr_keys(b);
}

void bch2_bset_delete(struct btree *b,
                      struct bkey_packed *where,
                      unsigned clobber_u64s)
{
        struct bset_tree *t = bset_tree_last(b);
        u64 *src_p = where->_data + clobber_u64s;
        u64 *dst_p = where->_data;

        bch2_bset_verify_rw_aux_tree(b, t);

        EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);

        memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
        le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
        set_btree_bset_end(b, t);

        bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
}

/* Lookup */

__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
                                struct bset_tree *t,
                                struct bpos search,
                                const struct bkey_packed *packed_search)
{
        unsigned l = 0, r = t->size;

        while (l + 1 != r) {
                unsigned m = (l + r) >> 1;

                if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
                        l = m;
                else
                        r = m;
        }

        return rw_aux_to_bkey(b, t, l);
}

noinline
static int bset_search_tree_slowpath(const struct btree *b,
                                struct bset_tree *t, struct bpos *search,
                                const struct bkey_packed *packed_search,
                                unsigned n)
{
        return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
                                 packed_search, search) < 0;
}

__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
                                struct bset_tree *t,
                                struct bpos search,
                                const struct bkey_packed *packed_search)
{
        struct ro_aux_tree *base = ro_aux_tree_base(b, t);
        struct bkey_float *f = bkey_float_get(base, 1);
        void *p;
        unsigned inorder, n = 1;

        do {
                if (likely(n << 4 < t->size)) {
                        p = bkey_float_get(base, n << 4);
                        prefetch(p);
                } else if (n << 3 < t->size) {
                        inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
                        p = bset_cacheline(b, t, inorder);
#ifdef CONFIG_X86_64
                        asm(".intel_syntax noprefix;"
                            "prefetcht0 [%0 - 127 + 64 * 0];"
                            "prefetcht0 [%0 - 127 + 64 * 1];"
                            "prefetcht0 [%0 - 127 + 64 * 2];"
                            "prefetcht0 [%0 - 127 + 64 * 3];"
                            ".att_syntax prefix;"
                            :
                            : "r" (p + 127));
#else
                        prefetch(p + L1_CACHE_BYTES * 0);
                        prefetch(p + L1_CACHE_BYTES * 1);
                        prefetch(p + L1_CACHE_BYTES * 2);
                        prefetch(p + L1_CACHE_BYTES * 3);
#endif
                } else if (n >= t->size)
                        break;

                f = bkey_float_get(base, n);

                if (packed_search &&
                    likely(f->exponent < BFLOAT_FAILED))
                        n = n * 2 + (bfloat_mantissa(f, n) <
                                     bkey_mantissa(packed_search, f, n));
                else
                        n = n * 2 + bset_search_tree_slowpath(b, t,
                                                &search, packed_search, n);
        } while (n < t->size);

        inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);

        /*
         * n would have been the node we recursed to - the low bit tells us if
         * we recursed left or recursed right.
         */
        if (n & 1) {
                return cacheline_to_bkey(b, t, inorder, f->key_offset);
        } else {
                if (--inorder) {
                        n = eytzinger1_prev(n >> 1, t->size);
                        f = bkey_float_get(base, n);
                        return cacheline_to_bkey(b, t, inorder, f->key_offset);
                } else
                        return btree_bkey_first(b, t);
        }
}
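
/*
 * Illustrative sketch (compiled out): the n = n * 2 + cmp step above is an
 * eytzinger (breadth-first array) binary search - node n's children live at
 * indices 2n and 2n + 1, so descending a level is a multiply and add on the
 * index, with no pointer chasing. This toy version descends a plain int
 * array laid out the same way; decoding the final index back to a key is
 * the n & 1 / eytzinger1_prev() logic at the end of bset_search_tree().
 */
#if 0
static unsigned toy_eytzinger1_descend(const int *tree, unsigned size, int key)
{
        unsigned n = 1;

        while (n < size)
                n = n * 2 + (tree[n] < key);

        return n;       /* one past a leaf; n >> 1 was the last node compared */
}
#endif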

/*
 * Returns the first key greater than or equal to @search
 */
__always_inline __flatten
static struct bkey_packed *bch2_bset_search(struct btree *b,
                                struct bset_tree *t,
                                struct bpos search,
                                struct bkey_packed *packed_search,
                                const struct bkey_packed *lossy_packed_search,
                                bool strictly_greater)
{
        struct bkey_packed *m;

        /*
         * First, we search for a cacheline, then lastly we do a linear search
         * within that cacheline.
         *
         * To search for the cacheline, there are three different possibilities:
         *  * The set is too small to have a search tree, so we just do a linear
         *    search over the whole set.
         *  * The set is the one we're currently inserting into; keeping a full
         *    auxiliary search tree up to date would be too expensive, so we
         *    use a much simpler lookup table to do a binary search -
         *    bset_search_write_set().
         *  * Or we use the auxiliary search tree we constructed earlier -
         *    bset_search_tree()
         */

        switch (bset_aux_tree_type(t)) {
        case BSET_NO_AUX_TREE:
                m = btree_bkey_first(b, t);
                break;
        case BSET_RW_AUX_TREE:
                m = bset_search_write_set(b, t, search, lossy_packed_search);
                break;
        case BSET_RO_AUX_TREE:
                /*
                 * Each node in the auxiliary search tree covers a certain range
                 * of bits, and keys above and below the set it covers might
                 * differ outside those bits - so we have to special case the
                 * start and end - handle that here:
                 */

                if (bkey_cmp(search, t->max_key) > 0)
                        return btree_bkey_last(b, t);

                m = bset_search_tree(b, t, search, lossy_packed_search);
                break;
        }

        if (lossy_packed_search)
                while (m != btree_bkey_last(b, t) &&
                       !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
                                                    m, strictly_greater))
                        m = bkey_next(m);

        if (!packed_search)
                while (m != btree_bkey_last(b, t) &&
                       !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
                        m = bkey_next(m);

        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);

                BUG_ON(prev &&
                       btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
                                                   prev, strictly_greater));
        }

        return m;
}

/* Btree node iterator */

void bch2_btree_node_iter_push(struct btree_node_iter *iter,
                               struct btree *b,
                               const struct bkey_packed *k,
                               const struct bkey_packed *end)
{
        __bch2_btree_node_iter_push(iter, b, k, end);
        bch2_btree_node_iter_sort(iter, b);
}

noinline __flatten __attribute__((cold))
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
                              struct btree *b, struct bpos search,
                              bool strictly_greater, bool is_extents)
{
        struct bset_tree *t;

        trace_bkey_pack_pos_fail(search);

        for_each_bset(b, t)
                __bch2_btree_node_iter_push(iter, b,
                        bch2_bset_search(b, t, search, NULL, NULL,
                                        strictly_greater),
                        btree_bkey_last(b, t));

        bch2_btree_node_iter_sort(iter, b);
}

/**
 * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
 * given position
 *
 * Main entry point to the lookup code for individual btree nodes:
 *
 * NOTE:
 *
 * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
 * keys. This doesn't matter for most code, but it does matter for lookups.
 *
 * Consider some adjacent keys containing a string of equal keys:
 *      i j k k k k l m
 *
 * If you search for k, the lookup code isn't guaranteed to return you any
 * specific k. The lookup code is conceptually doing a binary search and
 * iterating backwards is very expensive so if the pivot happens to land at the
 * last k that's what you'll get.
 *
 * This works out ok, but it's something to be aware of:
 *
 *  - For non extents, we guarantee that the live key comes last - see
 *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
 *    see will only be deleted keys you don't care about.
 *
 *  - For extents, deleted keys sort last (see the comment at the top of this
 *    file). But when you're searching for extents, you actually want the first
 *    key strictly greater than your search key - an extent that compares equal
 *    to the search key is going to have 0 sectors after the search key.
 *
 *    But this does mean that we can't just search for
 *    bkey_successor(start_of_range) to get the first extent that overlaps with
 *    the range we want - if we're unlucky and there's an extent that ends
 *    exactly where we searched, then there could be a deleted key at the same
 *    position and we'd get that when we search instead of the preceding extent
 *    we needed.
 *
 *    So we've got to search for start_of_range, then after the lookup iterate
 *    past any extents that compare equal to the position we searched for.
 */
1537 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
1538                                struct btree *b, struct bpos search,
1539                                bool strictly_greater, bool is_extents)
1540 {
1541         struct bset_tree *t;
1542         struct bkey_packed p, *packed_search = NULL;
1543
1544         EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
1545         bset_aux_tree_verify(b);
1546
1547         __bch2_btree_node_iter_init(iter, is_extents);
1548
1549         switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
1550         case BKEY_PACK_POS_EXACT:
1551                 packed_search = &p;
1552                 break;
1553         case BKEY_PACK_POS_SMALLER:
1554                 packed_search = NULL;
1555                 break;
1556         case BKEY_PACK_POS_FAIL:
1557                 btree_node_iter_init_pack_failed(iter, b, search,
1558                                         strictly_greater, is_extents);
1559                 return;
1560         }
1561
1562         for_each_bset(b, t)
1563                 __bch2_btree_node_iter_push(iter, b,
1564                                            bch2_bset_search(b, t, search,
1565                                                            packed_search, &p,
1566                                                            strictly_greater),
1567                                            btree_bkey_last(b, t));
1568
1569         bch2_btree_node_iter_sort(iter, b);
1570 }
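
/*
 * A minimal usage sketch, not part of the original file: the helper below is
 * hypothetical and assumes only the declarations in bset.h. It shows the
 * typical pattern - initialize at a position, then peek at the first live key:
 */
static __maybe_unused bool example_key_exists(struct btree *b,
                                              struct bpos search,
                                              bool is_extents)
{
        struct btree_node_iter iter;
        struct bkey u;

        /* strictly_greater == false: find the first key >= @search */
        bch2_btree_node_iter_init(&iter, b, search, false, is_extents);

        /* peek skips deleted keys; .k is NULL once the iterator is exhausted */
        return bch2_btree_node_iter_peek_unpack(&iter, b, &u).k != NULL;
}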
1571
1572 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1573                                           struct btree *b,
1574                                           bool is_extents)
1575 {
1576         struct bset_tree *t;
1577
1578         __bch2_btree_node_iter_init(iter, is_extents);
1579
1580         for_each_bset(b, t)
1581                 __bch2_btree_node_iter_push(iter, b,
1582                                            btree_bkey_first(b, t),
1583                                            btree_bkey_last(b, t));
1584         bch2_btree_node_iter_sort(iter, b);
1585 }
1586
1587 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1588                                                   struct btree *b,
1589                                                   struct bset_tree *t)
1590 {
1591         struct btree_node_iter_set *set;
1592
1593         btree_node_iter_for_each(iter, set)
1594                 if (set->end == t->end_offset)
1595                         return __btree_node_offset_to_key(b, set->k);
1596
1597         return btree_bkey_last(b, t);
1598 }
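
/*
 * Iterator sets are identified by their end offset; if no set matches @t, the
 * iterator has already consumed that entire bset, hence the btree_bkey_last()
 * fallback.
 */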
1599
1600 static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
1601                                             struct btree *b,
1602                                             unsigned first)
1603 {
1604         bool ret = btree_node_iter_cmp(iter, b,
1605                                        iter->data[first],
1606                                        iter->data[first + 1]) > 0;
1607
1608         if (ret)
1609                 swap(iter->data[first], iter->data[first + 1]);
1610         return ret;
1611 }
1612
1613 void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
1614                                struct btree *b)
1615 {
1616         /* unrolled bubble sort: */
1617
1618         if (!__btree_node_iter_set_end(iter, 2)) {
1619                 btree_node_iter_sort_two(iter, b, 0);
1620                 btree_node_iter_sort_two(iter, b, 1);
1621         }
1622
1623         if (!__btree_node_iter_set_end(iter, 1))
1624                 btree_node_iter_sort_two(iter, b, 0);
1625 }
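
/*
 * The unrolling above is a complete sort because the iterator tracks at most
 * three sets (one per bset, ARRAY_SIZE(iter->data)): compare/swap of slots
 * (0,1), (1,2) and then (0,1) again fully sorts three elements.
 */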
1626
1627 void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
1628                                    struct btree_node_iter_set *set)
1629 {
1630         struct btree_node_iter_set *last =
1631                 iter->data + ARRAY_SIZE(iter->data) - 1;
1632
1633         memmove(&set[0], &set[1], (void *) last - (void *) set);
1634         *last = (struct btree_node_iter_set) { 0, 0 };
1635 }
1636
1637 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1638                                                   struct btree *b)
1639 {
1640         iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
1641
1642         EBUG_ON(iter->data->k > iter->data->end);
1643
1644         if (unlikely(__btree_node_iter_set_end(iter, 0))) {
1645                 bch2_btree_node_iter_set_drop(iter, iter->data);
1646                 return;
1647         }
1648
1649         if (__btree_node_iter_set_end(iter, 1))
1650                 return;
1651
1652         if (!btree_node_iter_sort_two(iter, b, 0))
1653                 return;
1654
1655         if (__btree_node_iter_set_end(iter, 2))
1656                 return;
1657
1658         btree_node_iter_sort_two(iter, b, 1);
1659 }
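
/*
 * Note the incremental re-sort above: after consuming one key from the head
 * set, the head is either dropped (once exhausted) or sifted back into place
 * with at most two compares - cheaper than a full bch2_btree_node_iter_sort().
 */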
1660
1661 /**
1662  * bch2_btree_node_iter_advance - advance @iter by one key
1663  *
1664  * With CONFIG_BCACHEFS_DEBUG this also checks ordering - so don't use it where
1665  * (insert_fixup_extent()) a bset might momentarily have out of order extents.
1666  */
1667 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1668                                   struct btree *b)
1669 {
1670 #ifdef CONFIG_BCACHEFS_DEBUG
1671         struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
1672
1673         __bch2_btree_node_iter_advance(iter, b);
1674         bch2_btree_node_iter_next_check(iter, b, k);
1675 #else
1676         __bch2_btree_node_iter_advance(iter, b);
1677 #endif
1678 }
1679
1680 static inline bool __btree_node_iter_used(struct btree_node_iter *iter)
1681 {
1682         unsigned n = ARRAY_SIZE(iter->data);
1683
1684         while (n && __btree_node_iter_set_end(iter, n - 1))
1685                 --n;
1686
1687         return n;
1688 }
1689
1690 /*
1691  * Expensive - re-searches every bset in the node for the key preceding @iter:
1692  */
1693 struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
1694                                                   struct btree *b)
1695 {
1696         struct bkey_packed *k, *prev = NULL;
1697         struct btree_node_iter_set *set;
1698         struct bset_tree *t;
1699         struct bset_tree *prev_t = NULL;
1700         unsigned end, used;
1701
1702         bch2_btree_node_iter_verify(iter, b);
1703
1704         for_each_bset(b, t) {
1705                 k = bch2_bkey_prev_all(b, t,
1706                         bch2_btree_node_iter_bset_pos(iter, b, t));
1707                 if (k &&
1708                     (!prev || __btree_node_iter_cmp(iter->is_extents, b,
1709                                                     k, prev) > 0)) {
1710                         prev = k;
1711                         prev_t = t;
1712                 }
1713         }
1714
1715         if (!prev)
1716                 return NULL;
1717
1718         /*
1719          * We're manually memmoving instead of just calling sort() to ensure the
1720          * prev we picked ends up in slot 0 - sort won't necessarily put it
1721          * there because of duplicate deleted keys:
1722          */
1723         end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t));
1724         btree_node_iter_for_each(iter, set)
1725                 if (set->end == end) {
1726                         memmove(&iter->data[1],
1727                                 &iter->data[0],
1728                                 (void *) set - (void *) &iter->data[0]);
1729                         goto out;
1730                 }
1731
1732         used = __btree_node_iter_used(iter);
1733         BUG_ON(used >= ARRAY_SIZE(iter->data));
1734
1735         memmove(&iter->data[1],
1736                 &iter->data[0],
1737                 (void *) &iter->data[used] - (void *) &iter->data[0]);
1738 out:
1739         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1740         iter->data[0].end = end;
1741         return prev;
1742 }
1743
1744 struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
1745                                               struct btree *b)
1746 {
1747         struct bkey_packed *k;
1748
1749         do {
1750                 k = bch2_btree_node_iter_prev_all(iter, b);
1751         } while (k && bkey_deleted(k));
1752
1753         return k;
1754 }
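
/*
 * A sketch of backwards iteration (hypothetical helper, not in the original
 * file; assumes the declarations in bset.h): count the live keys preceding a
 * position.
 */
static __maybe_unused unsigned example_count_preceding(struct btree *b,
                                                       struct bpos pos,
                                                       bool is_extents)
{
        struct btree_node_iter iter;
        unsigned nr = 0;

        bch2_btree_node_iter_init(&iter, b, pos, false, is_extents);

        /* bch2_btree_node_iter_prev() skips deleted keys as it steps back */
        while (bch2_btree_node_iter_prev(&iter, b))
                nr++;

        return nr;
}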
1755
1756 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1757                                                  struct btree *b,
1758                                                  struct bkey *u)
1759 {
1760         struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
1761
1762         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1763 }
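
/*
 * A sketch of a full forward walk (again a hypothetical helper assuming
 * bset.h): each live key is unpacked into a stack-allocated struct bkey.
 */
static __maybe_unused void example_walk_node(struct btree *b, bool is_extents)
{
        struct btree_node_iter iter;
        struct bkey u;
        struct bkey_s_c k;

        bch2_btree_node_iter_init_from_start(&iter, b, is_extents);

        while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &u)).k) {
                /* k.k points at @u and is only valid until the next unpack */
                bch2_btree_node_iter_advance(&iter, b);
        }
}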
1764
1765 /* Stats and debug: */
1766
1767 void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1768 {
1769         struct bset_tree *t;
1770
1771         for_each_bset(b, t) {
1772                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1773                 size_t j;
1774
1775                 stats->sets[type].nr++;
1776                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1777                         sizeof(u64);
1778
1779                 if (bset_has_ro_aux_tree(t)) {
1780                         stats->floats += t->size - 1;
1781
1782                         for (j = 1; j < t->size; j++)
1783                                 switch (bkey_float(b, t, j)->exponent) {
1784                                 case BFLOAT_FAILED_UNPACKED:
1785                                         stats->failed_unpacked++;
1786                                         break;
1787                                 case BFLOAT_FAILED_PREV:
1788                                         stats->failed_prev++;
1789                                         break;
1790                                 case BFLOAT_FAILED_OVERFLOW:
1791                                         stats->failed_overflow++;
1792                                         break;
1793                                 }
1794                 }
1795         }
1796 }
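
/*
 * The BFLOAT_FAILED_* counters above record why entries of the read-only aux
 * search tree fell back from the compact float representation to full key
 * comparisons; bch2_bkey_print_bfloat() below renders the same three failure
 * cases in human-readable form.
 */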
1797
1798 int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
1799                            char *buf, size_t size)
1800 {
1801         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1802         struct bkey_packed *l, *r, *p;
1803         struct bkey uk, up;
1804         char buf1[200], buf2[200];
1805         unsigned j;
1806
1807         if (!size)
1808                 return 0;
1809
1810         if (!bset_has_ro_aux_tree(t))
1811                 goto out;
1812
1813         j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra);
1814         if (j &&
1815             j < t->size &&
1816             k == tree_to_bkey(b, t, j))
1817                 switch (bkey_float(b, t, j)->exponent) {
1818                 case BFLOAT_FAILED_UNPACKED:
1819                         uk = bkey_unpack_key(b, k);
1820                         return scnprintf(buf, size,
1821                                          "    failed unpacked at depth %u\n"
1822                                          "\t%llu:%llu\n",
1823                                          ilog2(j),
1824                                          uk.p.inode, uk.p.offset);
1825                 case BFLOAT_FAILED_PREV:
1826                         p = tree_to_prev_bkey(b, t, j);
1827                         l = is_power_of_2(j)
1828                                 ? btree_bkey_first(b, t)
1829                                 : tree_to_prev_bkey(b, t, j >> ffs(j));
1830                         r = is_power_of_2(j + 1)
1831                                 ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
1832                                 : tree_to_bkey(b, t, j >> (ffz(j) + 1));
1833
1834                         up = bkey_unpack_key(b, p);
1835                         uk = bkey_unpack_key(b, k);
1836                         bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
1837                         bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
1838
1839                         return scnprintf(buf, size,
1840                                          "    failed prev at depth %u\n"
1841                                          "\tkey starts at bit %u but first differing bit at %u\n"
1842                                          "\t%llu:%llu\n"
1843                                          "\t%llu:%llu\n"
1844                                          "\t%s\n"
1845                                          "\t%s\n",
1846                                          ilog2(j),
1847                                          bch2_bkey_greatest_differing_bit(b, l, r),
1848                                          bch2_bkey_greatest_differing_bit(b, p, k),
1849                                          uk.p.inode, uk.p.offset,
1850                                          up.p.inode, up.p.offset,
1851                                          buf1, buf2);
1852                 case BFLOAT_FAILED_OVERFLOW:
1853                         uk = bkey_unpack_key(b, k);
1854                         return scnprintf(buf, size,
1855                                          "    failed overflow at depth %u\n"
1856                                          "\t%llu:%llu\n",
1857                                          ilog2(j),
1858                                          uk.p.inode, uk.p.offset);
1859                 }
1860 out:
1861         *buf = '\0';
1862         return 0;
1863 }