libbcachefs/bset.c
1 /*
2  * Code for working with individual keys, and sorted sets of keys within a
3  * btree node
4  *
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "bset.h"
10 #include "eytzinger.h"
11 #include "util.h"
12
13 #include <asm/unaligned.h>
14 #include <linux/dynamic_fault.h>
15 #include <linux/console.h>
16 #include <linux/random.h>
17 #include <linux/prefetch.h>
18
19 /* hack.. */
20 #include "alloc_types.h"
21 #include <trace/events/bcachefs.h>
22
23 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
24 {
25         struct bset_tree *t;
26
27         for_each_bset(b, t)
28                 if (k >= btree_bkey_first(b, t) &&
29                     k < btree_bkey_last(b, t))
30                         return t;
31
32         BUG();
33 }
34
35 /*
36  * There are never duplicate live keys in the btree - but once we include keys
37  * that have been flagged as deleted (and will be cleaned up later) we _will_
38  * see duplicates.
39  *
40  * Thus the sort order is: usual key comparison first, but for keys that compare
41  * equal the deleted key(s) come first, and the (at most one) live version comes
42  * last.
43  *
44  * The main reason for this is insertion: to handle overwrites, we first iterate
45  * over keys that compare equal to our insert key, and then insert immediately
46  * prior to the first key greater than the key we're inserting - our insert
47  * position will be after all keys that compare equal to our insert key, which
48  * by the time we actually do the insert will all be deleted.
49  */
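/*
 * Editor's illustrative sketch (not part of the original file): with the
 * ordering above, finding the insert position amounts to "advance past every
 * key that compares <= the insert key". The helper name is hypothetical;
 * real insertion goes through bch2_bset_insert().
 */
#if 0
static struct bkey_packed *sketch_insert_pos(struct btree *b,
					     struct bset_tree *t,
					     struct bkey_packed *k,
					     const struct bkey_packed *insert)
{
	while (k != btree_bkey_last(b, t) &&
	       bkey_cmp_packed(b, k, insert) <= 0)
		k = bkey_next(k);

	return k;
}
#endif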
50
51 void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
52 {
53         struct bkey_packed *_k, *_n;
54         struct bkey k, n;
55         char buf[120];
56
57         if (!i->u64s)
58                 return;
59
60         for (_k = i->start, k = bkey_unpack_key(b, _k);
61              _k < vstruct_last(i);
62              _k = _n, k = n) {
63                 _n = bkey_next(_k);
64
65                 bch2_bkey_to_text(buf, sizeof(buf), &k);
66                 printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
67                        _k->_data - i->_data, i->u64s, buf);
68
69                 if (_n == vstruct_last(i))
70                         continue;
71
72                 n = bkey_unpack_key(b, _n);
73
74                 if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
75                         printk(KERN_ERR "Key skipped backwards\n");
76                         continue;
77                 }
78
79                 /*
80                  * Weird check for duplicate non extent keys: extents are
81                  * deleted iff they have 0 size, so if it has zero size and it's
82                  * not deleted these aren't extents:
83                  */
84                 if (((!k.size && !bkey_deleted(&k)) ||
85                      (!n.size && !bkey_deleted(&n))) &&
86                     !bkey_deleted(&k) &&
87                     !bkey_cmp(n.p, k.p))
88                         printk(KERN_ERR "Duplicate keys\n");
89         }
90 }
91
92 void bch2_dump_btree_node(struct btree *b)
93 {
94         struct bset_tree *t;
95
96         console_lock();
97         for_each_bset(b, t)
98                 bch2_dump_bset(b, bset(b, t), t - b->set);
99         console_unlock();
100 }
101
102 void bch2_dump_btree_node_iter(struct btree *b,
103                               struct btree_node_iter *iter)
104 {
105         struct btree_node_iter_set *set;
106
107         printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
108
109         btree_node_iter_for_each(iter, set) {
110                 struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
111                 struct bset_tree *t = bch2_bkey_to_bset(b, k);
112                 struct bkey uk = bkey_unpack_key(b, k);
113                 char buf[100];
114
115                 bch2_bkey_to_text(buf, sizeof(buf), &uk);
116                 printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
117                        k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
118         }
119 }
120
121 #ifdef CONFIG_BCACHEFS_DEBUG
122
123 static bool keys_out_of_order(struct btree *b,
124                               const struct bkey_packed *prev,
125                               const struct bkey_packed *next,
126                               bool is_extents)
127 {
128         struct bkey nextu = bkey_unpack_key(b, next);
129
130         return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
131                 ((is_extents
132                   ? !bkey_deleted(next)
133                   : !bkey_deleted(prev)) &&
134                  !bkey_cmp_packed(b, prev, next));
135 }
136
137 void __bch2_verify_btree_nr_keys(struct btree *b)
138 {
139         struct bset_tree *t;
140         struct bkey_packed *k;
141         struct btree_nr_keys nr = { 0 };
142
143         for_each_bset(b, t)
144                 for (k = btree_bkey_first(b, t);
145                      k != btree_bkey_last(b, t);
146                      k = bkey_next(k))
147                         if (!bkey_whiteout(k))
148                                 btree_keys_account_key_add(&nr, t - b->set, k);
149
150         BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
151 }
152
153 static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
154                                            struct btree *b,
155                                            struct bkey_packed *k)
156 {
157         const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);
158
159         bkey_unpack_key(b, k);
160
161         if (n &&
162             keys_out_of_order(b, k, n, iter->is_extents)) {
163                 struct bkey ku = bkey_unpack_key(b, k);
164                 struct bkey nu = bkey_unpack_key(b, n);
165                 char buf1[80], buf2[80];
166
167                 bch2_dump_btree_node(b);
168                 bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
169                 bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
170                 panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
171         }
172 }
173
174 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
175                                 struct btree *b)
176 {
177         struct btree_node_iter_set *set, *prev = NULL;
178         struct bset_tree *t;
179         struct bkey_packed *k, *first;
180
181         if (bch2_btree_node_iter_end(iter))
182                 return;
183
184         btree_node_iter_for_each(iter, set) {
185                 k = __btree_node_offset_to_key(b, set->k);
186                 t = bch2_bkey_to_bset(b, k);
187
188                 BUG_ON(__btree_node_offset_to_key(b, set->end) !=
189                        btree_bkey_last(b, t));
190
191                 BUG_ON(prev &&
192                        btree_node_iter_cmp(iter, b, *prev, *set) > 0);
193
194                 prev = set;
195         }
196
197         first = __btree_node_offset_to_key(b, iter->data[0].k);
198
199         for_each_bset(b, t)
200                 if (bch2_btree_node_iter_bset_pos(iter, b, t) ==
201                     btree_bkey_last(b, t) &&
202                     (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))))
203                         BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
204                                                      k, first) > 0);
205 }
206
207 void bch2_verify_key_order(struct btree *b,
208                           struct btree_node_iter *iter,
209                           struct bkey_packed *where)
210 {
211         struct bset_tree *t = bch2_bkey_to_bset(b, where);
212         struct bkey_packed *k, *prev;
213         struct bkey uk, uw = bkey_unpack_key(b, where);
214
215         k = bch2_bkey_prev_all(b, t, where);
216         if (k &&
217             keys_out_of_order(b, k, where, iter->is_extents)) {
218                 char buf1[100], buf2[100];
219
220                 bch2_dump_btree_node(b);
221                 uk = bkey_unpack_key(b, k);
222                 bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
223                 bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
224                 panic("out of order with prev:\n%s\n%s\n",
225                       buf1, buf2);
226         }
227
228         k = bkey_next(where);
229         BUG_ON(k != btree_bkey_last(b, t) &&
230                keys_out_of_order(b, where, k, iter->is_extents));
231
232         for_each_bset(b, t) {
233                 if (where >= btree_bkey_first(b, t) &&
234                     where < btree_bkey_last(b, t))
235                         continue;
236
237                 k = bch2_btree_node_iter_bset_pos(iter, b, t);
238
239                 if (k == btree_bkey_last(b, t))
240                         k = bch2_bkey_prev_all(b, t, k);
241
242                 while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
243                        (prev = bch2_bkey_prev_all(b, t, k)))
244                         k = prev;
245
246                 for (;
247                      k != btree_bkey_last(b, t);
248                      k = bkey_next(k)) {
249                         uk = bkey_unpack_key(b, k);
250
251                         if (iter->is_extents) {
252                                 BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
253                                          bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
254                         } else {
255                                 BUG_ON(!bkey_cmp(uw.p, uk.p) &&
256                                        !bkey_deleted(&uk));
257                         }
258
259                         if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
260                                 break;
261                 }
262         }
263 }
264
265 #else
266
267 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
268                                                    struct btree *b,
269                                                    struct bkey_packed *k) {}
270
271 #endif
272
273 /* Auxiliary search trees */
274
275 #define BFLOAT_FAILED_UNPACKED  (U8_MAX - 0)
276 #define BFLOAT_FAILED_PREV      (U8_MAX - 1)
277 #define BFLOAT_FAILED_OVERFLOW  (U8_MAX - 2)
278 #define BFLOAT_FAILED           (U8_MAX - 2)
279
280 #define KEY_WORDS               BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
281
282 struct bkey_float {
283         u8              exponent;
284         u8              key_offset;
285         union {
286                 u32     mantissa32;
287         struct {
288                 u16     mantissa16;
289                 u16     _pad;
290         };
291         };
292 } __packed;
293
294 #define BFLOAT_32BIT_NR         32U
295
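/*
 * Editor's note: the first BFLOAT_32BIT_NR tree nodes use the 32 bit mantissa
 * (6 bytes each, see struct bkey_float above), the rest the 16 bit one
 * (4 bytes each). In bkey_float_byte_offset() below, "d &= ~(d >> 31)" clamps
 * a negative d to 0, so e.g. idx 40 -> 40 * 6 - 16 = 224 bytes.
 */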
296 static unsigned bkey_float_byte_offset(unsigned idx)
297 {
298         int d = (idx - BFLOAT_32BIT_NR) << 1;
299
300         d &= ~(d >> 31);
301
302         return idx * 6 - d;
303 }
304
305 struct ro_aux_tree {
306         struct bkey_float       _d[0];
307 };
308
309 struct rw_aux_tree {
310         u16             offset;
311         struct bpos     k;
312 };
313
314 /*
315  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
316  * it used to be 64, but I realized the lookup code would touch slightly less
317  * memory if it was 128.
318  *
319  * It defines the number of bytes (in struct bset) per struct bkey_float in
320  * the auxiliary search tree - when we're done searching the bkey_float tree we
321  * have this many bytes left that we do a linear search over.
322  *
323  * Since (after level 5) every level of the bset_tree is on a new cacheline,
324  * we're touching one fewer cacheline in the bset tree in exchange for one more
325  * cacheline in the linear search - but the linear search might stop before it
326  * gets to the second cacheline.
327  */
328
329 #define BSET_CACHELINE          128
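/*
 * Editor's worked example (illustrative numbers): for a 4k btree node,
 * btree_keys_cachelines() below is 4096 / 128 = 32, i.e. up to 32 lookup
 * granules; after the bkey_float tree search, the final linear scan covers
 * at most one such granule - 128 bytes of keys.
 */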
330
331 /* Space required for the btree node keys */
332 static inline size_t btree_keys_bytes(struct btree *b)
333 {
334         return PAGE_SIZE << b->page_order;
335 }
336
337 static inline size_t btree_keys_cachelines(struct btree *b)
338 {
339         return btree_keys_bytes(b) / BSET_CACHELINE;
340 }
341
342 static inline size_t btree_aux_data_bytes(struct btree *b)
343 {
344         return btree_keys_cachelines(b) * 8;
345 }
346
347 static inline size_t btree_aux_data_u64s(struct btree *b)
348 {
349         return btree_aux_data_bytes(b) / sizeof(u64);
350 }
351
352 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
353 {
354         BUG_ON(t->aux_data_offset == U16_MAX);
355
356         switch (bset_aux_tree_type(t)) {
357         case BSET_NO_AUX_TREE:
358                 return t->aux_data_offset;
359         case BSET_RO_AUX_TREE:
360                 return t->aux_data_offset +
361                         DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
362                                      sizeof(u8) * t->size, 8);
363         case BSET_RW_AUX_TREE:
364                 return t->aux_data_offset +
365                         DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
366         default:
367                 BUG();
368         }
369 }
370
371 static unsigned bset_aux_tree_buf_start(const struct btree *b,
372                                         const struct bset_tree *t)
373 {
374         return t == b->set
375                 ? DIV_ROUND_UP(b->unpack_fn_len, 8)
376                 : bset_aux_tree_buf_end(t - 1);
377 }
378
379 static void *__aux_tree_base(const struct btree *b,
380                              const struct bset_tree *t)
381 {
382         return b->aux_data + t->aux_data_offset * 8;
383 }
384
385 static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
386                                             const struct bset_tree *t)
387 {
388         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
389
390         return __aux_tree_base(b, t);
391 }
392
393 static u8 *ro_aux_tree_prev(const struct btree *b,
394                             const struct bset_tree *t)
395 {
396         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
397
398         return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
399 }
400
401 static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
402                                          unsigned idx)
403 {
404         return (void *) b + bkey_float_byte_offset(idx);
405 }
406
407 static struct bkey_float *bkey_float(const struct btree *b,
408                                      const struct bset_tree *t,
409                                      unsigned idx)
410 {
411         return bkey_float_get(ro_aux_tree_base(b, t), idx);
412 }
413
414 static void bset_aux_tree_verify(struct btree *b)
415 {
416 #ifdef CONFIG_BCACHEFS_DEBUG
417         struct bset_tree *t;
418
419         for_each_bset(b, t) {
420                 if (t->aux_data_offset == U16_MAX)
421                         continue;
422
423                 BUG_ON(t != b->set &&
424                        t[-1].aux_data_offset == U16_MAX);
425
426                 BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
427                 BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
428                 BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
429         }
430 #endif
431 }
432
433 /* Memory allocation */
434
435 void bch2_btree_keys_free(struct btree *b)
436 {
437         vfree(b->aux_data);
438         b->aux_data = NULL;
439 }
440
441 int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
442 {
443         b->page_order   = page_order;
444         b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
445                                     PAGE_KERNEL_EXEC);
446         if (!b->aux_data)
447                 return -ENOMEM;
448
449         return 0;
450 }
451
452 void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
453 {
454         unsigned i;
455
456         b->nsets                = 0;
457         memset(&b->nr, 0, sizeof(b->nr));
458 #ifdef CONFIG_BCACHEFS_DEBUG
459         b->expensive_debug_checks = expensive_debug_checks;
460 #endif
461         for (i = 0; i < MAX_BSETS; i++)
462                 b->set[i].data_offset = U16_MAX;
463
464         bch2_bset_set_no_aux_tree(b, b->set);
465 }
466
467 /* Binary tree stuff for auxiliary search trees */
468
469 /*
470  * Cacheline/offset <-> bkey pointer arithmetic:
471  *
472  * The auxiliary search tree is a binary search tree in an array; each node
473  * corresponds to a key in one cacheline of the bset (BSET_CACHELINE bytes).
474  *
475  * This means we don't have to store the full index of the key that a node in
476  * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
477  * then bkey_float->key_offset gives us the offset within that cacheline, in
478  * units of 8 bytes.
479  *
480  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
481  * make this work.
482  *
483  * To construct the bfloat for an arbitrary key we need to know what the key
484  * immediately preceding it is: we have to check if the two keys differ in the
485  * bits we're going to store in bkey_float->mantissa. ro_aux_tree_prev(b, t)[j]
486  * stores the previous key's size so we can walk back to it from node j's key.
487  */
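/*
 * Editor's note: concretely, for tree node j the key it points to is
 * cacheline_to_bkey(b, t, __eytzinger1_to_inorder(j, t->size, t->extra),
 * bkey_float(b, t, j)->key_offset) - which is exactly what tree_to_bkey()
 * below does - and tree_to_prev_bkey() walks back by the stored prev size.
 */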
488
489 static inline void *bset_cacheline(const struct btree *b,
490                                    const struct bset_tree *t,
491                                    unsigned cacheline)
492 {
493         return (void *) round_down((unsigned long) btree_bkey_first(b, t),
494                                    L1_CACHE_BYTES) +
495                 cacheline * BSET_CACHELINE;
496 }
497
498 static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
499                                              const struct bset_tree *t,
500                                              unsigned cacheline,
501                                              unsigned offset)
502 {
503         return bset_cacheline(b, t, cacheline) + offset * 8;
504 }
505
506 static unsigned bkey_to_cacheline(const struct btree *b,
507                                   const struct bset_tree *t,
508                                   const struct bkey_packed *k)
509 {
510         return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
511 }
512
513 static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
514                                           const struct bset_tree *t,
515                                           unsigned cacheline,
516                                           const struct bkey_packed *k)
517 {
518         return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
519 }
520
521 static unsigned bkey_to_cacheline_offset(const struct btree *b,
522                                          const struct bset_tree *t,
523                                          unsigned cacheline,
524                                          const struct bkey_packed *k)
525 {
526         size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
527
528         EBUG_ON(m > U8_MAX);
529         return m;
530 }
531
532 static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
533                                                const struct bset_tree *t,
534                                                unsigned j)
535 {
536         return cacheline_to_bkey(b, t,
537                         __eytzinger1_to_inorder(j, t->size, t->extra),
538                         bkey_float(b, t, j)->key_offset);
539 }
540
541 static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
542                                              const struct bset_tree *t,
543                                              unsigned j)
544 {
545         unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
546
547         return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
548 }
549
550 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
551                                        const struct bset_tree *t)
552 {
553         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
554
555         return __aux_tree_base(b, t);
556 }
557
558 /*
559  * For the write set - the one we're currently inserting keys into - we don't
560  * maintain a full search tree, we just keep a simple lookup table in t->prev.
561  */
562 static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
563                                           struct bset_tree *t,
564                                           unsigned j)
565 {
566         return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
567 }
568
569 static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
570                             unsigned j, struct bkey_packed *k)
571 {
572         EBUG_ON(k >= btree_bkey_last(b, t));
573
574         rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
575                 .offset = __btree_node_key_to_offset(b, k),
576                 .k      = bkey_unpack_pos(b, k),
577         };
578 }
579
580 static void bch2_bset_verify_rw_aux_tree(struct btree *b,
581                                         struct bset_tree *t)
582 {
583         struct bkey_packed *k = btree_bkey_first(b, t);
584         unsigned j = 0;
585
586         if (!btree_keys_expensive_checks(b))
587                 return;
588
589         BUG_ON(bset_has_ro_aux_tree(t));
590
591         if (!bset_has_rw_aux_tree(t))
592                 return;
593
594         BUG_ON(t->size < 1);
595         BUG_ON(rw_aux_to_bkey(b, t, j) != k);
596
597         goto start;
598         while (1) {
599                 if (rw_aux_to_bkey(b, t, j) == k) {
600                         BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
601                                         bkey_unpack_pos(b, k)));
602 start:
603                         if (++j == t->size)
604                                 break;
605
606                         BUG_ON(rw_aux_tree(b, t)[j].offset <=
607                                rw_aux_tree(b, t)[j - 1].offset);
608                 }
609
610                 k = bkey_next(k);
611                 BUG_ON(k >= btree_bkey_last(b, t));
612         }
613 }
614
615 /* returns idx of first entry >= offset: */
616 static unsigned rw_aux_tree_bsearch(struct btree *b,
617                                     struct bset_tree *t,
618                                     unsigned offset)
619 {
620         unsigned l = 0, r = t->size;
621
622         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
623
624         while (l < r) {
625                 unsigned m = (l + r) >> 1;
626
627                 if (rw_aux_tree(b, t)[m].offset < offset)
628                         l = m + 1;
629                 else
630                         r = m;
631         }
632
633         EBUG_ON(l < t->size &&
634                 rw_aux_tree(b, t)[l].offset < offset);
635         EBUG_ON(l &&
636                 rw_aux_tree(b, t)[l - 1].offset >= offset);
637
638         EBUG_ON(l > r);
639         EBUG_ON(l > t->size);
640
641         return l;
642 }
643
644 static inline unsigned bfloat_mantissa(const struct bkey_float *f,
645                                        unsigned idx)
646 {
647         return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
648 }
649
650 static inline void bfloat_mantissa_set(struct bkey_float *f,
651                                        unsigned idx, unsigned mantissa)
652 {
653         if (idx < BFLOAT_32BIT_NR)
654                 f->mantissa32 = mantissa;
655         else
656                 f->mantissa16 = mantissa;
657 }
658
659 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
660                                      const struct bkey_float *f,
661                                      unsigned idx)
662 {
663         u64 v;
664
665         EBUG_ON(!bkey_packed(k));
666
667         v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
668
669         /*
670          * In little endian, we're shifting off low bits (and then the bits we
671          * want are at the low end), in big endian we're shifting off high bits
672          * (and then the bits we want are at the high end, so we shift them
673          * back down):
674          */
675 #ifdef __LITTLE_ENDIAN
676         v >>= f->exponent & 7;
677 #else
678         v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
679 #endif
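        /*
         * Editor's worked example (hypothetical value): with f->exponent == 13
         * the load above starts at key byte 13 >> 3 == 1, and on little endian
         * we then shift off the low 13 & 7 == 5 bits, so the mantissa starts
         * at key bit 13.
         */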
680         return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
681 }
682
683 static void make_bfloat(struct btree *b, struct bset_tree *t,
684                         unsigned j,
685                         struct bkey_packed *min_key,
686                         struct bkey_packed *max_key)
687 {
688         struct bkey_float *f = bkey_float(b, t, j);
689         struct bkey_packed *m = tree_to_bkey(b, t, j);
690         struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
691         struct bkey_packed *l, *r;
692         unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
693         unsigned mantissa;
694         int shift, exponent, high_bit;
695
696         EBUG_ON(bkey_next(p) != m);
697
698         if (is_power_of_2(j)) {
699                 l = min_key;
700
701                 if (!l->u64s) {
702                         if (!bkey_pack_pos(l, b->data->min_key, b)) {
703                                 struct bkey_i tmp;
704
705                                 bkey_init(&tmp.k);
706                                 tmp.k.p = b->data->min_key;
707                                 bkey_copy(l, &tmp);
708                         }
709                 }
710         } else {
711                 l = tree_to_prev_bkey(b, t, j >> ffs(j));
712
713                 EBUG_ON(m < l);
714         }
715
716         if (is_power_of_2(j + 1)) {
717                 r = max_key;
718
719                 if (!r->u64s) {
720                         if (!bkey_pack_pos(r, t->max_key, b)) {
721                                 struct bkey_i tmp;
722
723                                 bkey_init(&tmp.k);
724                                 tmp.k.p = t->max_key;
725                                 bkey_copy(r, &tmp);
726                         }
727                 }
728         } else {
729                 r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
730
731                 EBUG_ON(m > r);
732         }
733
734         /*
735          * for failed bfloats, the lookup code falls back to comparing against
736          * the original key.
737          */
738
739         if (!bkey_packed(l) || !bkey_packed(r) ||
740             !bkey_packed(p) || !bkey_packed(m) ||
741             !b->nr_key_bits) {
742                 f->exponent = BFLOAT_FAILED_UNPACKED;
743                 return;
744         }
745
746         /*
747          * The greatest differing bit of l and r is the first bit we must
748          * include in the bfloat mantissa we're creating in order to do
749          * comparisons - that bit always becomes the high bit of
750          * bfloat->mantissa, and thus the exponent we're calculating here is
751          * the position of what will become the low bit in bfloat->mantissa:
752          *
753          * Note that this may be negative - we may be running off the low end
754          * of the key: we handle this later:
755          */
756         high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
757                        min_t(unsigned, bits, b->nr_key_bits) - 1);
758         exponent = high_bit - (bits - 1);
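        /*
         * Editor's worked example (hypothetical numbers): with a 16 bit
         * mantissa and high_bit == 40, exponent == 40 - 15 == 25, so the
         * mantissa will hold key bits 25..40.
         */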
759
760         /*
761          * Then we calculate the actual shift value, from the start of the key
762          * (k->_data), to get the key bits starting at exponent:
763          */
764 #ifdef __LITTLE_ENDIAN
765         shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
766
767         EBUG_ON(shift + bits > b->format.key_u64s * 64);
768 #else
769         shift = high_bit_offset +
770                 b->nr_key_bits -
771                 exponent -
772                 bits;
773
774         EBUG_ON(shift < KEY_PACKED_BITS_START);
775 #endif
776         EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
777
778         f->exponent = shift;
779         mantissa = bkey_mantissa(m, f, j);
780
781         /*
782          * If we've got garbage bits, set them to all 1s - it's legal for the
783          * bfloat to compare larger than the original key, but not smaller:
784          */
785         if (exponent < 0)
786                 mantissa |= ~(~0U << -exponent);
787
788         bfloat_mantissa_set(f, j, mantissa);
789
790         /*
791          * The bfloat must be able to tell its key apart from the previous key -
792          * if its key and the previous key don't differ in the required bits,
793          * flag as failed - unless the keys are actually equal, in which case
794          * we aren't required to return a specific one:
795          */
796         if (exponent > 0 &&
797             bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
798             bkey_cmp_packed(b, p, m)) {
799                 f->exponent = BFLOAT_FAILED_PREV;
800                 return;
801         }
802
803         /*
804          * f->mantissa must compare >= the original key - for transitivity with
805          * the comparison in bset_search_tree. If we're dropping set bits,
806          * increment it:
807          */
808         if (exponent > (int) bch2_bkey_ffs(b, m)) {
809                 if (j < BFLOAT_32BIT_NR
810                     ? f->mantissa32 == U32_MAX
811                     : f->mantissa16 == U16_MAX)
812                         f->exponent = BFLOAT_FAILED_OVERFLOW;
813
814                 if (j < BFLOAT_32BIT_NR)
815                         f->mantissa32++;
816                 else
817                         f->mantissa16++;
818         }
819 }
820
821 /* bytes remaining - only valid for last bset: */
822 static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
823 {
824         bset_aux_tree_verify(b);
825
826         return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
827 }
828
829 static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
830 {
831         unsigned bytes = __bset_tree_capacity(b, t);
832
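        /*
         * Editor's note: each of the first BFLOAT_32BIT_NR nodes costs 7 bytes
         * (6 byte bkey_float + 1 byte prev-size entry), and each node after
         * that costs 5 bytes (4 + 1) - hence the 7 and 5 below.
         */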
833         if (bytes < 7 * BFLOAT_32BIT_NR)
834                 return bytes / 7;
835
836         bytes -= 7 * BFLOAT_32BIT_NR;
837
838         return BFLOAT_32BIT_NR + bytes / 5;
839 }
840
841 static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
842 {
843         return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
844 }
845
846 static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
847 {
848         struct bkey_packed *k;
849
850         t->size = 1;
851         t->extra = BSET_RW_AUX_TREE_VAL;
852         rw_aux_tree(b, t)[0].offset =
853                 __btree_node_key_to_offset(b, btree_bkey_first(b, t));
854
855         for (k = btree_bkey_first(b, t);
856              k != btree_bkey_last(b, t);
857              k = bkey_next(k)) {
858                 if (t->size == bset_rw_tree_capacity(b, t))
859                         break;
860
861                 if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
862                     L1_CACHE_BYTES)
863                         rw_aux_tree_set(b, t, t->size++, k);
864         }
865 }
866
867 static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
868 {
869         struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
870         struct bkey_packed min_key, max_key;
871         unsigned j, cacheline = 1;
872
873         /* signal to make_bfloat() that they're uninitialized: */
874         min_key.u64s = max_key.u64s = 0;
875
876         t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
877                       bset_ro_tree_capacity(b, t));
878 retry:
879         if (t->size < 2) {
880                 t->size = 0;
881                 t->extra = BSET_NO_AUX_TREE_VAL;
882                 return;
883         }
884
885         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
886
887         /* First we figure out where the first key in each cacheline is */
888         eytzinger1_for_each(j, t->size) {
889                 while (bkey_to_cacheline(b, t, k) < cacheline)
890                         prev = k, k = bkey_next(k);
891
892                 if (k >= btree_bkey_last(b, t)) {
893                         /* XXX: this path sucks */
894                         t->size--;
895                         goto retry;
896                 }
897
898                 ro_aux_tree_prev(b, t)[j] = prev->u64s;
899                 bkey_float(b, t, j)->key_offset =
900                         bkey_to_cacheline_offset(b, t, cacheline++, k);
901
902                 EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
903                 EBUG_ON(tree_to_bkey(b, t, j) != k);
904         }
905
906         while (bkey_next(k) != btree_bkey_last(b, t))
907                 k = bkey_next(k);
908
909         t->max_key = bkey_unpack_pos(b, k);
910
911         /* Then we build the tree */
912         eytzinger1_for_each(j, t->size)
913                 make_bfloat(b, t, j, &min_key, &max_key);
914 }
915
916 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
917 {
918         struct bset_tree *i;
919
920         for (i = b->set; i != t; i++)
921                 BUG_ON(bset_has_rw_aux_tree(i));
922
923         bch2_bset_set_no_aux_tree(b, t);
924
925         /* round up to next cacheline: */
926         t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
927                                       SMP_CACHE_BYTES / sizeof(u64));
928
929         bset_aux_tree_verify(b);
930 }
931
932 void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
933                              bool writeable)
934 {
935         if (writeable
936             ? bset_has_rw_aux_tree(t)
937             : bset_has_ro_aux_tree(t))
938                 return;
939
940         bset_alloc_tree(b, t);
941
942         if (!__bset_tree_capacity(b, t))
943                 return;
944
945         if (writeable)
946                 __build_rw_aux_tree(b, t);
947         else
948                 __build_ro_aux_tree(b, t);
949
950         bset_aux_tree_verify(b);
951 }
952
953 void bch2_bset_init_first(struct btree *b, struct bset *i)
954 {
955         struct bset_tree *t;
956
957         BUG_ON(b->nsets);
958
959         memset(i, 0, sizeof(*i));
960         get_random_bytes(&i->seq, sizeof(i->seq));
961         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
962
963         t = &b->set[b->nsets++];
964         set_btree_bset(b, t, i);
965 }
966
967 void bch2_bset_init_next(struct btree *b, struct bset *i)
968 {
969         struct bset_tree *t;
970
971         BUG_ON(b->nsets >= MAX_BSETS);
972
973         memset(i, 0, sizeof(*i));
974         i->seq = btree_bset_first(b)->seq;
975         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
976
977         t = &b->set[b->nsets++];
978         set_btree_bset(b, t, i);
979 }
980
981 static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
982                                        struct bkey_packed *k)
983 {
984         struct bkey_packed *p;
985         unsigned offset;
986         int j;
987
988         EBUG_ON(k < btree_bkey_first(b, t) ||
989                 k > btree_bkey_last(b, t));
990
991         if (k == btree_bkey_first(b, t))
992                 return NULL;
993
994         switch (bset_aux_tree_type(t)) {
995         case BSET_NO_AUX_TREE:
996                 p = btree_bkey_first(b, t);
997                 break;
998         case BSET_RO_AUX_TREE:
999                 j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
1000
1001                 do {
1002                         p = j ? tree_to_bkey(b, t,
1003                                         __inorder_to_eytzinger1(j--,
1004                                                         t->size, t->extra))
1005                               : btree_bkey_first(b, t);
1006                 } while (p >= k);
1007                 break;
1008         case BSET_RW_AUX_TREE:
1009                 offset = __btree_node_key_to_offset(b, k);
1010                 j = rw_aux_tree_bsearch(b, t, offset);
1011                 p = j ? rw_aux_to_bkey(b, t, j - 1)
1012                       : btree_bkey_first(b, t);
1013                 break;
1014         }
1015
1016         return p;
1017 }
1018
1019 struct bkey_packed *bch2_bkey_prev_all(struct btree *b, struct bset_tree *t,
1020                                        struct bkey_packed *k)
1021 {
1022         struct bkey_packed *p;
1023
1024         p = __bkey_prev(b, t, k);
1025         if (!p)
1026                 return NULL;
1027
1028         while (bkey_next(p) != k)
1029                 p = bkey_next(p);
1030
1031         return p;
1032 }
1033
1034 struct bkey_packed *bch2_bkey_prev(struct btree *b, struct bset_tree *t,
1035                                    struct bkey_packed *k)
1036 {
1037         while (1) {
1038                 struct bkey_packed *p, *i, *ret = NULL;
1039
1040                 p = __bkey_prev(b, t, k);
1041                 if (!p)
1042                         return NULL;
1043
1044                 for (i = p; i != k; i = bkey_next(i))
1045                         if (!bkey_deleted(i))
1046                                 ret = i;
1047
1048                 if (ret)
1049                         return ret;
1050
1051                 k = p;
1052         }
1053 }
1054
1055 /* Insert */
1056
1057 static void rw_aux_tree_fix_invalidated_key(struct btree *b,
1058                                             struct bset_tree *t,
1059                                             struct bkey_packed *k)
1060 {
1061         unsigned offset = __btree_node_key_to_offset(b, k);
1062         unsigned j = rw_aux_tree_bsearch(b, t, offset);
1063
1064         if (j < t->size &&
1065             rw_aux_tree(b, t)[j].offset == offset)
1066                 rw_aux_tree_set(b, t, j, k);
1067
1068         bch2_bset_verify_rw_aux_tree(b, t);
1069 }
1070
1071 static void ro_aux_tree_fix_invalidated_key(struct btree *b,
1072                                             struct bset_tree *t,
1073                                             struct bkey_packed *k)
1074 {
1075         struct bkey_packed min_key, max_key;
1076         unsigned inorder, j;
1077
1078         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
1079
1080         /* signal to make_bfloat() that they're uninitialized: */
1081         min_key.u64s = max_key.u64s = 0;
1082
1083         if (bkey_next(k) == btree_bkey_last(b, t)) {
1084                 t->max_key = bkey_unpack_pos(b, k);
1085
1086                 for (j = 1; j < t->size; j = j * 2 + 1)
1087                         make_bfloat(b, t, j, &min_key, &max_key);
1088         }
1089
1090         inorder = bkey_to_cacheline(b, t, k);
1091
1092         if (inorder &&
1093             inorder < t->size) {
1094                 j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1095
1096                 if (k == tree_to_bkey(b, t, j)) {
1097                         /* Fix the node this key corresponds to */
1098                         make_bfloat(b, t, j, &min_key, &max_key);
1099
1100                         /* Children for which this key is the right boundary */
1101                         for (j = eytzinger1_left_child(j);
1102                              j < t->size;
1103                              j = eytzinger1_right_child(j))
1104                                 make_bfloat(b, t, j, &min_key, &max_key);
1105                 }
1106         }
1107
1108         if (inorder + 1 < t->size) {
1109                 j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
1110
1111                 if (k == tree_to_prev_bkey(b, t, j)) {
1112                         make_bfloat(b, t, j, &min_key, &max_key);
1113
1114                         /* Children for which this key is the left boundary */
1115                         for (j = eytzinger1_right_child(j);
1116                              j < t->size;
1117                              j = eytzinger1_left_child(j))
1118                                 make_bfloat(b, t, j, &min_key, &max_key);
1119                 }
1120         }
1121 }
1122
1123 /**
1124  * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
1125  * modified, fix any auxiliary search tree by remaking all the nodes in the
1126  * auxiliary search tree that @k corresponds to
1127  */
1128 void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
1129                                    struct bkey_packed *k)
1130 {
1131         switch (bset_aux_tree_type(t)) {
1132         case BSET_NO_AUX_TREE:
1133                 break;
1134         case BSET_RO_AUX_TREE:
1135                 ro_aux_tree_fix_invalidated_key(b, t, k);
1136                 break;
1137         case BSET_RW_AUX_TREE:
1138                 rw_aux_tree_fix_invalidated_key(b, t, k);
1139                 break;
1140         }
1141 }
1142
1143 static void bch2_bset_fix_lookup_table(struct btree *b,
1144                                        struct bset_tree *t,
1145                                        struct bkey_packed *_where,
1146                                        unsigned clobber_u64s,
1147                                        unsigned new_u64s)
1148 {
1149         int shift = new_u64s - clobber_u64s;
1150         unsigned l, j, where = __btree_node_key_to_offset(b, _where);
1151
1152         EBUG_ON(bset_has_ro_aux_tree(t));
1153
1154         if (!bset_has_rw_aux_tree(t))
1155                 return;
1156
1157         l = rw_aux_tree_bsearch(b, t, where);
1158
1159         /* l is the first entry >= @where */
1160
1161         EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
1162         EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
1163
1164         if (!l) /* never delete first entry */
1165                 l++;
1166         else if (l < t->size &&
1167                  where < t->end_offset &&
1168                  rw_aux_tree(b, t)[l].offset == where)
1169                 rw_aux_tree_set(b, t, l++, _where);
1170
1171         /* l now > where */
1172
1173         for (j = l;
1174              j < t->size &&
1175              rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
1176              j++)
1177                 ;
1178
1179         if (j < t->size &&
1180             rw_aux_tree(b, t)[j].offset + shift ==
1181             rw_aux_tree(b, t)[l - 1].offset)
1182                 j++;
1183
1184         memmove(&rw_aux_tree(b, t)[l],
1185                 &rw_aux_tree(b, t)[j],
1186                 (void *) &rw_aux_tree(b, t)[t->size] -
1187                 (void *) &rw_aux_tree(b, t)[j]);
1188         t->size -= j - l;
1189
1190         for (j = l; j < t->size; j++)
1191                rw_aux_tree(b, t)[j].offset += shift;
1192
1193         EBUG_ON(l < t->size &&
1194                 rw_aux_tree(b, t)[l].offset ==
1195                 rw_aux_tree(b, t)[l - 1].offset);
1196
1197         if (t->size < bset_rw_tree_capacity(b, t) &&
1198             (l < t->size
1199              ? rw_aux_tree(b, t)[l].offset
1200              : t->end_offset) -
1201             rw_aux_tree(b, t)[l - 1].offset >
1202             L1_CACHE_BYTES / sizeof(u64)) {
1203                 struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
1204                 struct bkey_packed *end = l < t->size
1205                         ? rw_aux_to_bkey(b, t, l)
1206                         : btree_bkey_last(b, t);
1207                 struct bkey_packed *k = start;
1208
1209                 while (1) {
1210                         k = bkey_next(k);
1211                         if (k == end)
1212                                 break;
1213
1214                         if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
1215                                 memmove(&rw_aux_tree(b, t)[l + 1],
1216                                         &rw_aux_tree(b, t)[l],
1217                                         (void *) &rw_aux_tree(b, t)[t->size] -
1218                                         (void *) &rw_aux_tree(b, t)[l]);
1219                                 t->size++;
1220                                 rw_aux_tree_set(b, t, l, k);
1221                                 break;
1222                         }
1223                 }
1224         }
1225
1226         bch2_bset_verify_rw_aux_tree(b, t);
1227         bset_aux_tree_verify(b);
1228 }
1229
1230 void bch2_bset_insert(struct btree *b,
1231                       struct btree_node_iter *iter,
1232                       struct bkey_packed *where,
1233                       struct bkey_i *insert,
1234                       unsigned clobber_u64s)
1235 {
1236         struct bkey_format *f = &b->format;
1237         struct bset_tree *t = bset_tree_last(b);
1238         struct bkey_packed packed, *src = bkey_to_packed(insert);
1239
1240         bch2_bset_verify_rw_aux_tree(b, t);
1241
1242         if (bch2_bkey_pack_key(&packed, &insert->k, f))
1243                 src = &packed;
1244
1245         if (!bkey_whiteout(&insert->k))
1246                 btree_keys_account_key_add(&b->nr, t - b->set, src);
1247
1248         if (src->u64s != clobber_u64s) {
1249                 u64 *src_p = where->_data + clobber_u64s;
1250                 u64 *dst_p = where->_data + src->u64s;
1251
1252                 EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
1253                         (int) clobber_u64s - src->u64s);
1254
1255                 memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1256                 le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
1257                 set_btree_bset_end(b, t);
1258         }
1259
1260         memcpy_u64s(where, src,
1261                     bkeyp_key_u64s(f, src));
1262         memcpy_u64s(bkeyp_val(f, where), &insert->v,
1263                     bkeyp_val_u64s(f, src));
1264
1265         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
1266
1267         bch2_verify_key_order(b, iter, where);
1268         bch2_verify_btree_nr_keys(b);
1269 }
1270
1271 void bch2_bset_delete(struct btree *b,
1272                       struct bkey_packed *where,
1273                       unsigned clobber_u64s)
1274 {
1275         struct bset_tree *t = bset_tree_last(b);
1276         u64 *src_p = where->_data + clobber_u64s;
1277         u64 *dst_p = where->_data;
1278
1279         bch2_bset_verify_rw_aux_tree(b, t);
1280
1281         EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
1282
1283         memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1284         le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
1285         set_btree_bset_end(b, t);
1286
1287         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
1288 }
1289
1290 /* Lookup */
1291
1292 __flatten
1293 static struct bkey_packed *bset_search_write_set(const struct btree *b,
1294                                 struct bset_tree *t,
1295                                 struct bpos search,
1296                                 const struct bkey_packed *packed_search)
1297 {
1298         unsigned l = 0, r = t->size;
1299
1300         while (l + 1 != r) {
1301                 unsigned m = (l + r) >> 1;
1302
1303                 if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
1304                         l = m;
1305                 else
1306                         r = m;
1307         }
1308
1309         return rw_aux_to_bkey(b, t, l);
1310 }
1311
1312 noinline
1313 static int bset_search_tree_slowpath(const struct btree *b,
1314                                 struct bset_tree *t, struct bpos *search,
1315                                 const struct bkey_packed *packed_search,
1316                                 unsigned n)
1317 {
1318         return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
1319                                  packed_search, search) < 0;
1320 }
1321
1322 __flatten
1323 static struct bkey_packed *bset_search_tree(const struct btree *b,
1324                                 struct bset_tree *t,
1325                                 struct bpos search,
1326                                 const struct bkey_packed *packed_search)
1327 {
1328         struct ro_aux_tree *base = ro_aux_tree_base(b, t);
1329         struct bkey_float *f = bkey_float_get(base, 1);
1330         void *p;
1331         unsigned inorder, n = 1;
1332
1333         do {
1334                 if (likely(n << 4 < t->size)) {
1335                         p = bkey_float_get(base, n << 4);
1336                         prefetch(p);
1337                 } else if (n << 3 < t->size) {
1338                         inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
1339                         p = bset_cacheline(b, t, inorder);
1340 #ifdef CONFIG_X86_64
1341                         asm(".intel_syntax noprefix;"
1342                             "prefetcht0 [%0 - 127 + 64 * 0];"
1343                             "prefetcht0 [%0 - 127 + 64 * 1];"
1344                             "prefetcht0 [%0 - 127 + 64 * 2];"
1345                             "prefetcht0 [%0 - 127 + 64 * 3];"
1346                             ".att_syntax prefix;"
1347                             :
1348                             : "r" (p + 127));
1349 #else
1350                         prefetch(p + L1_CACHE_BYTES * 0);
1351                         prefetch(p + L1_CACHE_BYTES * 1);
1352                         prefetch(p + L1_CACHE_BYTES * 2);
1353                         prefetch(p + L1_CACHE_BYTES * 3);
1354 #endif
1355                 } else if (n >= t->size)
1356                         break;
1357
1358                 f = bkey_float_get(base, n);
1359
1360                 if (packed_search &&
1361                     likely(f->exponent < BFLOAT_FAILED))
1362                         n = n * 2 + (bfloat_mantissa(f, n) <
1363                                      bkey_mantissa(packed_search, f, n));
1364                 else
1365                         n = n * 2 + bset_search_tree_slowpath(b, t,
1366                                                 &search, packed_search, n);
1367         } while (n < t->size);
1368
1369         inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
1370
1371         /*
1372          * n would have been the node we recursed to - the low bit tells us if
1373          * we recursed left or recursed right.
1374          */
1375         if (n & 1) {
1376                 return cacheline_to_bkey(b, t, inorder, f->key_offset);
1377         } else {
1378                 if (--inorder) {
1379                         n = eytzinger1_prev(n >> 1, t->size);
1380                         f = bkey_float_get(base, n);
1381                         return cacheline_to_bkey(b, t, inorder, f->key_offset);
1382                 } else
1383                         return btree_bkey_first(b, t);
1384         }
1385 }
1386
1387 /*
1388  * Returns the first key greater than or equal to @search
1389  */
1390 __always_inline __flatten
1391 static struct bkey_packed *bch2_bset_search(struct btree *b,
1392                                 struct bset_tree *t,
1393                                 struct bpos search,
1394                                 struct bkey_packed *packed_search,
1395                                 const struct bkey_packed *lossy_packed_search,
1396                                 bool strictly_greater)
1397 {
1398         struct bkey_packed *m;
1399
1400         /*
1401          * First, we search for a cacheline, then we do a linear search
1402          * within that cacheline.
1403          *
1404          * To search for the cacheline, there are three different possibilities:
1405          *  * The set is too small to have a search tree, so we just do a linear
1406          *    search over the whole set.
1407          *  * The set is the one we're currently inserting into; keeping a full
1408          *    auxiliary search tree up to date would be too expensive, so we
1409          *    use a much simpler lookup table to do a binary search -
1410          *    bset_search_write_set().
1411          *  * Or we use the auxiliary search tree we constructed earlier -
1412          *    bset_search_tree()
1413          */
1414
1415         switch (bset_aux_tree_type(t)) {
1416         case BSET_NO_AUX_TREE:
1417                 m = btree_bkey_first(b, t);
1418                 break;
1419         case BSET_RW_AUX_TREE:
1420                 m = bset_search_write_set(b, t, search, lossy_packed_search);
1421                 break;
1422         case BSET_RO_AUX_TREE:
1423                 /*
1424                  * Each node in the auxiliary search tree covers a certain range
1425                  * of bits, and keys above and below the set it covers might
1426                  * differ outside those bits - so we have to special case the
1427                  * start and end - handle that here:
1428                  */
1429
1430                 if (bkey_cmp(search, t->max_key) > 0)
1431                         return btree_bkey_last(b, t);
1432
1433                 m = bset_search_tree(b, t, search, lossy_packed_search);
1434                 break;
1435         }
1436
1437         if (lossy_packed_search)
1438                 while (m != btree_bkey_last(b, t) &&
1439                        !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
1440                                                     m, strictly_greater))
1441                         m = bkey_next(m);
1442
1443         if (!packed_search)
1444                 while (m != btree_bkey_last(b, t) &&
1445                        !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
1446                         m = bkey_next(m);
1447
1448         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1449                 struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
1450
1451                 BUG_ON(prev &&
1452                        btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
1453                                                    prev, strictly_greater));
1454         }
1455
1456         return m;
1457 }
1458
1459 /* Btree node iterator */
1460
1461 void bch2_btree_node_iter_push(struct btree_node_iter *iter,
1462                                struct btree *b,
1463                                const struct bkey_packed *k,
1464                                const struct bkey_packed *end)
1465 {
1466         __bch2_btree_node_iter_push(iter, b, k, end);
1467         bch2_btree_node_iter_sort(iter, b);
1468 }
1469
1470 noinline __flatten __attribute__((cold))
1471 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
1472                               struct btree *b, struct bpos search,
1473                               bool strictly_greater, bool is_extents)
1474 {
1475         struct bset_tree *t;
1476
1477         trace_bkey_pack_pos_fail(search);
1478
1479         for_each_bset(b, t)
1480                 __bch2_btree_node_iter_push(iter, b,
1481                         bch2_bset_search(b, t, search, NULL, NULL,
1482                                         strictly_greater),
1483                         btree_bkey_last(b, t));
1484
1485         bch2_btree_node_iter_sort(iter, b);
1486 }
1487
1488 /**
1489  * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
1490  * given position
1491  *
1492  * Main entry point to the lookup code for individual btree nodes:
1493  *
1494  * NOTE:
1495  *
1496  * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
1497  * keys. This doesn't matter for most code, but it does matter for lookups.
1498  *
1499  * Consider a run of adjacent keys containing a string of equal keys:
1500  *      i j k k k k l m
1501  *
1502  * If you search for k, the lookup code isn't guaranteed to return you any
1503  * specific k. The lookup code is conceptually doing a binary search and
1504  * iterating backwards is very expensive so if the pivot happens to land at the
1505  * last k that's what you'll get.
1506  *
1507  * This works out ok, but it's something to be aware of:
1508  *
1509  *  - For non extents, we guarantee that the live key comes last - see
1510  *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
1511  *    see will only be deleted keys you don't care about.
1512  *
1513  *  - For extents, deleted keys sort last (see the comment at the top of this
1514  *    file). But when you're searching for extents, you actually want the first
1515  *    key strictly greater than your search key - an extent that compares equal
1516  *    to the search key is going to have 0 sectors after the search key.
1517  *
1518  *    But this does mean that we can't just search for
1519  *    bkey_successor(start_of_range) to get the first extent that overlaps with
1520  *    the range we want - if we're unlucky and there's an extent that ends
1521  *    exactly where we searched, then there could be a deleted key at the same
1522  *    position and we'd get that when we search instead of the preceding extent
1523  *    we needed.
1524  *
1525  *    So we've got to search for start_of_range, then after the lookup iterate
1526  *    past any extents that compare equal to the position we searched for.
1527  */
1528 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
1529                                struct btree *b, struct bpos search,
1530                                bool strictly_greater, bool is_extents)
1531 {
1532         struct bset_tree *t;
1533         struct bkey_packed p, *packed_search = NULL;
1534
1535         EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
1536         bset_aux_tree_verify(b);
1537
1538         __bch2_btree_node_iter_init(iter, is_extents);
1539
1540         switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
1541         case BKEY_PACK_POS_EXACT:
1542                 packed_search = &p;
1543                 break;
1544         case BKEY_PACK_POS_SMALLER:
1545                 packed_search = NULL;
1546                 break;
1547         case BKEY_PACK_POS_FAIL:
1548                 btree_node_iter_init_pack_failed(iter, b, search,
1549                                         strictly_greater, is_extents);
1550                 return;
1551         }
1552
1553         for_each_bset(b, t)
1554                 __bch2_btree_node_iter_push(iter, b,
1555                                            bch2_bset_search(b, t, search,
1556                                                            packed_search, &p,
1557                                                            strictly_greater),
1558                                            btree_bkey_last(b, t));
1559
1560         bch2_btree_node_iter_sort(iter, b);
1561 }
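
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * positioning a node iterator at a search key and returning the first key the
 * lookup lands on. example_node_lookup() is hypothetical; the calls it makes
 * are the ones defined in this file. Following the note above, we ask for
 * strictly-greater when searching extents.
 */
static struct bkey_packed *example_node_lookup(struct btree *b,
                                               struct bpos search,
                                               bool is_extents)
{
        struct btree_node_iter iter;

        bch2_btree_node_iter_init(&iter, b, search, is_extents, is_extents);

        /* NULL if every key in the node sorts before @search: */
        return bch2_btree_node_iter_peek_all(&iter, b);
}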
1562
1563 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1564                                           struct btree *b,
1565                                           bool is_extents)
1566 {
1567         struct bset_tree *t;
1568
1569         __bch2_btree_node_iter_init(iter, is_extents);
1570
1571         for_each_bset(b, t)
1572                 __bch2_btree_node_iter_push(iter, b,
1573                                            btree_bkey_first(b, t),
1574                                            btree_bkey_last(b, t));
1575         bch2_btree_node_iter_sort(iter, b);
1576 }
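
/*
 * Illustrative sketch (added commentary, not in the original source): walking
 * every key in a node from the start. example_walk_node() and its is_extents
 * parameter are hypothetical; the iterator calls are the ones defined above.
 */
static void example_walk_node(struct btree *b, bool is_extents)
{
        struct btree_node_iter iter;
        struct bkey_packed *k;

        bch2_btree_node_iter_init_from_start(&iter, b, is_extents);

        while ((k = bch2_btree_node_iter_peek_all(&iter, b))) {
                /* visit @k - deleted keys included, hence _peek_all(): */
                bch2_btree_node_iter_advance(&iter, b);
        }
}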
1577
1578 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1579                                                   struct btree *b,
1580                                                   struct bset_tree *t)
1581 {
1582         struct btree_node_iter_set *set;
1583
1584         btree_node_iter_for_each(iter, set)
1585                 if (set->end == t->end_offset)
1586                         return __btree_node_offset_to_key(b, set->k);
1587
1588         return btree_bkey_last(b, t);
1589 }
1590
1591 static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
1592                                             struct btree *b,
1593                                             unsigned first)
1594 {
1595         bool ret;
1596
1597         if ((ret = (btree_node_iter_cmp(iter, b,
1598                                         iter->data[first],
1599                                         iter->data[first + 1]) > 0)))
1600                 swap(iter->data[first], iter->data[first + 1]);
1601         return ret;
1602 }
1603
1604 void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
1605                                struct btree *b)
1606 {
1607         /* unrolled bubble sort over the iterator's (at most three) sets: */
1608
1609         if (!__btree_node_iter_set_end(iter, 2)) {
1610                 btree_node_iter_sort_two(iter, b, 0);
1611                 btree_node_iter_sort_two(iter, b, 1);
1612         }
1613
1614         if (!__btree_node_iter_set_end(iter, 1))
1615                 btree_node_iter_sort_two(iter, b, 0);
1616 }
1617
1618 void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
1619                                    struct btree_node_iter_set *set)
1620 {
1621         struct btree_node_iter_set *last =
1622                 iter->data + ARRAY_SIZE(iter->data) - 1;
1623
1624         memmove(&set[0], &set[1], (void *) last - (void *) set);
1625         *last = (struct btree_node_iter_set) { 0, 0 };
1626 }
1627
1628 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1629                                                   struct btree *b)
1630 {
1631         iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
1632
1633         EBUG_ON(iter->data->k > iter->data->end);
1634
1635         if (unlikely(__btree_node_iter_set_end(iter, 0))) {
1636                 bch2_btree_node_iter_set_drop(iter, iter->data);
1637                 return;
1638         }
1639
1640         if (__btree_node_iter_set_end(iter, 1))
1641                 return;
1642
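        /*
         * Added commentary (not in the original source): only the key in
         * slot 0 changed, so sinking it with adjacent compares - at most two
         * swaps - is enough to restore sort order:
         */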
1643         if (!btree_node_iter_sort_two(iter, b, 0))
1644                 return;
1645
1646         if (__btree_node_iter_set_end(iter, 2))
1647                 return;
1648
1649         btree_node_iter_sort_two(iter, b, 1);
1650 }
1651
1652 /**
1653  * bch2_btree_node_iter_advance - advance @iter by one key
1654  *
1655  * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a bset
1656  * might momentarily have out of order extents.
1657  */
1658 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1659                                   struct btree *b)
1660 {
1661 #ifdef CONFIG_BCACHEFS_DEBUG
1662         struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
1663
1664         __bch2_btree_node_iter_advance(iter, b);
1665         bch2_btree_node_iter_next_check(iter, b, k);
1666 #else
1667         __bch2_btree_node_iter_advance(iter, b);
1668 #endif
1669 }
1670
1671 static inline bool __btree_node_iter_used(struct btree_node_iter *iter)
1672 {
1673         unsigned n = ARRAY_SIZE(iter->data);
1674
1675         while (n && __btree_node_iter_set_end(iter, n - 1))
1676                 --n;
1677
1678         return n;
1679 }
1680
1681 /*
1682  * Expensive: re-scans part of each bset, since packed keys can't be walked backwards:
1683  */
1684 struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
1685                                                   struct btree *b)
1686 {
1687         struct bkey_packed *k, *prev = NULL;
1688         struct btree_node_iter_set *set;
1689         struct bset_tree *t;
1690         struct bset_tree *prev_t;
1691         unsigned end, used;
1692
1693         bch2_btree_node_iter_verify(iter, b);
1694
1695         for_each_bset(b, t) {
1696                 k = bch2_bkey_prev_all(b, t,
1697                         bch2_btree_node_iter_bset_pos(iter, b, t));
1698                 if (k &&
1699                     (!prev || __btree_node_iter_cmp(iter->is_extents, b,
1700                                                     k, prev) > 0)) {
1701                         prev = k;
1702                         prev_t = t;
1703                 }
1704         }
1705
1706         if (!prev)
1707                 return NULL;
1708
1709         /*
1710          * We're manually memmoving instead of just calling sort() to ensure the
1711          * prev we picked ends up in slot 0 - sort won't necessarily put it
1712          * there because of duplicate deleted keys:
1713          */
1714         end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t));
1715         btree_node_iter_for_each(iter, set)
1716                 if (set->end == end) {
1717                         memmove(&iter->data[1],
1718                                 &iter->data[0],
1719                                 (void *) set - (void *) &iter->data[0]);
1720                         goto out;
1721                 }
1722
1723         used = __btree_node_iter_used(iter);
1724         BUG_ON(used >= ARRAY_SIZE(iter->data));
1725
1726         memmove(&iter->data[1],
1727                 &iter->data[0],
1728                 (void *) &iter->data[used] - (void *) &iter->data[0]);
1729 out:
1730         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1731         iter->data[0].end = end;
1732         return prev;
1733 }
1734
1735 struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
1736                                               struct btree *b)
1737 {
1738         struct bkey_packed *k;
1739
1740         do {
1741                 k = bch2_btree_node_iter_prev_all(iter, b);
1742         } while (k && bkey_deleted(k));
1743
1744         return k;
1745 }
1746
1747 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1748                                                  struct btree *b,
1749                                                  struct bkey *u)
1750 {
1751         struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
1752
1753         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1754 }
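
/*
 * Illustrative sketch (added commentary, not in the original source): reading
 * the key at the current iterator position in unpacked form.
 * example_print_current_key() is hypothetical; bkey_s_c and the peek_unpack
 * helper are the ones used above.
 */
static void example_print_current_key(struct btree_node_iter *iter,
                                      struct btree *b)
{
        struct bkey u;
        struct bkey_s_c k = bch2_btree_node_iter_peek_unpack(iter, b, &u);

        /* k.k is NULL once the iterator is exhausted: */
        if (k.k)
                pr_debug("pos %llu:%llu\n", k.k->p.inode, k.k->p.offset);
}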
1755
1756 /* Mergesort */
1757
1758 void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1759 {
1760         struct bset_tree *t;
1761
1762         for_each_bset(b, t) {
1763                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1764                 size_t j;
1765
1766                 stats->sets[type].nr++;
1767                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1768                         sizeof(u64);
1769
1770                 if (bset_has_ro_aux_tree(t)) {
1771                         stats->floats += t->size - 1;
1772
1773                         for (j = 1; j < t->size; j++)
1774                                 switch (bkey_float(b, t, j)->exponent) {
1775                                 case BFLOAT_FAILED_UNPACKED:
1776                                         stats->failed_unpacked++;
1777                                         break;
1778                                 case BFLOAT_FAILED_PREV:
1779                                         stats->failed_prev++;
1780                                         break;
1781                                 case BFLOAT_FAILED_OVERFLOW:
1782                                         stats->failed_overflow++;
1783                                         break;
1784                                 }
1785                 }
1786         }
1787 }
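
/*
 * Illustrative sketch (added commentary, not in the original source):
 * collecting stats for a single node into a zeroed bset_stats and reporting a
 * couple of fields. example_node_stats() is hypothetical; the field names are
 * taken from their use in bch2_btree_keys_stats() above.
 */
static void example_node_stats(struct btree *b)
{
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));
        bch2_btree_keys_stats(b, &stats);

        pr_debug("floats %zu, failed unpacked %zu\n",
                 (size_t) stats.floats, (size_t) stats.failed_unpacked);
}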
1788
1789 int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
1790                            char *buf, size_t size)
1791 {
1792         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1793         struct bkey_packed *l, *r, *p;
1794         struct bkey uk, up;
1795         char buf1[200], buf2[200];
1796         unsigned j;
1797
1798         if (!size)
1799                 return 0;
1800
1801         if (!bset_has_ro_aux_tree(t))
1802                 goto out;
1803
1804         j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra);
1805         if (j &&
1806             j < t->size &&
1807             k == tree_to_bkey(b, t, j))
1808                 switch (bkey_float(b, t, j)->exponent) {
1809                 case BFLOAT_FAILED_UNPACKED:
1810                         uk = bkey_unpack_key(b, k);
1811                         return scnprintf(buf, size,
1812                                          "    failed unpacked at depth %u\n"
1813                                          "\t%llu:%llu\n",
1814                                          ilog2(j),
1815                                          uk.p.inode, uk.p.offset);
1816                 case BFLOAT_FAILED_PREV:
1817                         p = tree_to_prev_bkey(b, t, j);
1818                         l = is_power_of_2(j)
1819                                 ? btree_bkey_first(b, t)
1820                                 : tree_to_prev_bkey(b, t, j >> ffs(j));
1821                         r = is_power_of_2(j + 1)
1822                                 ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
1823                                 : tree_to_bkey(b, t, j >> (ffz(j) + 1));
1824
1825                         up = bkey_unpack_key(b, p);
1826                         uk = bkey_unpack_key(b, k);
1827                         bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
1828                         bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
1829
1830                         return scnprintf(buf, size,
1831                                          "    failed prev at depth %u\n"
1832                                          "\tkey starts at bit %u but first differing bit at %u\n"
1833                                          "\t%llu:%llu\n"
1834                                          "\t%llu:%llu\n"
1835                                          "\t%s\n"
1836                                          "\t%s\n",
1837                                          ilog2(j),
1838                                          bch2_bkey_greatest_differing_bit(b, l, r),
1839                                          bch2_bkey_greatest_differing_bit(b, p, k),
1840                                          uk.p.inode, uk.p.offset,
1841                                          up.p.inode, up.p.offset,
1842                                          buf1, buf2);
1843                 case BFLOAT_FAILED_OVERFLOW:
1844                         uk = bkey_unpack_key(b, k);
1845                         return scnprintf(buf, size,
1846                                          "    failed overflow at depth %u\n"
1847                                          "\t%llu:%llu\n",
1848                                          ilog2(j),
1849                                          uk.p.inode, uk.p.offset);
1850                 }
1851 out:
1852         *buf = '\0';
1853         return 0;
1854 }