1 /*
2  * Code for working with individual keys, and sorted sets of keys within a
3  * btree node
4  *
5  * Copyright 2012 Google, Inc.
6  */
7
8 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
9
10 #include "eytzinger.h"
11 #include "util.h"
12 #include "bset.h"
13
14 #include <asm/unaligned.h>
15 #include <linux/dynamic_fault.h>
16 #include <linux/console.h>
17 #include <linux/random.h>
18 #include <linux/prefetch.h>
19
20 /* hack.. */
21 #include "alloc_types.h"
22 #include <trace/events/bcache.h>
23
24 struct bset_tree *bch_bkey_to_bset(struct btree *b, struct bkey_packed *k)
25 {
26         struct bset_tree *t;
27
28         for_each_bset(b, t)
29                 if (k >= btree_bkey_first(b, t) &&
30                     k < btree_bkey_last(b, t))
31                         return t;
32
33         BUG();
34 }
35
36 /*
37  * There are never duplicate live keys in the btree - but including keys that
38  * have been flagged as deleted (and will be cleaned up later) we _will_ see
39  * duplicates.
40  *
41  * Thus the sort order is: usual key comparison first, but for keys that compare
42  * equal the deleted key(s) come first, and the (at most one) live version comes
43  * last.
44  *
45  * The main reason for this is insertion: to handle overwrites, we first iterate
46  * over keys that compare equal to our insert key, and then insert immediately
47  * prior to the first key greater than the key we're inserting - our insert
48  * position will be after all keys that compare equal to our insert key, which
49  * by the time we actually do the insert will all be deleted.
50  */
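/*
 * For example, with keys at position K followed by a strictly greater key Q:
 *
 *      K (deleted), K (deleted), K (live), Q
 *
 * a new key at position K is inserted just before Q - i.e. after all the keys
 * that compare equal to it, which (since the insert overwrites the old live
 * version) will all be deleted by the time the insert actually happens.
 */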
51
52 void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
53 {
54         struct bkey_packed *_k, *_n;
55         struct bkey k, n;
56         char buf[120];
57
58         if (!i->u64s)
59                 return;
60
61         for (_k = i->start, k = bkey_unpack_key(b, _k);
62              _k < vstruct_last(i);
63              _k = _n, k = n) {
64                 _n = bkey_next(_k);
65
66                 bch_bkey_to_text(buf, sizeof(buf), &k);
67                 printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
68                        _k->_data - i->_data, i->u64s, buf);
69
70                 if (_n == vstruct_last(i))
71                         continue;
72
73                 n = bkey_unpack_key(b, _n);
74
75                 if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
76                         printk(KERN_ERR "Key skipped backwards\n");
77                         continue;
78                 }
79
80                 /*
81                  * Weird check for duplicate non-extent keys: extents are
82                  * deleted iff they have 0 size, so if a key has zero size
83                  * and isn't deleted, these keys aren't extents:
84                  */
85                 if (((!k.size && !bkey_deleted(&k)) ||
86                      (!n.size && !bkey_deleted(&n))) &&
87                     !bkey_deleted(&k) &&
88                     !bkey_cmp(n.p, k.p))
89                         printk(KERN_ERR "Duplicate keys\n");
90         }
91 }
92
93 void bch_dump_btree_node(struct btree *b)
94 {
95         struct bset_tree *t;
96
97         console_lock();
98         for_each_bset(b, t)
99                 bch_dump_bset(b, bset(b, t), t - b->set);
100         console_unlock();
101 }
102
103 void bch_dump_btree_node_iter(struct btree *b,
104                               struct btree_node_iter *iter)
105 {
106         struct btree_node_iter_set *set;
107
108         printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
109
110         btree_node_iter_for_each(iter, set) {
111                 struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
112                 struct bset_tree *t = bch_bkey_to_bset(b, k);
113                 struct bkey uk = bkey_unpack_key(b, k);
114                 char buf[100];
115
116                 bch_bkey_to_text(buf, sizeof(buf), &uk);
117                 printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
118                        k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
119         }
120 }
121
122 #ifdef CONFIG_BCACHE_DEBUG
123
124 static bool keys_out_of_order(struct btree *b,
125                               const struct bkey_packed *prev,
126                               const struct bkey_packed *next,
127                               bool is_extents)
128 {
129         struct bkey nextu = bkey_unpack_key(b, next);
130
131         return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
132                 ((is_extents
133                   ? !bkey_deleted(next)
134                   : !bkey_deleted(prev)) &&
135                  !bkey_cmp_packed(b, prev, next));
136 }
137
138 void __bch_verify_btree_nr_keys(struct btree *b)
139 {
140         struct bset_tree *t;
141         struct bkey_packed *k;
142         struct btree_nr_keys nr = { 0 };
143
144         for_each_bset(b, t)
145                 for (k = btree_bkey_first(b, t);
146                      k != btree_bkey_last(b, t);
147                      k = bkey_next(k))
148                         if (!bkey_whiteout(k))
149                                 btree_keys_account_key_add(&nr, t - b->set, k);
150
151         BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
152 }
153
154 static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
155                                            struct btree *b,
156                                            struct bkey_packed *k)
157 {
158         const struct bkey_packed *n = bch_btree_node_iter_peek_all(iter, b);
159
160         bkey_unpack_key(b, k);
161
162         if (n &&
163             keys_out_of_order(b, k, n, iter->is_extents)) {
164                 struct bkey ku = bkey_unpack_key(b, k);
165                 struct bkey nu = bkey_unpack_key(b, n);
166                 char buf1[80], buf2[80];
167
168                 bch_dump_btree_node(b);
169                 bch_bkey_to_text(buf1, sizeof(buf1), &ku);
170                 bch_bkey_to_text(buf2, sizeof(buf2), &nu);
171                 panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
172         }
173 }
174
175 void bch_btree_node_iter_verify(struct btree_node_iter *iter,
176                                 struct btree *b)
177 {
178         struct btree_node_iter_set *set;
179         struct bset_tree *t;
180         struct bkey_packed *k, *first;
181
182         BUG_ON(iter->used > MAX_BSETS);
183
184         if (!iter->used)
185                 return;
186
187         btree_node_iter_for_each(iter, set) {
188                 k = __btree_node_offset_to_key(b, set->k);
189                 t = bch_bkey_to_bset(b, k);
190
191                 BUG_ON(__btree_node_offset_to_key(b, set->end) !=
192                        btree_bkey_last(b, t));
193
194                 BUG_ON(set + 1 < iter->data + iter->used &&
195                        btree_node_iter_cmp(iter, b, set[0], set[1]) > 0);
196         }
197
198         first = __btree_node_offset_to_key(b, iter->data[0].k);
199
200         for_each_bset(b, t)
201                 if (bch_btree_node_iter_bset_pos(iter, b, t) ==
202                     btree_bkey_last(b, t) &&
203                     (k = bkey_prev_all(b, t, btree_bkey_last(b, t))))
204                         BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
205                                                      k, first) > 0);
206 }
207
208 void bch_verify_key_order(struct btree *b,
209                           struct btree_node_iter *iter,
210                           struct bkey_packed *where)
211 {
212         struct bset_tree *t = bch_bkey_to_bset(b, where);
213         struct bkey_packed *k, *prev;
214         struct bkey uk, uw = bkey_unpack_key(b, where);
215
216         k = bkey_prev_all(b, t, where);
217         if (k &&
218             keys_out_of_order(b, k, where, iter->is_extents)) {
219                 char buf1[100], buf2[100];
220
221                 bch_dump_btree_node(b);
222                 uk = bkey_unpack_key(b, k);
223                 bch_bkey_to_text(buf1, sizeof(buf1), &uk);
224                 bch_bkey_to_text(buf2, sizeof(buf2), &uw);
225                 panic("out of order with prev:\n%s\n%s\n",
226                       buf1, buf2);
227         }
228
229         k = bkey_next(where);
230         BUG_ON(k != btree_bkey_last(b, t) &&
231                keys_out_of_order(b, where, k, iter->is_extents));
232
233         for_each_bset(b, t) {
234                 if (where >= btree_bkey_first(b, t) &&
235                     where < btree_bkey_last(b, t))
236                         continue;
237
238                 k = bch_btree_node_iter_bset_pos(iter, b, t);
239
240                 if (k == btree_bkey_last(b, t))
241                         k = bkey_prev_all(b, t, k);
242
243                 while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
244                        (prev = bkey_prev_all(b, t, k)))
245                         k = prev;
246
247                 for (;
248                      k != btree_bkey_last(b, t);
249                      k = bkey_next(k)) {
250                         uk = bkey_unpack_key(b, k);
251
252                         if (iter->is_extents) {
253                                 BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
254                                          bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
255                         } else {
256                                 BUG_ON(!bkey_cmp(uw.p, uk.p) &&
257                                        !bkey_deleted(&uk));
258                         }
259
260                         if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
261                                 break;
262                 }
263         }
264 }
265
266 #else
267
268 static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
269                                            struct btree *b,
270                                            struct bkey_packed *k) {}
271
272 #endif
273
274 /* Auxiliary search trees */
275
276 #define BFLOAT_FAILED_UNPACKED  (U8_MAX - 0)
277 #define BFLOAT_FAILED_PREV      (U8_MAX - 1)
278 #define BFLOAT_FAILED_OVERFLOW  (U8_MAX - 2)
279 #define BFLOAT_FAILED           (U8_MAX - 2)
280
281 #define KEY_WORDS               BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
282
283 struct bkey_float {
284         u8              exponent;
285         u8              key_offset;
286         union {
287                 u32     mantissa32;
288         struct {
289                 u16     mantissa16;
290                 u16     _pad;
291         };
292         };
293 } __packed;
294
295 #define BFLOAT_32BIT_NR         32U
296
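/*
 * struct bkey_float is 6 bytes when packed: the first BFLOAT_32BIT_NR entries
 * of the array use the full 32 bit mantissa (6 bytes each), the rest only use
 * the 16 bit mantissa (4 bytes each). bkey_float_byte_offset() returns the
 * byte offset of entry @idx under that layout - the d &= ~(d >> 31) below is
 * a branchless max(d, 0), so the correction is only applied for entries past
 * BFLOAT_32BIT_NR.
 */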
297 static unsigned bkey_float_byte_offset(unsigned idx)
298 {
299         int d = (idx - BFLOAT_32BIT_NR) << 1;
300
301         d &= ~(d >> 31);
302
303         return idx * 6 - d;
304 }
305
306 struct ro_aux_tree {
307         struct bkey_float       _d[0];
308 };
309
310 struct rw_aux_tree {
311         u16             offset;
312         struct bpos     k;
313 };
314
315 /*
316  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
317  * it used to be 64, but I realized the lookup code would touch slightly less
318  * memory if it was 128.
319  *
320  * It defines the number of bytes (in struct bset) per struct bkey_float in
321  * the auxiliary search tree - when we're done searching the bset_float tree we
322  * have this many bytes left that we do a linear search over.
323  *
324  * Since (after level 5) every level of the bset_tree is on a new cacheline,
325  * we're touching one fewer cacheline in the bset tree in exchange for one more
326  * cacheline in the linear search - but the linear search might stop before it
327  * gets to the second cacheline.
328  */
329
330 #define BSET_CACHELINE          128
331
332 /* Space required for the btree node keys */
333 static inline size_t btree_keys_bytes(struct btree *b)
334 {
335         return PAGE_SIZE << b->page_order;
336 }
337
338 static inline size_t btree_keys_cachelines(struct btree *b)
339 {
340         return btree_keys_bytes(b) / BSET_CACHELINE;
341 }
342
343 static inline size_t btree_aux_data_bytes(struct btree *b)
344 {
345         return btree_keys_cachelines(b) * 8;
346 }
347
348 static inline size_t btree_aux_data_u64s(struct btree *b)
349 {
350         return btree_aux_data_bytes(b) / sizeof(u64);
351 }
352
353 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
354 {
355         BUG_ON(t->aux_data_offset == U16_MAX);
356
357         switch (bset_aux_tree_type(t)) {
358         case BSET_NO_AUX_TREE:
359                 return t->aux_data_offset;
360         case BSET_RO_AUX_TREE:
361                 return t->aux_data_offset +
362                         DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
363                                      sizeof(u8) * t->size, 8);
364         case BSET_RW_AUX_TREE:
365                 return t->aux_data_offset +
366                         DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
367         default:
368                 BUG();
369         }
370 }
371
372 static unsigned bset_aux_tree_buf_start(const struct btree *b,
373                                         const struct bset_tree *t)
374 {
375         return t == b->set
376                 ? DIV_ROUND_UP(b->unpack_fn_len, 8)
377                 : bset_aux_tree_buf_end(t - 1);
378 }
379
380 static void *__aux_tree_base(const struct btree *b,
381                              const struct bset_tree *t)
382 {
383         return b->aux_data + t->aux_data_offset * 8;
384 }
385
386 static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
387                                             const struct bset_tree *t)
388 {
389         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
390
391         return __aux_tree_base(b, t);
392 }
393
394 static u8 *ro_aux_tree_prev(const struct btree *b,
395                             const struct bset_tree *t)
396 {
397         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
398
399         return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
400 }
401
402 static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
403                                          unsigned idx)
404 {
405         return (void *) b + bkey_float_byte_offset(idx);
406 }
407
408 static struct bkey_float *bkey_float(const struct btree *b,
409                                      const struct bset_tree *t,
410                                      unsigned idx)
411 {
412         return bkey_float_get(ro_aux_tree_base(b, t), idx);
413 }
414
415 static void bset_aux_tree_verify(struct btree *b)
416 {
417 #ifdef CONFIG_BCACHE_DEBUG
418         struct bset_tree *t;
419
420         for_each_bset(b, t) {
421                 if (t->aux_data_offset == U16_MAX)
422                         continue;
423
424                 BUG_ON(t != b->set &&
425                        t[-1].aux_data_offset == U16_MAX);
426
427                 BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
428                 BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
429                 BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
430         }
431 #endif
432 }
433
434 /* Memory allocation */
435
436 void bch_btree_keys_free(struct btree *b)
437 {
438         vfree(b->aux_data);
439         b->aux_data = NULL;
440 }
441
442 int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
443 {
444         b->page_order   = page_order;
445         b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
446                                     PAGE_KERNEL_EXEC);
447         if (!b->aux_data)
448                 return -ENOMEM;
449
450         return 0;
451 }
452
453 void bch_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
454 {
455         unsigned i;
456
457         b->nsets                = 0;
458         memset(&b->nr, 0, sizeof(b->nr));
459 #ifdef CONFIG_BCACHE_DEBUG
460         b->expensive_debug_checks = expensive_debug_checks;
461 #endif
462         for (i = 0; i < MAX_BSETS; i++)
463                 b->set[i].data_offset = U16_MAX;
464
465         bch_bset_set_no_aux_tree(b, b->set);
466 }
467
468 /* Binary tree stuff for auxiliary search trees */
469
470 /*
471  * Cacheline/offset <-> bkey pointer arithmetic:
472  *
473  * t->tree is a binary search tree in an array; each node corresponds to a key
474  * in one cacheline in t->set (BSET_CACHELINE bytes).
475  *
476  * This means we don't have to store the full index of the key that a node in
477  * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and
478  * then bkey_float->m gives us the offset within that cacheline, in units of 8
479  * then bkey_float->key_offset gives us the offset within that cacheline, in units of 8
480  *
481  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
482  * make this work.
483  *
484  * To construct the bfloat for an arbitrary key we need to know what the key
485  * immediately preceding it is: we have to check if the two keys differ in the
486  * bits we're going to store in bkey_float->mantissa. ro_aux_tree_prev() stores
487  * the size of the previous key so we can walk backwards to it from the node's key.
488  */
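/*
 * For example, to find the key for auxiliary tree node j:
 * __eytzinger_to_inorder(j, ...) gives the cacheline number c, and
 * bkey_float(b, t, j)->key_offset gives the offset within that cacheline in
 * u64s, so the key lives at bset_cacheline(b, t, c) + key_offset * 8 - which
 * is exactly what cacheline_to_bkey()/tree_to_bkey() below compute.
 */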
489
490 static inline void *bset_cacheline(const struct btree *b,
491                                    const struct bset_tree *t,
492                                    unsigned cacheline)
493 {
494         return (void *) round_down((unsigned long) btree_bkey_first(b, t),
495                                    L1_CACHE_BYTES) +
496                 cacheline * BSET_CACHELINE;
497 }
498
499 static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
500                                              const struct bset_tree *t,
501                                              unsigned cacheline,
502                                              unsigned offset)
503 {
504         return bset_cacheline(b, t, cacheline) + offset * 8;
505 }
506
507 static unsigned bkey_to_cacheline(const struct btree *b,
508                                   const struct bset_tree *t,
509                                   const struct bkey_packed *k)
510 {
511         return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
512 }
513
514 static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
515                                           const struct bset_tree *t,
516                                           unsigned cacheline,
517                                           const struct bkey_packed *k)
518 {
519         return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
520 }
521
522 static unsigned bkey_to_cacheline_offset(const struct btree *b,
523                                          const struct bset_tree *t,
524                                          unsigned cacheline,
525                                          const struct bkey_packed *k)
526 {
527         size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
528
529         EBUG_ON(m > U8_MAX);
530         return m;
531 }
532
533 static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
534                                                const struct bset_tree *t,
535                                                unsigned j)
536 {
537         return cacheline_to_bkey(b, t,
538                         __eytzinger_to_inorder(j, t->size, t->extra),
539                         bkey_float(b, t, j)->key_offset);
540 }
541
542 static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
543                                              const struct bset_tree *t,
544                                              unsigned j)
545 {
546         unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
547
548         return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
549 }
550
551 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
552                                        const struct bset_tree *t)
553 {
554         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
555
556         return __aux_tree_base(b, t);
557 }
558
559 /*
560  * For the write set - the one we're currently inserting keys into - we don't
561  * maintain a full search tree, we just keep a simple lookup table (the rw aux tree).
562  */
563 static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
564                                           struct bset_tree *t,
565                                           unsigned j)
566 {
567         return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
568 }
569
570 static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
571                             unsigned j, struct bkey_packed *k)
572 {
573         BUG_ON(k >= btree_bkey_last(b, t));
574
575         rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
576                 .offset = __btree_node_key_to_offset(b, k),
577                 .k      = bkey_unpack_pos(b, k),
578         };
579 }
580
581 static void bch_bset_verify_rw_aux_tree(struct btree *b,
582                                         struct bset_tree *t)
583 {
584         struct bkey_packed *k = btree_bkey_first(b, t);
585         unsigned j = 0;
586
587         if (!btree_keys_expensive_checks(b))
588                 return;
589
590         BUG_ON(bset_has_ro_aux_tree(t));
591
592         if (!bset_has_rw_aux_tree(t))
593                 return;
594
595         BUG_ON(t->size < 1);
596         BUG_ON(rw_aux_to_bkey(b, t, j) != k);
597
598         goto start;
599         while (1) {
600                 if (rw_aux_to_bkey(b, t, j) == k) {
601                         BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
602                                         bkey_unpack_pos(b, k)));
603 start:
604                         if (++j == t->size)
605                                 break;
606
607                         BUG_ON(rw_aux_tree(b, t)[j].offset <=
608                                rw_aux_tree(b, t)[j - 1].offset);
609                 }
610
611                 k = bkey_next(k);
612                 BUG_ON(k >= btree_bkey_last(b, t));
613         }
614 }
615
616 /* returns idx of first entry >= offset: */
617 static unsigned rw_aux_tree_bsearch(struct btree *b,
618                                     struct bset_tree *t,
619                                     unsigned offset)
620 {
621         unsigned l = 0, r = t->size;
622
623         BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
624
625         while (l < r) {
626                 unsigned m = (l + r) >> 1;
627
628                 if (rw_aux_tree(b, t)[m].offset < offset)
629                         l = m + 1;
630                 else
631                         r = m;
632         }
633
634         BUG_ON(l < t->size &&
635                rw_aux_tree(b, t)[l].offset < offset);
636         BUG_ON(l &&
637                rw_aux_tree(b, t)[l - 1].offset >= offset);
638
639         BUG_ON(l > r);
640         BUG_ON(l > t->size);
641
642         return l;
643 }
644
645 static inline unsigned bfloat_mantissa(const struct bkey_float *f,
646                                        unsigned idx)
647 {
648         return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
649 }
650
651 static inline void bfloat_mantissa_set(struct bkey_float *f,
652                                        unsigned idx, unsigned mantissa)
653 {
654         if (idx < BFLOAT_32BIT_NR)
655                 f->mantissa32 = mantissa;
656         else
657                 f->mantissa16 = mantissa;
658 }
659
660 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
661                                      const struct bkey_float *f,
662                                      unsigned idx)
663 {
664         u64 v;
665
666         EBUG_ON(!bkey_packed(k));
667
668         v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
669
670         /*
671          * In little endian, we're shifting off low bits (and then the bits we
672          * want are at the low end), in big endian we're shifting off high bits
673          * (and then the bits we want are at the high end, so we shift them
674          * back down):
675          */
676 #ifdef __LITTLE_ENDIAN
677         v >>= f->exponent & 7;
678 #else
679         v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
680 #endif
681         return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
682 }
683
684 static void make_bfloat(struct btree *b, struct bset_tree *t,
685                         unsigned j,
686                         struct bkey_packed *min_key,
687                         struct bkey_packed *max_key)
688 {
689         struct bkey_float *f = bkey_float(b, t, j);
690         struct bkey_packed *m = tree_to_bkey(b, t, j);
691         struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
692         struct bkey_packed *l, *r;
693         unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
694         unsigned mantissa;
695         int shift, exponent;
696
697         EBUG_ON(bkey_next(p) != m);
698
699         if (is_power_of_2(j)) {
700                 l = min_key;
701
702                 if (!l->u64s) {
703                         if (!bkey_pack_pos(l, b->data->min_key, b)) {
704                                 struct bkey_i tmp;
705
706                                 bkey_init(&tmp.k);
707                                 tmp.k.p = b->data->min_key;
708                                 bkey_copy(l, &tmp);
709                         }
710                 }
711         } else {
712                 l = tree_to_prev_bkey(b, t, j >> ffs(j));
713
714                 EBUG_ON(m < l);
715         }
716
717         if (is_power_of_2(j + 1)) {
718                 r = max_key;
719
720                 if (!r->u64s) {
721                         if (!bkey_pack_pos(r, t->max_key, b)) {
722                                 struct bkey_i tmp;
723
724                                 bkey_init(&tmp.k);
725                                 tmp.k.p = t->max_key;
726                                 bkey_copy(r, &tmp);
727                         }
728                 }
729         } else {
730                 r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
731
732                 EBUG_ON(m > r);
733         }
734
735         /*
736          * for failed bfloats, the lookup code falls back to comparing against
737          * the original key.
738          */
739
740         if (!bkey_packed(l) || !bkey_packed(r) ||
741             !bkey_packed(p) || !bkey_packed(m)) {
742                 f->exponent = BFLOAT_FAILED_UNPACKED;
743                 return;
744         }
745
746         /*
747          * The greatest differing bit of l and r is the first bit we must
748          * include in the bfloat mantissa we're creating in order to do
749          * comparisons - that bit always becomes the high bit of
750          * bfloat->mantissa, and thus the exponent we're calculating here is
751          * the position of what will become the low bit in bfloat->mantissa:
752          *
753          * Note that this may be negative - we may be running off the low end
754          * of the key: we handle this later:
755          */
756         exponent = (int) bkey_greatest_differing_bit(b, l, r) - (bits - 1);
757
758         /*
759          * Then we calculate the actual shift value, from the start of the key
760          * (k->_data), to get the key bits starting at exponent:
761          */
762 #ifdef __LITTLE_ENDIAN
763         shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
764
765         EBUG_ON(shift + bits > b->format.key_u64s * 64);
766 #else
767         shift = high_bit_offset +
768                 b->nr_key_bits -
769                 exponent -
770                 bits;
771
772         EBUG_ON(shift < KEY_PACKED_BITS_START);
773 #endif
774         EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
775
776         f->exponent = shift;
777         mantissa = bkey_mantissa(m, f, j);
778
779         /*
780          * If we've got garbage bits, set them to all 1s - it's legal for the
781          * bfloat to compare larger than the original key, but not smaller:
782          */
783         if (exponent < 0)
784                 mantissa |= ~(~0U << -exponent);
785
786         bfloat_mantissa_set(f, j, mantissa);
787
788         /*
789          * The bfloat must be able to tell its key apart from the previous key -
790          * if its key and the previous key don't differ in the required bits,
791          * flag as failed - unless the keys are actually equal, in which case
792          * we aren't required to return a specific one:
793          */
794         if (exponent > 0 &&
795             bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
796             bkey_cmp_packed(b, p, m)) {
797                 f->exponent = BFLOAT_FAILED_PREV;
798                 return;
799         }
800
801         /*
802          * f->mantissa must compare >= the original key - for transitivity with
803          * the comparison in bset_search_tree. If we're dropping set bits,
804          * increment it:
805          */
806         if (exponent > (int) bkey_ffs(b, m)) {
807                 if (j < BFLOAT_32BIT_NR
808                     ? f->mantissa32 == U32_MAX
809                     : f->mantissa16 == U16_MAX)
810                         f->exponent = BFLOAT_FAILED_OVERFLOW;
811
812                 if (j < BFLOAT_32BIT_NR)
813                         f->mantissa32++;
814                 else
815                         f->mantissa16++;
816         }
817 }
818
819 /* bytes remaining - only valid for last bset: */
820 static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
821 {
822         bset_aux_tree_verify(b);
823
824         return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
825 }
826
827 static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
828 {
829         unsigned bytes = __bset_tree_capacity(b, t);
830
831         if (bytes < 7 * BFLOAT_32BIT_NR)
832                 return bytes / 7;
833
834         bytes -= 7 * BFLOAT_32BIT_NR;
835
836         return BFLOAT_32BIT_NR + bytes / 5;
837 }
838
839 static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
840 {
841         return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
842 }
843
844 static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
845 {
846         struct bkey_packed *k;
847
848         t->size = 1;
849         t->extra = BSET_RW_AUX_TREE_VAL;
850         rw_aux_tree(b, t)[0].offset =
851                 __btree_node_key_to_offset(b, btree_bkey_first(b, t));
852
853         for (k = btree_bkey_first(b, t);
854              k != btree_bkey_last(b, t);
855              k = bkey_next(k)) {
856                 if (t->size == bset_rw_tree_capacity(b, t))
857                         break;
858
859                 if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
860                     L1_CACHE_BYTES)
861                         rw_aux_tree_set(b, t, t->size++, k);
862         }
863 }
864
865 static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
866 {
867         struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
868         struct bkey_packed min_key, max_key;
869         unsigned j, cacheline = 1;
870
871         /* signal to make_bfloat() that they're uninitialized: */
872         min_key.u64s = max_key.u64s = 0;
873
874         t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
875                       bset_ro_tree_capacity(b, t));
876 retry:
877         if (t->size < 2) {
878                 t->size = 0;
879                 t->extra = BSET_NO_AUX_TREE_VAL;
880                 return;
881         }
882
883         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
884
885         /* First we figure out where the first key in each cacheline is */
886         eytzinger_for_each(j, t->size) {
887                 while (bkey_to_cacheline(b, t, k) < cacheline)
888                         prev = k, k = bkey_next(k);
889
890                 if (k >= btree_bkey_last(b, t)) {
891                         t->size--;
892                         goto retry;
893                 }
894
895                 ro_aux_tree_prev(b, t)[j] = prev->u64s;
896                 bkey_float(b, t, j)->key_offset =
897                         bkey_to_cacheline_offset(b, t, cacheline++, k);
898
899                 BUG_ON(tree_to_prev_bkey(b, t, j) != prev);
900                 BUG_ON(tree_to_bkey(b, t, j) != k);
901         }
902
903         while (bkey_next(k) != btree_bkey_last(b, t))
904                 k = bkey_next(k);
905
906         t->max_key = bkey_unpack_pos(b, k);
907
908         /* Then we build the tree */
909         eytzinger_for_each(j, t->size)
910                 make_bfloat(b, t, j, &min_key, &max_key);
911 }
912
913 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
914 {
915         struct bset_tree *i;
916
917         for (i = b->set; i != t; i++)
918                 BUG_ON(bset_has_rw_aux_tree(i));
919
920         bch_bset_set_no_aux_tree(b, t);
921
922         /* round up to next cacheline: */
923         t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
924                                       SMP_CACHE_BYTES / sizeof(u64));
925
926         bset_aux_tree_verify(b);
927 }
928
929 void bch_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
930                              bool writeable)
931 {
932         if (writeable
933             ? bset_has_rw_aux_tree(t)
934             : bset_has_ro_aux_tree(t))
935                 return;
936
937         bset_alloc_tree(b, t);
938
939         if (!__bset_tree_capacity(b, t))
940                 return;
941
942         if (writeable)
943                 __build_rw_aux_tree(b, t);
944         else
945                 __build_ro_aux_tree(b, t);
946
947         bset_aux_tree_verify(b);
948 }
949
950 void bch_bset_init_first(struct btree *b, struct bset *i)
951 {
952         struct bset_tree *t;
953
954         BUG_ON(b->nsets);
955
956         memset(i, 0, sizeof(*i));
957         get_random_bytes(&i->seq, sizeof(i->seq));
958         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
959
960         t = &b->set[b->nsets++];
961         set_btree_bset(b, t, i);
962 }
963
964 void bch_bset_init_next(struct btree *b, struct bset *i)
965 {
966         struct bset_tree *t;
967
968         BUG_ON(b->nsets >= MAX_BSETS);
969
970         memset(i, 0, sizeof(*i));
971         i->seq = btree_bset_first(b)->seq;
972         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
973
974         t = &b->set[b->nsets++];
975         set_btree_bset(b, t, i);
976 }
977
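/*
 * Returns a key that sorts before @k, close enough that the caller can walk
 * forwards from it with bkey_next() to find @k's predecessor - using the aux
 * tree, when there is one, instead of starting from the front of the bset:
 */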
978 static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
979                                        struct bkey_packed *k)
980 {
981         struct bkey_packed *p;
982         unsigned offset;
983         int j;
984
985         EBUG_ON(k < btree_bkey_first(b, t) ||
986                 k > btree_bkey_last(b, t));
987
988         if (k == btree_bkey_first(b, t))
989                 return NULL;
990
991         switch (bset_aux_tree_type(t)) {
992         case BSET_NO_AUX_TREE:
993                 p = btree_bkey_first(b, t);
994                 break;
995         case BSET_RO_AUX_TREE:
996                 j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
997
998                 do {
999                         p = j ? tree_to_bkey(b, t,
1000                                         __inorder_to_eytzinger(j--,
1001                                                         t->size, t->extra))
1002                               : btree_bkey_first(b, t);
1003                 } while (p >= k);
1004                 break;
1005         case BSET_RW_AUX_TREE:
1006                 offset = __btree_node_key_to_offset(b, k);
1007                 j = rw_aux_tree_bsearch(b, t, offset);
1008                 p = j ? rw_aux_to_bkey(b, t, j - 1)
1009                       : btree_bkey_first(b, t);
1010                 break;
1011         }
1012
1013         return p;
1014 }
1015
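/* Previous key, including deleted keys/whiteouts: */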
1016 struct bkey_packed *bkey_prev_all(struct btree *b, struct bset_tree *t,
1017                                   struct bkey_packed *k)
1018 {
1019         struct bkey_packed *p;
1020
1021         p = __bkey_prev(b, t, k);
1022         if (!p)
1023                 return NULL;
1024
1025         while (bkey_next(p) != k)
1026                 p = bkey_next(p);
1027
1028         return p;
1029 }
1030
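/* Previous non-deleted key: */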
1031 struct bkey_packed *bkey_prev(struct btree *b, struct bset_tree *t,
1032                               struct bkey_packed *k)
1033 {
1034         while (1) {
1035                 struct bkey_packed *p, *i, *ret = NULL;
1036
1037                 p = __bkey_prev(b, t, k);
1038                 if (!p)
1039                         return NULL;
1040
1041                 for (i = p; i != k; i = bkey_next(i))
1042                         if (!bkey_deleted(i))
1043                                 ret = i;
1044
1045                 if (ret)
1046                         return ret;
1047
1048                 k = p;
1049         }
1050 }
1051
1052 /* Insert */
1053
1054 static void rw_aux_tree_fix_invalidated_key(struct btree *b,
1055                                             struct bset_tree *t,
1056                                             struct bkey_packed *k)
1057 {
1058         unsigned offset = __btree_node_key_to_offset(b, k);
1059         unsigned j = rw_aux_tree_bsearch(b, t, offset);
1060
1061         if (j < t->size &&
1062             rw_aux_tree(b, t)[j].offset == offset)
1063                 rw_aux_tree_set(b, t, j, k);
1064
1065         bch_bset_verify_rw_aux_tree(b, t);
1066 }
1067
1068 static void ro_aux_tree_fix_invalidated_key(struct btree *b,
1069                                             struct bset_tree *t,
1070                                             struct bkey_packed *k)
1071 {
1072         struct bkey_packed min_key, max_key;
1073         unsigned inorder, j;
1074
1075         BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
1076
1077         /* signal to make_bfloat() that they're uninitialized: */
1078         min_key.u64s = max_key.u64s = 0;
1079
1080         if (bkey_next(k) == btree_bkey_last(b, t)) {
1081                 t->max_key = bkey_unpack_pos(b, k);
1082
1083                 for (j = 1; j < t->size; j = j * 2 + 1)
1084                         make_bfloat(b, t, j, &min_key, &max_key);
1085         }
1086
1087         inorder = bkey_to_cacheline(b, t, k);
1088
1089         if (inorder &&
1090             inorder < t->size) {
1091                 j = __inorder_to_eytzinger(inorder, t->size, t->extra);
1092
1093                 if (k == tree_to_bkey(b, t, j)) {
1094                         /* Fix the node this key corresponds to */
1095                         make_bfloat(b, t, j, &min_key, &max_key);
1096
1097                         /* Children for which this key is the right boundary */
1098                         for (j = eytzinger_left_child(j);
1099                              j < t->size;
1100                              j = eytzinger_right_child(j))
1101                                 make_bfloat(b, t, j, &min_key, &max_key);
1102                 }
1103         }
1104
1105         if (inorder + 1 < t->size) {
1106                 j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra);
1107
1108                 if (k == tree_to_prev_bkey(b, t, j)) {
1109                         make_bfloat(b, t, j, &min_key, &max_key);
1110
1111                         /* Children for which this key is the left boundary */
1112                         for (j = eytzinger_right_child(j);
1113                              j < t->size;
1114                              j = eytzinger_left_child(j))
1115                                 make_bfloat(b, t, j, &min_key, &max_key);
1116                 }
1117         }
1118 }
1119
1120 /**
1121  * bch_bset_fix_invalidated_key() - given an existing key @k that has been
1122  * modified, fix any auxiliary search tree by remaking all the nodes in the
1123  * auxiliary search tree that @k corresponds to
1124  */
1125 void bch_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
1126                                   struct bkey_packed *k)
1127 {
1128         switch (bset_aux_tree_type(t)) {
1129         case BSET_NO_AUX_TREE:
1130                 break;
1131         case BSET_RO_AUX_TREE:
1132                 ro_aux_tree_fix_invalidated_key(b, t, k);
1133                 break;
1134         case BSET_RW_AUX_TREE:
1135                 rw_aux_tree_fix_invalidated_key(b, t, k);
1136                 break;
1137         }
1138 }
1139
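/*
 * A key (or range of keys) of size @clobber_u64s at @_where has just been
 * replaced by one of size @new_u64s (0 for a deletion): shift the rw aux tree
 * offsets after the insert point accordingly, drop entries that pointed into
 * the clobbered range, and add a new entry if the gap between adjacent
 * entries has grown past a cacheline:
 */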
1140 static void bch_bset_fix_lookup_table(struct btree *b,
1141                                       struct bset_tree *t,
1142                                       struct bkey_packed *_where,
1143                                       unsigned clobber_u64s,
1144                                       unsigned new_u64s)
1145 {
1146         int shift = new_u64s - clobber_u64s;
1147         unsigned l, j, where = __btree_node_key_to_offset(b, _where);
1148
1149         BUG_ON(bset_has_ro_aux_tree(t));
1150
1151         if (!bset_has_rw_aux_tree(t))
1152                 return;
1153
1154         l = rw_aux_tree_bsearch(b, t, where);
1155
1156         /* l is the first entry >= @where */
1157
1158         BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
1159         BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
1160
1161         if (!l) /* never delete first entry */
1162                 l++;
1163         else if (l < t->size &&
1164                  where < t->end_offset &&
1165                  rw_aux_tree(b, t)[l].offset == where)
1166                 rw_aux_tree_set(b, t, l++, _where);
1167
1168         /* l now > where */
1169
1170         for (j = l;
1171              j < t->size &&
1172              rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
1173              j++)
1174                 ;
1175
1176         if (j < t->size &&
1177             rw_aux_tree(b, t)[j].offset + shift ==
1178             rw_aux_tree(b, t)[l - 1].offset)
1179                 j++;
1180
1181         memmove(&rw_aux_tree(b, t)[l],
1182                 &rw_aux_tree(b, t)[j],
1183                 (void *) &rw_aux_tree(b, t)[t->size] -
1184                 (void *) &rw_aux_tree(b, t)[j]);
1185         t->size -= j - l;
1186
1187         for (j = l; j < t->size; j++)
1188                rw_aux_tree(b, t)[j].offset += shift;
1189
1190         BUG_ON(l < t->size &&
1191                rw_aux_tree(b, t)[l].offset ==
1192                rw_aux_tree(b, t)[l - 1].offset);
1193
1194         if (t->size < bset_rw_tree_capacity(b, t) &&
1195             (l < t->size
1196              ? rw_aux_tree(b, t)[l].offset
1197              : t->end_offset) -
1198             rw_aux_tree(b, t)[l - 1].offset >
1199             L1_CACHE_BYTES / sizeof(u64)) {
1200                 struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
1201                 struct bkey_packed *end = l < t->size
1202                         ? rw_aux_to_bkey(b, t, l)
1203                         : btree_bkey_last(b, t);
1204                 struct bkey_packed *k = start;
1205
1206                 while (1) {
1207                         k = bkey_next(k);
1208                         if (k == end)
1209                                 break;
1210
1211                         if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
1212                                 memmove(&rw_aux_tree(b, t)[l + 1],
1213                                         &rw_aux_tree(b, t)[l],
1214                                         (void *) &rw_aux_tree(b, t)[t->size] -
1215                                         (void *) &rw_aux_tree(b, t)[l]);
1216                                 t->size++;
1217                                 rw_aux_tree_set(b, t, l, k);
1218                                 break;
1219                         }
1220                 }
1221         }
1222
1223         bch_bset_verify_rw_aux_tree(b, t);
1224         bset_aux_tree_verify(b);
1225 }
1226
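/*
 * Insert @insert at @where in the currently written bset, overwriting
 * @clobber_u64s worth of existing key(s) (0 if nothing is being overwritten):
 * the tail of the bset is shifted as needed, then the rw aux tree is fixed up
 * via bch_bset_fix_lookup_table():
 */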
1227 void bch_bset_insert(struct btree *b,
1228                     struct btree_node_iter *iter,
1229                     struct bkey_packed *where,
1230                     struct bkey_i *insert,
1231                     unsigned clobber_u64s)
1232 {
1233         struct bkey_format *f = &b->format;
1234         struct bset_tree *t = bset_tree_last(b);
1235         struct bkey_packed packed, *src = bkey_to_packed(insert);
1236
1237         bch_bset_verify_rw_aux_tree(b, t);
1238
1239         if (bkey_pack_key(&packed, &insert->k, f))
1240                 src = &packed;
1241
1242         if (!bkey_whiteout(&insert->k))
1243                 btree_keys_account_key_add(&b->nr, t - b->set, src);
1244
1245         if (src->u64s != clobber_u64s) {
1246                 u64 *src_p = where->_data + clobber_u64s;
1247                 u64 *dst_p = where->_data + src->u64s;
1248
1249                 BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
1250                        (int) clobber_u64s - src->u64s);
1251
1252                 memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1253                 le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
1254                 set_btree_bset_end(b, t);
1255         }
1256
1257         memcpy_u64s(where, src,
1258                     bkeyp_key_u64s(f, src));
1259         memcpy_u64s(bkeyp_val(f, where), &insert->v,
1260                     bkeyp_val_u64s(f, src));
1261
1262         bch_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
1263
1264         bch_verify_key_order(b, iter, where);
1265         bch_verify_btree_nr_keys(b);
1266 }
1267
1268 void bch_bset_delete(struct btree *b,
1269                      struct bkey_packed *where,
1270                      unsigned clobber_u64s)
1271 {
1272         struct bset_tree *t = bset_tree_last(b);
1273         u64 *src_p = where->_data + clobber_u64s;
1274         u64 *dst_p = where->_data;
1275
1276         bch_bset_verify_rw_aux_tree(b, t);
1277
1278         BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
1279
1280         memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1281         le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
1282         set_btree_bset_end(b, t);
1283
1284         bch_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
1285 }
1286
1287 /* Lookup */
1288
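/*
 * Binary search over the rw aux tree for the last entry whose key compares
 * less than @search (falling back to the first entry); the caller then walks
 * forwards from the returned key:
 */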
1289 __flatten
1290 static struct bkey_packed *bset_search_write_set(const struct btree *b,
1291                                 struct bset_tree *t,
1292                                 struct bpos search,
1293                                 const struct bkey_packed *packed_search)
1294 {
1295         unsigned l = 0, r = t->size;
1296
1297         while (l + 1 != r) {
1298                 unsigned m = (l + r) >> 1;
1299
1300                 if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
1301                         l = m;
1302                 else
1303                         r = m;
1304         }
1305
1306         return rw_aux_to_bkey(b, t, l);
1307 }
1308
1309 noinline
1310 static int bset_search_tree_slowpath(const struct btree *b,
1311                                 struct bset_tree *t, struct bpos *search,
1312                                 const struct bkey_packed *packed_search,
1313                                 unsigned n)
1314 {
1315         return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
1316                                  packed_search, search) < 0;
1317 }
1318
1319 __flatten
1320 static struct bkey_packed *bset_search_tree(const struct btree *b,
1321                                 struct bset_tree *t,
1322                                 struct bpos search,
1323                                 const struct bkey_packed *packed_search)
1324 {
1325         struct ro_aux_tree *base = ro_aux_tree_base(b, t);
1326         struct bkey_float *f = bkey_float_get(base, 1);
1327         void *p;
1328         unsigned inorder, n = 1;
1329
1330         do {
1331                 if (likely(n << 4 < t->size)) {
1332                         p = bkey_float_get(base, n << 4);
1333                         prefetch(p);
1334                 } else if (n << 3 < t->size) {
1335                         inorder = __eytzinger_to_inorder(n, t->size, t->extra);
1336                         p = bset_cacheline(b, t, inorder);
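                        /*
                         * We're within a few levels of the leaves: prefetch
                         * the bset cachelines the final linear search is
                         * likely to touch. In the x86 asm below the base
                         * address is biased by 127, presumably so that all
                         * four displacements fit in a signed 8-bit immediate:
                         */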
1337 #ifdef CONFIG_X86_64
1338                         asm(".intel_syntax noprefix;"
1339                             "prefetcht0 [%0 - 127 + 64 * 0];"
1340                             "prefetcht0 [%0 - 127 + 64 * 1];"
1341                             "prefetcht0 [%0 - 127 + 64 * 2];"
1342                             "prefetcht0 [%0 - 127 + 64 * 3];"
1343                             ".att_syntax prefix;"
1344                             :
1345                             : "r" (p + 127));
1346 #else
1347                         prefetch(p + L1_CACHE_BYTES * 0);
1348                         prefetch(p + L1_CACHE_BYTES * 1);
1349                         prefetch(p + L1_CACHE_BYTES * 2);
1350                         prefetch(p + L1_CACHE_BYTES * 3);
1351 #endif
1352                 } else if (n >= t->size)
1353                         break;
1354
1355                 f = bkey_float_get(base, n);
1356
1357                 if (packed_search &&
1358                     likely(f->exponent < BFLOAT_FAILED))
1359                         n = n * 2 + (bfloat_mantissa(f, n) <
1360                                      bkey_mantissa(packed_search, f, n));
1361                 else
1362                         n = n * 2 + bset_search_tree_slowpath(b, t,
1363                                                 &search, packed_search, n);
1364         } while (n < t->size);
1365
1366         inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra);
1367
1368         /*
1369          * n would have been the node we recursed to - the low bit tells us if
1370          * we recursed left or recursed right.
1371          */
1372         if (n & 1) {
1373                 return cacheline_to_bkey(b, t, inorder, f->key_offset);
1374         } else {
1375                 if (--inorder) {
1376                         n = eytzinger_prev(n >> 1, t->size);
1377                         f = bkey_float_get(base, n);
1378                         return cacheline_to_bkey(b, t, inorder, f->key_offset);
1379                 } else
1380                         return btree_bkey_first(b, t);
1381         }
1382 }
1383
1384 /*
1385  * Returns the first key greater than or equal to @search
1386  */
1387 __always_inline __flatten
1388 static struct bkey_packed *bch_bset_search(struct btree *b,
1389                                 struct bset_tree *t,
1390                                 struct bpos search,
1391                                 struct bkey_packed *packed_search,
1392                                 const struct bkey_packed *lossy_packed_search,
1393                                 bool strictly_greater)
1394 {
1395         struct bkey_packed *m;
1396
1397         /*
1398          * First we search for the cacheline, then we do a linear search
1399          * within that cacheline.
1400          *
1401          * To search for the cacheline, there are three different possibilities:
1402          *  * The set is too small to have a search tree, so we just do a linear
1403          *    search over the whole set.
1404          *  * The set is the one we're currently inserting into; keeping a full
1405          *    auxiliary search tree up to date would be too expensive, so we
1406          *    use a much simpler lookup table to do a binary search -
1407          *    bset_search_write_set().
1408          *  * Or we use the auxiliary search tree we constructed earlier -
1409          *    bset_search_tree()
1410          */
1411
1412         switch (bset_aux_tree_type(t)) {
1413         case BSET_NO_AUX_TREE:
1414                 m = btree_bkey_first(b, t);
1415                 break;
1416         case BSET_RW_AUX_TREE:
1417                 m = bset_search_write_set(b, t, search, lossy_packed_search);
1418                 break;
1419         case BSET_RO_AUX_TREE:
1420                 /*
1421                  * Each node in the auxiliary search tree covers a certain range
1422                  * of bits, and keys above and below the set it covers might
1423                  * differ outside those bits - so we have to special case the
1424                  * start and end - handle that here:
1425                  */
1426
1427                 if (bkey_cmp(search, t->max_key) > 0)
1428                         return btree_bkey_last(b, t);
1429
1430                 m = bset_search_tree(b, t, search, lossy_packed_search);
1431                 break;
1432         }
1433
1434         if (lossy_packed_search)
1435                 while (m != btree_bkey_last(b, t) &&
1436                        !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
1437                                                     m, strictly_greater))
1438                         m = bkey_next(m);
1439
1440         if (!packed_search)
1441                 while (m != btree_bkey_last(b, t) &&
1442                        !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
1443                         m = bkey_next(m);
1444
1445         if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
1446                 struct bkey_packed *prev = bkey_prev_all(b, t, m);
1447
1448                 BUG_ON(prev &&
1449                        btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
1450                                                    prev, strictly_greater));
1451         }
1452
1453         return m;
1454 }
1455
1456 /* Btree node iterator */
1457
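/*
 * Add the range [@k, @end) to the iterator, keeping iter->data sorted by
 * btree_node_iter_cmp() on each set's current key:
 */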
1458 void bch_btree_node_iter_push(struct btree_node_iter *iter,
1459                               struct btree *b,
1460                               const struct bkey_packed *k,
1461                               const struct bkey_packed *end)
1462 {
1463         if (k != end) {
1464                 struct btree_node_iter_set *pos, n =
1465                         ((struct btree_node_iter_set) {
1466                                  __btree_node_key_to_offset(b, k),
1467                                  __btree_node_key_to_offset(b, end)
1468                          });
1469
1470                 btree_node_iter_for_each(iter, pos)
1471                         if (btree_node_iter_cmp(iter, b, n, *pos) <= 0)
1472                                 break;
1473
1474                 memmove(pos + 1, pos,
1475                         (void *) (iter->data + iter->used) - (void *) pos);
1476                 iter->used++;
1477                 *pos = n;
1478         }
1479 }
1480
1481 noinline __flatten __attribute__((cold))
1482 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
1483                               struct btree *b, struct bpos search,
1484                               bool strictly_greater, bool is_extents)
1485 {
1486         struct bset_tree *t;
1487
1488         trace_bkey_pack_pos_fail(search);
1489
1490         for_each_bset(b, t)
1491                 __bch_btree_node_iter_push(iter, b,
1492                         bch_bset_search(b, t, search, NULL, NULL,
1493                                         strictly_greater),
1494                         btree_bkey_last(b, t));
1495
1496         bch_btree_node_iter_sort(iter, b);
1497 }
1498
1499 /**
1500  * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
1501  * given position
1502  *
1503  * Main entry point to the lookup code for individual btree nodes:
1504  *
1505  * NOTE:
1506  *
1507  * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
1508  * keys. This doesn't matter for most code, but it does matter for lookups.
1509  *
1510  * Consider some adjacent keys, with a run of equal keys in the middle:
1511  *      i j k k k k l m
1512  *
1513  * If you search for k, the lookup code isn't guaranteed to return you any
1514  * specific k. The lookup code is conceptually doing a binary search and
1515  * iterating backwards is very expensive so if the pivot happens to land at the
1516  * last k that's what you'll get.
1517  *
1518  * This works out ok, but it's something to be aware of:
1519  *
1520  *  - For non extents, we guarantee that the live key comes last - see
1521  *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
1522  *    see will only be deleted keys you don't care about.
1523  *
1524  *  - For extents, deleted keys sort last (see the comment at the top of this
1525  *    file). But when you're searching for extents, you actually want the first
1526  *    key strictly greater than your search key - an extent that compares equal
1527  *    to the search key is going to have 0 sectors after the search key.
1528  *
1529  *    But this does mean that we can't just search for
1530  *    bkey_successor(start_of_range) to get the first extent that overlaps with
1531  *    the range we want - if we're unlucky and there's an extent that ends
1532  *    exactly where we searched, then there could be a deleted key at the same
1533  *    position and we'd get that when we search instead of the preceding extent
1534  *    we needed.
1535  *
1536  *    So we've got to search for start_of_range, then after the lookup iterate
1537  *    past any extents that compare equal to the position we searched for.
1538  */
1539 void bch_btree_node_iter_init(struct btree_node_iter *iter,
1540                               struct btree *b, struct bpos search,
1541                               bool strictly_greater, bool is_extents)
1542 {
1543         struct bset_tree *t;
1544         struct bkey_packed p, *packed_search = NULL;
1545
1546         EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
1547         bset_aux_tree_verify(b);
1548
1549         __bch_btree_node_iter_init(iter, is_extents);
1550
1551         //if (bkey_cmp(search, b->curr_max_key) > 0)
1552         //      return;
1553
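        /*
         * Try to pack the search position into this node's key format:
         * EXACT means packed keys can be compared against it directly,
         * SMALLER means the packed value is only a lossy lower bound (the
         * final comparisons fall back to the unpacked position), and FAIL
         * means the position can't be represented in this format at all -
         * take the cold slowpath:
         */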
1554         switch (bkey_pack_pos_lossy(&p, search, b)) {
1555         case BKEY_PACK_POS_EXACT:
1556                 packed_search = &p;
1557                 break;
1558         case BKEY_PACK_POS_SMALLER:
1559                 packed_search = NULL;
1560                 break;
1561         case BKEY_PACK_POS_FAIL:
1562                 btree_node_iter_init_pack_failed(iter, b, search,
1563                                         strictly_greater, is_extents);
1564                 return;
1565         }
1566
1567         for_each_bset(b, t)
1568                 __bch_btree_node_iter_push(iter, b,
1569                                            bch_bset_search(b, t, search,
1570                                                            packed_search, &p,
1571                                                            strictly_greater),
1572                                            btree_bkey_last(b, t));
1573
1574         bch_btree_node_iter_sort(iter, b);
1575 }
1576
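/*
 * Illustrative sketch (not part of the existing code): one way a caller might
 * walk a node's keys starting from a search position using the iterator API
 * above. The "example_" name, the printk, and the strictly_greater/is_extents
 * flags chosen here are assumptions made purely for the example:
 */
static void __maybe_unused example_walk_from(struct btree *b, struct bpos search)
{
        struct btree_node_iter iter;
        struct bkey_packed *_k;
        struct bkey uk;
        char buf[120];

        /* position the iterator at the first key >= search (non-extent rules) */
        bch_btree_node_iter_init(&iter, b, search, false, false);

        /* peek() returns NULL once the iterator is exhausted */
        while ((_k = bch_btree_node_iter_peek(&iter, b))) {
                uk = bkey_unpack_key(b, _k);
                bch_bkey_to_text(buf, sizeof(buf), &uk);
                printk(KERN_DEBUG "%s\n", buf);
                bch_btree_node_iter_advance(&iter, b);
        }
}
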
1577 void bch_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1578                                          struct btree *b,
1579                                          bool is_extents)
1580 {
1581         struct bset_tree *t;
1582
1583         __bch_btree_node_iter_init(iter, is_extents);
1584
1585         for_each_bset(b, t)
1586                 __bch_btree_node_iter_push(iter, b,
1587                                            btree_bkey_first(b, t),
1588                                            btree_bkey_last(b, t));
1589         bch_btree_node_iter_sort(iter, b);
1590 }
1591
1592 struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1593                                                  struct btree *b,
1594                                                  struct bset_tree *t)
1595 {
1596         struct btree_node_iter_set *set;
1597
1598         BUG_ON(iter->used > MAX_BSETS);
1599
1600         btree_node_iter_for_each(iter, set)
1601                 if (set->end == t->end_offset)
1602                         return __btree_node_offset_to_key(b, set->k);
1603
1604         return btree_bkey_last(b, t);
1605 }
1606
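/*
 * After the set at @start has had its current key changed, push it back down
 * the array until the sets are sorted by btree_node_iter_cmp() again:
 */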
1607 static inline void btree_node_iter_sift(struct btree_node_iter *iter,
1608                                         struct btree *b,
1609                                         unsigned start)
1610 {
1611         unsigned i;
1612
1613         EBUG_ON(iter->used > MAX_BSETS);
1614
1615         for (i = start;
1616              i + 1 < iter->used &&
1617              btree_node_iter_cmp(iter, b, iter->data[i], iter->data[i + 1]) > 0;
1618              i++)
1619                 swap(iter->data[i], iter->data[i + 1]);
1620 }
1621
1622 static inline void btree_node_iter_sort_two(struct btree_node_iter *iter,
1623                                             struct btree *b,
1624                                             unsigned first)
1625 {
1626         if (btree_node_iter_cmp(iter, b,
1627                                 iter->data[first],
1628                                 iter->data[first + 1]) > 0)
1629                 swap(iter->data[first], iter->data[first + 1]);
1630 }
1631
1632 void bch_btree_node_iter_sort(struct btree_node_iter *iter,
1633                               struct btree *b)
1634 {
1635         EBUG_ON(iter->used > 3);
1636
1637         /* unrolled bubble sort of at most three sets: */
1638
1639         if (iter->used > 2) {
1640                 btree_node_iter_sort_two(iter, b, 0);
1641                 btree_node_iter_sort_two(iter, b, 1);
1642         }
1643
1644         if (iter->used > 1)
1645                 btree_node_iter_sort_two(iter, b, 0);
1646 }
1647 EXPORT_SYMBOL(bch_btree_node_iter_sort);
1648
1649 /**
1650  * bch_btree_node_iter_advance - advance @iter by one key
1651  *
1652  * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a
1653  * bset might momentarily have out of order extents.
1654  */
1655 void bch_btree_node_iter_advance(struct btree_node_iter *iter,
1656                                  struct btree *b)
1657 {
1658         struct bkey_packed *k = bch_btree_node_iter_peek_all(iter, b);
1659
1660         iter->data->k += __bch_btree_node_iter_peek_all(iter, b)->u64s;
1661
1662         BUG_ON(iter->data->k > iter->data->end);
1663
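        /*
         * If the front set is now exhausted, replace it with the last set and
         * let the sift below move it back into sorted position:
         */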
1664         if (iter->data->k == iter->data->end) {
1665                 BUG_ON(iter->used == 0);
1666                 iter->data[0] = iter->data[--iter->used];
1667         }
1668
1669         btree_node_iter_sift(iter, b, 0);
1670
1671         bch_btree_node_iter_next_check(iter, b, k);
1672 }
1673
1674 /*
1675  * Expensive: has to find, in each bset, the key before the iterator's position
1676  */
1677 struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter,
1678                                                  struct btree *b)
1679 {
1680         struct bkey_packed *k, *prev = NULL;
1681         struct btree_node_iter_set *set;
1682         struct bset_tree *t;
1683         struct bset_tree *prev_t = NULL;
1684         unsigned end;
1685
1686         bch_btree_node_iter_verify(iter, b);
1687
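        /*
         * Find, across all bsets, the key that sorts greatest among those
         * preceding the iterator's current position in each bset:
         */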
1688         for_each_bset(b, t) {
1689                 k = bkey_prev_all(b, t,
1690                         bch_btree_node_iter_bset_pos(iter, b, t));
1691                 if (k &&
1692                     (!prev || __btree_node_iter_cmp(iter->is_extents, b,
1693                                                     k, prev) > 0)) {
1694                         prev = k;
1695                         prev_t = t;
1696                 }
1697         }
1698
1699         if (!prev)
1700                 return NULL;
1701
1702         /*
1703          * We're manually memmoving instead of just calling sort() to ensure the
1704          * prev we picked ends up in slot 0 - sort won't necessarily put it
1705          * there because of duplicate deleted keys:
1706          */
1707         end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t));
1708         btree_node_iter_for_each(iter, set)
1709                 if (set->end == end) {
1710                         memmove(&iter->data[1],
1711                                 &iter->data[0],
1712                                 (void *) set - (void *) &iter->data[0]);
1713                         goto out;
1714                 }
1715
1716         memmove(&iter->data[1],
1717                 &iter->data[0],
1718                 (void *) &iter->data[iter->used] - (void *) &iter->data[0]);
1719         iter->used++;
1720 out:
1721         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1722         iter->data[0].end = end;
1723         return prev;
1724 }
1725
1726 struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *iter,
1727                                              struct btree *b)
1728 {
1729         struct bkey_packed *k;
1730
1731         do {
1732                 k = bch_btree_node_iter_prev_all(iter, b);
1733         } while (k && bkey_deleted(k));
1734
1735         return k;
1736 }
1737
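/*
 * Illustrative sketch (not part of the existing code): using the expensive
 * prev path above to find the last live key before a given position. The
 * "example_" name and the strictly_greater/is_extents flags chosen here are
 * assumptions made purely for the example:
 */
static struct bkey_s_c __maybe_unused
example_key_before(struct btree *b, struct bpos pos, struct bkey *u)
{
        struct btree_node_iter iter;
        struct bkey_packed *k;

        /* position at the first key >= pos, then step back one live key */
        bch_btree_node_iter_init(&iter, b, pos, false, false);
        k = bch_btree_node_iter_prev(&iter, b);

        return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
}
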
1738 struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1739                                                 struct btree *b,
1740                                                 struct bkey *u)
1741 {
1742         struct bkey_packed *k = bch_btree_node_iter_peek(iter, b);
1743
1744         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1745 }
1746 EXPORT_SYMBOL(bch_btree_node_iter_peek_unpack);
1747
1748 /* Stats */
1749
1750 void bch_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1751 {
1752         struct bset_tree *t;
1753
1754         for_each_bset(b, t) {
1755                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1756                 size_t j;
1757
1758                 stats->sets[type].nr++;
1759                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1760                         sizeof(u64);
1761
1762                 if (bset_has_ro_aux_tree(t)) {
1763                         stats->floats += t->size - 1;
1764
1765                         for (j = 1; j < t->size; j++)
1766                                 switch (bkey_float(b, t, j)->exponent) {
1767                                 case BFLOAT_FAILED_UNPACKED:
1768                                         stats->failed_unpacked++;
1769                                         break;
1770                                 case BFLOAT_FAILED_PREV:
1771                                         stats->failed_prev++;
1772                                         break;
1773                                 case BFLOAT_FAILED_OVERFLOW:
1774                                         stats->failed_overflow++;
1775                                         break;
1776                                 }
1777                 }
1778         }
1779 }
1780
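/*
 * Illustrative sketch (not part of the existing code): accumulating stats for
 * a single node and reporting the bfloat failure counts. Zero-initializing
 * the stats struct before calling bch_btree_keys_stats(), and the "example_"
 * name, are assumptions made purely for the example:
 */
static void __maybe_unused example_report_node_stats(struct btree *b)
{
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));
        bch_btree_keys_stats(b, &stats);

        printk(KERN_DEBUG "floats %lu failed: unpacked %lu prev %lu overflow %lu\n",
               (unsigned long) stats.floats,
               (unsigned long) stats.failed_unpacked,
               (unsigned long) stats.failed_prev,
               (unsigned long) stats.failed_overflow);
}
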
1781 int bch_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
1782                           char *buf, size_t size)
1783 {
1784         struct bset_tree *t = bch_bkey_to_bset(b, k);
1785         struct bkey_packed *l, *r, *p;
1786         struct bkey uk, up;
1787         char buf1[200], buf2[200];
1788         unsigned j;
1789
1790         if (!size)
1791                 return 0;
1792
1793         if (!bset_has_ro_aux_tree(t))
1794                 goto out;
1795
1796         j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra);
1797         if (j &&
1798             j < t->size &&
1799             k == tree_to_bkey(b, t, j))
1800                 switch (bkey_float(b, t, j)->exponent) {
1801                 case BFLOAT_FAILED_UNPACKED:
1802                         uk = bkey_unpack_key(b, k);
1803                         return scnprintf(buf, size,
1804                                          "    failed unpacked at depth %u\n"
1805                                          "\t%llu:%llu\n",
1806                                          ilog2(j),
1807                                          uk.p.inode, uk.p.offset);
1808                 case BFLOAT_FAILED_PREV:
1809                         p = tree_to_prev_bkey(b, t, j);
1810                         l = is_power_of_2(j)
1811                                 ? btree_bkey_first(b, t)
1812                                 : tree_to_prev_bkey(b, t, j >> ffs(j));
1813                         r = is_power_of_2(j + 1)
1814                                 ? bkey_prev_all(b, t, btree_bkey_last(b, t))
1815                                 : tree_to_bkey(b, t, j >> (ffz(j) + 1));
1816
1817                         up = bkey_unpack_key(b, p);
1818                         uk = bkey_unpack_key(b, k);
1819                         bch_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
1820                         bch_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
1821
1822                         return scnprintf(buf, size,
1823                                          "    failed prev at depth %u\n"
1824                                          "\tkey starts at bit %u but first differing bit at %u\n"
1825                                          "\t%llu:%llu\n"
1826                                          "\t%llu:%llu\n"
1827                                          "\t%s\n"
1828                                          "\t%s\n",
1829                                          ilog2(j),
1830                                          bkey_greatest_differing_bit(b, l, r),
1831                                          bkey_greatest_differing_bit(b, p, k),
1832                                          uk.p.inode, uk.p.offset,
1833                                          up.p.inode, up.p.offset,
1834                                          buf1, buf2);
1835                 case BFLOAT_FAILED_OVERFLOW:
1836                         uk = bkey_unpack_key(b, k);
1837                         return scnprintf(buf, size,
1838                                          "    failed overflow at depth %u\n"
1839                                          "\t%llu:%llu\n",
1840                                          ilog2(j),
1841                                          uk.p.inode, uk.p.offset);
1842                 }
1843 out:
1844         *buf = '\0';
1845         return 0;
1846 }
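
/*
 * Illustrative sketch (not part of the existing code): driving
 * bch_bkey_print_bfloat() for every key in a node and dumping whatever
 * diagnostics it produces. The "example_" name and the buffer size are
 * assumptions made purely for the example:
 */
static void __maybe_unused example_dump_bfloat_failures(struct btree *b)
{
        struct bset_tree *t;
        struct bkey_packed *k;
        char buf[256];

        for_each_bset(b, t)
                for (k = btree_bkey_first(b, t);
                     k != btree_bkey_last(b, t);
                     k = bkey_next(k))
                        /* returns number of characters written; 0 if nothing to report */
                        if (bch_bkey_print_bfloat(b, k, buf, sizeof(buf)))
                                printk(KERN_DEBUG "%s", buf);
}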