1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code for working with individual keys, and sorted sets of keys within a
4  * btree node
5  *
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "btree_cache.h"
11 #include "bset.h"
12 #include "eytzinger.h"
13 #include "util.h"
14
15 #include <asm/unaligned.h>
16 #include <linux/console.h>
17 #include <linux/random.h>
18 #include <linux/prefetch.h>
19
20 /* hack.. */
21 #include "alloc_types.h"
22 #include <trace/events/bcachefs.h>
23
24 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
25                                                   struct btree *);
26
27 static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
28 {
29         unsigned n = ARRAY_SIZE(iter->data);
30
31         while (n && __btree_node_iter_set_end(iter, n - 1))
32                 --n;
33
34         return n;
35 }
36
37 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
38 {
39         unsigned offset = __btree_node_key_to_offset(b, k);
40         struct bset_tree *t;
41
42         for_each_bset(b, t)
43                 if (offset <= t->end_offset) {
44                         EBUG_ON(offset < btree_bkey_first_offset(t));
45                         return t;
46                 }
47
48         BUG();
49 }
50
51 /*
52  * There are never duplicate live keys in the btree - but including keys that
53  * have been flagged as deleted (and will be cleaned up later) we _will_ see
54  * duplicates.
55  *
56  * Thus the sort order is: usual key comparison first, but for keys that compare
57  * equal the deleted key(s) come first, and the (at most one) live version comes
58  * last.
59  *
60  * The main reason for this is insertion: to handle overwrites, we first iterate
61  * over keys that compare equal to our insert key, and then insert immediately
62  * prior to the first key greater than the key we're inserting - our insert
63  * position will be after all keys that compare equal to our insert key, which
64  * by the time we actually do the insert will all be deleted.
65  */
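
/*
 * A small illustration of the above: for three keys comparing equal at some
 * position P, iteration order is
 *
 *     ... < (P, deleted) < (P, deleted) < (P, live) < keys after P ...
 *
 * so a new insert at P always lands after every existing version of P.
 */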
66
67 void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
68 {
69         struct bkey_packed *_k, *_n;
70         struct bkey k, n;
71         char buf[120];
72
73         if (!i->u64s)
74                 return;
75
76         for (_k = i->start, k = bkey_unpack_key(b, _k);
77              _k < vstruct_last(i);
78              _k = _n, k = n) {
79                 _n = bkey_next(_k);
80
81                 bch2_bkey_to_text(&PBUF(buf), &k);
82                 printk(KERN_ERR "block %u key %5u: %s\n", set,
83                        __btree_node_key_to_offset(b, _k), buf);
84
85                 if (_n == vstruct_last(i))
86                         continue;
87
88                 n = bkey_unpack_key(b, _n);
89
90                 if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
91                         printk(KERN_ERR "Key skipped backwards\n");
92                         continue;
93                 }
94
95                 /*
96                  * Weird check for duplicate non-extent keys: extents are
97                  * deleted iff they have zero size, so if a key has zero size and
98                  * isn't deleted, these aren't extents:
99                  */
100                 if (((!k.size && !bkey_deleted(&k)) ||
101                      (!n.size && !bkey_deleted(&n))) &&
102                     !bkey_deleted(&k) &&
103                     !bkey_cmp(n.p, k.p))
104                         printk(KERN_ERR "Duplicate keys\n");
105         }
106 }
107
108 void bch2_dump_btree_node(struct btree *b)
109 {
110         struct bset_tree *t;
111
112         console_lock();
113         for_each_bset(b, t)
114                 bch2_dump_bset(b, bset(b, t), t - b->set);
115         console_unlock();
116 }
117
118 void bch2_dump_btree_node_iter(struct btree *b,
119                               struct btree_node_iter *iter)
120 {
121         struct btree_node_iter_set *set;
122
123         printk(KERN_ERR "btree node iter with %u/%u sets:\n",
124                __btree_node_iter_used(iter), b->nsets);
125
126         btree_node_iter_for_each(iter, set) {
127                 struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
128                 struct bset_tree *t = bch2_bkey_to_bset(b, k);
129                 struct bkey uk = bkey_unpack_key(b, k);
130                 char buf[100];
131
132                 bch2_bkey_to_text(&PBUF(buf), &uk);
133                 printk(KERN_ERR "set %zu key %u: %s\n",
134                        t - b->set, set->k, buf);
135         }
136 }
137
138 #ifdef CONFIG_BCACHEFS_DEBUG
139
140 void __bch2_verify_btree_nr_keys(struct btree *b)
141 {
142         struct bset_tree *t;
143         struct bkey_packed *k;
144         struct btree_nr_keys nr = { 0 };
145
146         for_each_bset(b, t)
147                 for (k = btree_bkey_first(b, t);
148                      k != btree_bkey_last(b, t);
149                      k = bkey_next(k))
150                         if (!bkey_whiteout(k))
151                                 btree_keys_account_key_add(&nr, t - b->set, k);
152
153         BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
154 }
155
156 static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
157                                             struct btree *b)
158 {
159         struct btree_node_iter iter = *_iter;
160         const struct bkey_packed *k, *n;
161
162         k = bch2_btree_node_iter_peek_all(&iter, b);
163         __bch2_btree_node_iter_advance(&iter, b);
164         n = bch2_btree_node_iter_peek_all(&iter, b);
165
166         bkey_unpack_key(b, k);
167
168         if (n &&
169             bkey_iter_cmp(b, k, n) > 0) {
170                 struct btree_node_iter_set *set;
171                 struct bkey ku = bkey_unpack_key(b, k);
172                 struct bkey nu = bkey_unpack_key(b, n);
173                 char buf1[80], buf2[80];
174
175                 bch2_dump_btree_node(b);
176                 bch2_bkey_to_text(&PBUF(buf1), &ku);
177                 bch2_bkey_to_text(&PBUF(buf2), &nu);
178                 printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
179                        buf1, buf2);
180                 printk(KERN_ERR "iter was:");
181
182                 btree_node_iter_for_each(_iter, set) {
183                         struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
184                         struct bset_tree *t = bch2_bkey_to_bset(b, k);
185                         printk(" [%zi %zi]", t - b->set,
186                                k->_data - bset(b, t)->_data);
187                 }
188                 panic("\n");
189         }
190 }
191
192 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
193                                  struct btree *b)
194 {
195         struct btree_node_iter_set *set, *s2;
196         struct bkey_packed *k, *p;
197         struct bset_tree *t;
198
199         if (bch2_btree_node_iter_end(iter))
200                 return;
201
202         /* Verify no duplicates: */
203         btree_node_iter_for_each(iter, set)
204                 btree_node_iter_for_each(iter, s2)
205                         BUG_ON(set != s2 && set->end == s2->end);
206
207         /* Verify that set->end is correct: */
208         btree_node_iter_for_each(iter, set) {
209                 for_each_bset(b, t)
210                         if (set->end == t->end_offset)
211                                 goto found;
212                 BUG();
213 found:
214                 BUG_ON(set->k < btree_bkey_first_offset(t) ||
215                        set->k >= t->end_offset);
216         }
217
218         /* Verify iterator is sorted: */
219         btree_node_iter_for_each(iter, set)
220                 BUG_ON(set != iter->data &&
221                        btree_node_iter_cmp(b, set[-1], set[0]) > 0);
222
223         k = bch2_btree_node_iter_peek_all(iter, b);
224
225         for_each_bset(b, t) {
226                 if (iter->data[0].end == t->end_offset)
227                         continue;
228
229                 p = bch2_bkey_prev_all(b, t,
230                         bch2_btree_node_iter_bset_pos(iter, b, t));
231
232                 BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
233         }
234 }
235
236 void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
237                             struct bkey_packed *insert, unsigned clobber_u64s)
238 {
239         struct bset_tree *t = bch2_bkey_to_bset(b, where);
240         struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
241         struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
242 #if 0
243         BUG_ON(prev &&
244                bkey_iter_cmp(b, prev, insert) > 0);
245 #else
246         if (prev &&
247             bkey_iter_cmp(b, prev, insert) > 0) {
248                 struct bkey k1 = bkey_unpack_key(b, prev);
249                 struct bkey k2 = bkey_unpack_key(b, insert);
250                 char buf1[100];
251                 char buf2[100];
252
253                 bch2_dump_btree_node(b);
254                 bch2_bkey_to_text(&PBUF(buf1), &k1);
255                 bch2_bkey_to_text(&PBUF(buf2), &k2);
256
257                 panic("prev > insert:\n"
258                       "prev    key %5u %s\n"
259                       "insert  key %5u %s\n",
260                        __btree_node_key_to_offset(b, prev), buf1,
261                        __btree_node_key_to_offset(b, insert), buf2);
262         }
263 #endif
264 #if 0
265         BUG_ON(next != btree_bkey_last(b, t) &&
266                bkey_iter_cmp(b, insert, next) > 0);
267 #else
268         if (next != btree_bkey_last(b, t) &&
269             bkey_iter_cmp(b, insert, next) > 0) {
270                 struct bkey k1 = bkey_unpack_key(b, insert);
271                 struct bkey k2 = bkey_unpack_key(b, next);
272                 char buf1[100];
273                 char buf2[100];
274
275                 bch2_dump_btree_node(b);
276                 bch2_bkey_to_text(&PBUF(buf1), &k1);
277                 bch2_bkey_to_text(&PBUF(buf2), &k2);
278
279                 panic("insert > next:\n"
280                       "insert  key %5u %s\n"
281                       "next    key %5u %s\n",
282                        __btree_node_key_to_offset(b, insert), buf1,
283                        __btree_node_key_to_offset(b, next), buf2);
284         }
285 #endif
286 }
287
288 #else
289
290 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
291                                                    struct btree *b) {}
292
293 #endif
294
295 /* Auxiliary search trees */
296
297 #define BFLOAT_FAILED_UNPACKED  U8_MAX
298 #define BFLOAT_FAILED           U8_MAX
299
300 struct bkey_float {
301         u8              exponent;
302         u8              key_offset;
303         u16             mantissa;
304 };
305 #define BKEY_MANTISSA_BITS      16
306
307 static unsigned bkey_float_byte_offset(unsigned idx)
308 {
309         return idx * sizeof(struct bkey_float);
310 }
311
312 struct ro_aux_tree {
313         struct bkey_float       f[0];
314 };
315
316 struct rw_aux_tree {
317         u16             offset;
318         struct bpos     k;
319 };
320
321 /*
322  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
323  * it used to be 64, but I realized the lookup code would touch slightly less
324  * memory if it was 128.
325  *
326  * It defines the number of bytes (in struct bset) per struct bkey_float in
327  * the auxiliary search tree - when we're done searching the bkey_float tree we
328  * have this many bytes left that we do a linear search over.
329  *
330  * Since (after level 5) every level of the bset_tree is on a new cacheline,
331  * we're touching one fewer cacheline in the bset tree in exchange for one more
332  * cacheline in the linear search - but the linear search might stop before it
333  * gets to the second cacheline.
334  */
335
336 #define BSET_CACHELINE          128
337
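/*
 * In other words: with BSET_CACHELINE = 128 and keys stored as 8 byte units,
 * each bkey_float covers at most 128 / 8 = 16 u64s worth of keys - that's the
 * window the final linear search scans once the tree walk is done.
 */
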
338 /* Space required for the btree node keys */
339 static inline size_t btree_keys_bytes(struct btree *b)
340 {
341         return PAGE_SIZE << b->page_order;
342 }
343
344 static inline size_t btree_keys_cachelines(struct btree *b)
345 {
346         return btree_keys_bytes(b) / BSET_CACHELINE;
347 }
348
349 static inline size_t btree_aux_data_bytes(struct btree *b)
350 {
351         return btree_keys_cachelines(b) * 8;
352 }
353
354 static inline size_t btree_aux_data_u64s(struct btree *b)
355 {
356         return btree_aux_data_bytes(b) / sizeof(u64);
357 }
358
359 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
360 {
361         BUG_ON(t->aux_data_offset == U16_MAX);
362
363         switch (bset_aux_tree_type(t)) {
364         case BSET_NO_AUX_TREE:
365                 return t->aux_data_offset;
366         case BSET_RO_AUX_TREE:
367                 return t->aux_data_offset +
368                         DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
369                                      t->size * sizeof(u8), 8);
370         case BSET_RW_AUX_TREE:
371                 return t->aux_data_offset +
372                         DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
373         default:
374                 BUG();
375         }
376 }
377
378 static unsigned bset_aux_tree_buf_start(const struct btree *b,
379                                         const struct bset_tree *t)
380 {
381         return t == b->set
382                 ? DIV_ROUND_UP(b->unpack_fn_len, 8)
383                 : bset_aux_tree_buf_end(t - 1);
384 }
385
386 static void *__aux_tree_base(const struct btree *b,
387                              const struct bset_tree *t)
388 {
389         return b->aux_data + t->aux_data_offset * 8;
390 }
391
392 static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
393                                             const struct bset_tree *t)
394 {
395         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
396
397         return __aux_tree_base(b, t);
398 }
399
400 static u8 *ro_aux_tree_prev(const struct btree *b,
401                             const struct bset_tree *t)
402 {
403         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
404
405         return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
406 }
407
408 static struct bkey_float *bkey_float(const struct btree *b,
409                                      const struct bset_tree *t,
410                                      unsigned idx)
411 {
412         return ro_aux_tree_base(b, t)->f + idx;
413 }
414
415 static void bset_aux_tree_verify(struct btree *b)
416 {
417 #ifdef CONFIG_BCACHEFS_DEBUG
418         struct bset_tree *t;
419
420         for_each_bset(b, t) {
421                 if (t->aux_data_offset == U16_MAX)
422                         continue;
423
424                 BUG_ON(t != b->set &&
425                        t[-1].aux_data_offset == U16_MAX);
426
427                 BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
428                 BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
429                 BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
430         }
431 #endif
432 }
433
434 /* Memory allocation */
435
436 void bch2_btree_keys_free(struct btree *b)
437 {
438         vfree(b->aux_data);
439         b->aux_data = NULL;
440 }
441
442 #ifndef PAGE_KERNEL_EXEC
443 # define PAGE_KERNEL_EXEC PAGE_KERNEL
444 #endif
445
446 int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
447 {
448         b->page_order   = page_order;
449         b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
450                                     PAGE_KERNEL_EXEC);
451         if (!b->aux_data)
452                 return -ENOMEM;
453
454         return 0;
455 }
456
457 void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
458 {
459         unsigned i;
460
461         b->nsets                = 0;
462         memset(&b->nr, 0, sizeof(b->nr));
463 #ifdef CONFIG_BCACHEFS_DEBUG
464         b->expensive_debug_checks = expensive_debug_checks;
465 #endif
466         for (i = 0; i < MAX_BSETS; i++)
467                 b->set[i].data_offset = U16_MAX;
468
469         bch2_bset_set_no_aux_tree(b, b->set);
470 }
471
472 /* Binary tree stuff for auxiliary search trees */
473
474 /*
475  * Cacheline/offset <-> bkey pointer arithmetic:
476  *
477  * The ro aux tree is a binary search tree in an array; each node corresponds
478  * to a key in one cacheline of the bset (BSET_CACHELINE bytes).
479  *
480  * This means we don't have to store the full index of the key that a node in
481  * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
482  * then bkey_float->key_offset gives us the offset within that cacheline, in
483  * units of 8 bytes.
484  *
485  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
486  * make this work.
487  *
488  * To construct the bfloat for an arbitrary key we need to know what the key
489  * immediately preceding it is: we have to check if the two keys differ in the
490  * bits we're going to store in bkey_float->mantissa. ro_aux_tree_prev()[j]
491  * stores the previous key's size, so we can walk back to it from tree node j's key.
492  */
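
/*
 * Concretely (see cacheline_to_bkey() and tree_to_bkey() below), the key that
 * tree node j points to lives at
 *
 *     bset_cacheline(b, t, inorder(j)) + bkey_float(b, t, j)->key_offset * 8
 *
 * where inorder(j) is __eytzinger1_to_inorder(j, t->size, t->extra).
 */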
493
494 static inline void *bset_cacheline(const struct btree *b,
495                                    const struct bset_tree *t,
496                                    unsigned cacheline)
497 {
498         return (void *) round_down((unsigned long) btree_bkey_first(b, t),
499                                    L1_CACHE_BYTES) +
500                 cacheline * BSET_CACHELINE;
501 }
502
503 static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
504                                              const struct bset_tree *t,
505                                              unsigned cacheline,
506                                              unsigned offset)
507 {
508         return bset_cacheline(b, t, cacheline) + offset * 8;
509 }
510
511 static unsigned bkey_to_cacheline(const struct btree *b,
512                                   const struct bset_tree *t,
513                                   const struct bkey_packed *k)
514 {
515         return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
516 }
517
518 static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
519                                           const struct bset_tree *t,
520                                           unsigned cacheline,
521                                           const struct bkey_packed *k)
522 {
523         return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
524 }
525
526 static unsigned bkey_to_cacheline_offset(const struct btree *b,
527                                          const struct bset_tree *t,
528                                          unsigned cacheline,
529                                          const struct bkey_packed *k)
530 {
531         size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
532
533         EBUG_ON(m > U8_MAX);
534         return m;
535 }
536
537 static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
538                                                const struct bset_tree *t,
539                                                unsigned j)
540 {
541         return cacheline_to_bkey(b, t,
542                         __eytzinger1_to_inorder(j, t->size, t->extra),
543                         bkey_float(b, t, j)->key_offset);
544 }
545
546 static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
547                                              const struct bset_tree *t,
548                                              unsigned j)
549 {
550         unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
551
552         return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
553 }
554
555 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
556                                        const struct bset_tree *t)
557 {
558         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
559
560         return __aux_tree_base(b, t);
561 }
562
563 /*
564  * For the write set - the one we're currently inserting keys into - we don't
565  * maintain a full search tree, just a simple lookup table - the rw aux tree.
566  */
567 static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
568                                           struct bset_tree *t,
569                                           unsigned j)
570 {
571         return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
572 }
573
574 static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
575                             unsigned j, struct bkey_packed *k)
576 {
577         EBUG_ON(k >= btree_bkey_last(b, t));
578
579         rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
580                 .offset = __btree_node_key_to_offset(b, k),
581                 .k      = bkey_unpack_pos(b, k),
582         };
583 }
584
585 static void bch2_bset_verify_rw_aux_tree(struct btree *b,
586                                         struct bset_tree *t)
587 {
588         struct bkey_packed *k = btree_bkey_first(b, t);
589         unsigned j = 0;
590
591         if (!btree_keys_expensive_checks(b))
592                 return;
593
594         BUG_ON(bset_has_ro_aux_tree(t));
595
596         if (!bset_has_rw_aux_tree(t))
597                 return;
598
599         BUG_ON(t->size < 1);
600         BUG_ON(rw_aux_to_bkey(b, t, j) != k);
601
602         goto start;
603         while (1) {
604                 if (rw_aux_to_bkey(b, t, j) == k) {
605                         BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
606                                         bkey_unpack_pos(b, k)));
607 start:
608                         if (++j == t->size)
609                                 break;
610
611                         BUG_ON(rw_aux_tree(b, t)[j].offset <=
612                                rw_aux_tree(b, t)[j - 1].offset);
613                 }
614
615                 k = bkey_next(k);
616                 BUG_ON(k >= btree_bkey_last(b, t));
617         }
618 }
619
620 /* returns idx of first entry >= offset: */
621 static unsigned rw_aux_tree_bsearch(struct btree *b,
622                                     struct bset_tree *t,
623                                     unsigned offset)
624 {
625         unsigned bset_offs = offset - btree_bkey_first_offset(t);
626         unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
627         unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
628
629         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
630         EBUG_ON(!t->size);
631         EBUG_ON(idx > t->size);
632
633         while (idx < t->size &&
634                rw_aux_tree(b, t)[idx].offset < offset)
635                 idx++;
636
637         while (idx &&
638                rw_aux_tree(b, t)[idx - 1].offset >= offset)
639                 idx--;
640
641         EBUG_ON(idx < t->size &&
642                 rw_aux_tree(b, t)[idx].offset < offset);
643         EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
644         EBUG_ON(idx + 1 < t->size &&
645                 rw_aux_tree(b, t)[idx].offset ==
646                 rw_aux_tree(b, t)[idx + 1].offset);
647
648         return idx;
649 }
650
651 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
652                                      const struct bkey_float *f,
653                                      unsigned idx)
654 {
655         u64 v;
656
657         EBUG_ON(!bkey_packed(k));
658
659         v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
660
661         /*
662          * In little endian, we're shifting off low bits (and then the bits we
663          * want are at the low end), in big endian we're shifting off high bits
664          * (and then the bits we want are at the high end, so we shift them
665          * back down):
666          */
667 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
668         v >>= f->exponent & 7;
669 #else
670         v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
671 #endif
672         return (u16) v;
673 }
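
/*
 * The split of f->exponent above: exponent >> 3 is the byte offset of the
 * unaligned 64-bit load into the packed key, and exponent & 7 is the
 * remaining bit shift within the loaded word.
 */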
674
675 static void make_bfloat(struct btree *b, struct bset_tree *t,
676                         unsigned j,
677                         struct bkey_packed *min_key,
678                         struct bkey_packed *max_key)
679 {
680         struct bkey_float *f = bkey_float(b, t, j);
681         struct bkey_packed *m = tree_to_bkey(b, t, j);
682         struct bkey_packed *l, *r;
683         unsigned mantissa;
684         int shift, exponent, high_bit;
685
686         if (is_power_of_2(j)) {
687                 l = min_key;
688
689                 if (!l->u64s) {
690                         if (!bkey_pack_pos(l, b->data->min_key, b)) {
691                                 struct bkey_i tmp;
692
693                                 bkey_init(&tmp.k);
694                                 tmp.k.p = b->data->min_key;
695                                 bkey_copy(l, &tmp);
696                         }
697                 }
698         } else {
699                 l = tree_to_prev_bkey(b, t, j >> ffs(j));
700
701                 EBUG_ON(m < l);
702         }
703
704         if (is_power_of_2(j + 1)) {
705                 r = max_key;
706
707                 if (!r->u64s) {
708                         if (!bkey_pack_pos(r, t->max_key, b)) {
709                                 struct bkey_i tmp;
710
711                                 bkey_init(&tmp.k);
712                                 tmp.k.p = t->max_key;
713                                 bkey_copy(r, &tmp);
714                         }
715                 }
716         } else {
717                 r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
718
719                 EBUG_ON(m > r);
720         }
721
722         /*
723          * for failed bfloats, the lookup code falls back to comparing against
724          * the original key.
725          */
726
727         if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
728             !b->nr_key_bits) {
729                 f->exponent = BFLOAT_FAILED_UNPACKED;
730                 return;
731         }
732
733         /*
734          * The greatest differing bit of l and r is the first bit we must
735          * include in the bfloat mantissa we're creating in order to do
736          * comparisons - that bit always becomes the high bit of
737          * bfloat->mantissa, and thus the exponent we're calculating here is
738          * the position of what will become the low bit in bfloat->mantissa:
739          *
740          * Note that this may be negative - we may be running off the low end
741          * of the key: we handle this later:
742          */
743         high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
744                        min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
745         exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
746
747         /*
748          * Then we calculate the actual shift value, from the start of the key
749          * (k->_data), to get the key bits starting at exponent:
750          */
751 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
752         shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
753
754         EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
755 #else
756         shift = high_bit_offset +
757                 b->nr_key_bits -
758                 exponent -
759                 BKEY_MANTISSA_BITS;
760
761         EBUG_ON(shift < KEY_PACKED_BITS_START);
762 #endif
763         EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
764
765         f->exponent = shift;
766         mantissa = bkey_mantissa(m, f, j);
767
768         /*
769          * If we've got garbage bits, set them to all 1s - it's legal for the
770          * bfloat to compare larger than the original key, but not smaller:
771          */
772         if (exponent < 0)
773                 mantissa |= ~(~0U << -exponent);
774
775         f->mantissa = mantissa;
776 }
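
/*
 * Worked example for the exponent calculation above: if the greatest
 * differing bit of l and r is bit 40 (counting from the low end of the key),
 * then high_bit = 40 and exponent = 40 - (BKEY_MANTISSA_BITS - 1) = 25, so
 * the 16 bit mantissa holds key bits 25..40, with bit 40 as its high bit.
 */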
777
778 /* bytes remaining - only valid for last bset: */
779 static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
780 {
781         bset_aux_tree_verify(b);
782
783         return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
784 }
785
786 static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
787 {
788         return __bset_tree_capacity(b, t) /
789                 (sizeof(struct bkey_float) + sizeof(u8));
790 }
791
792 static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
793 {
794         return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
795 }
796
797 static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
798 {
799         struct bkey_packed *k;
800
801         t->size = 1;
802         t->extra = BSET_RW_AUX_TREE_VAL;
803         rw_aux_tree(b, t)[0].offset =
804                 __btree_node_key_to_offset(b, btree_bkey_first(b, t));
805
806         for (k = btree_bkey_first(b, t);
807              k != btree_bkey_last(b, t);
808              k = bkey_next(k)) {
809                 if (t->size == bset_rw_tree_capacity(b, t))
810                         break;
811
812                 if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
813                     L1_CACHE_BYTES)
814                         rw_aux_tree_set(b, t, t->size++, k);
815         }
816 }
817
818 static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
819 {
820         struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
821         struct bkey_packed min_key, max_key;
822         unsigned j, cacheline = 1;
823
824         /* signal to make_bfloat() that they're uninitialized: */
825         min_key.u64s = max_key.u64s = 0;
826
827         t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
828                       bset_ro_tree_capacity(b, t));
829 retry:
830         if (t->size < 2) {
831                 t->size = 0;
832                 t->extra = BSET_NO_AUX_TREE_VAL;
833                 return;
834         }
835
836         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
837
838         /* First we figure out where the first key in each cacheline is */
839         eytzinger1_for_each(j, t->size) {
840                 while (bkey_to_cacheline(b, t, k) < cacheline)
841                         prev = k, k = bkey_next(k);
842
843                 if (k >= btree_bkey_last(b, t)) {
844                         /* XXX: this path sucks */
845                         t->size--;
846                         goto retry;
847                 }
848
849                 ro_aux_tree_prev(b, t)[j] = prev->u64s;
850                 bkey_float(b, t, j)->key_offset =
851                         bkey_to_cacheline_offset(b, t, cacheline++, k);
852
853                 EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
854                 EBUG_ON(tree_to_bkey(b, t, j) != k);
855         }
856
857         while (bkey_next(k) != btree_bkey_last(b, t))
858                 k = bkey_next(k);
859
860         t->max_key = bkey_unpack_pos(b, k);
861
862         /* Then we build the tree */
863         eytzinger1_for_each(j, t->size)
864                 make_bfloat(b, t, j, &min_key, &max_key);
865 }
866
867 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
868 {
869         struct bset_tree *i;
870
871         for (i = b->set; i != t; i++)
872                 BUG_ON(bset_has_rw_aux_tree(i));
873
874         bch2_bset_set_no_aux_tree(b, t);
875
876         /* round up to next cacheline: */
877         t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
878                                       SMP_CACHE_BYTES / sizeof(u64));
879
880         bset_aux_tree_verify(b);
881 }
882
883 void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
884                              bool writeable)
885 {
886         if (writeable
887             ? bset_has_rw_aux_tree(t)
888             : bset_has_ro_aux_tree(t))
889                 return;
890
891         bset_alloc_tree(b, t);
892
893         if (!__bset_tree_capacity(b, t))
894                 return;
895
896         if (writeable)
897                 __build_rw_aux_tree(b, t);
898         else
899                 __build_ro_aux_tree(b, t);
900
901         bset_aux_tree_verify(b);
902 }
903
904 void bch2_bset_init_first(struct btree *b, struct bset *i)
905 {
906         struct bset_tree *t;
907
908         BUG_ON(b->nsets);
909
910         memset(i, 0, sizeof(*i));
911         get_random_bytes(&i->seq, sizeof(i->seq));
912         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
913
914         t = &b->set[b->nsets++];
915         set_btree_bset(b, t, i);
916 }
917
918 void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
919                          struct btree_node_entry *bne)
920 {
921         struct bset *i = &bne->keys;
922         struct bset_tree *t;
923
924         BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
925         BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
926         BUG_ON(b->nsets >= MAX_BSETS);
927
928         memset(i, 0, sizeof(*i));
929         i->seq = btree_bset_first(b)->seq;
930         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
931
932         t = &b->set[b->nsets++];
933         set_btree_bset(b, t, i);
934 }
935
936 /*
937  * find _some_ key in the same bset as @k that precedes @k - not necessarily the
938  * immediate predecessor:
939  */
940 static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
941                                        struct bkey_packed *k)
942 {
943         struct bkey_packed *p;
944         unsigned offset;
945         int j;
946
947         EBUG_ON(k < btree_bkey_first(b, t) ||
948                 k > btree_bkey_last(b, t));
949
950         if (k == btree_bkey_first(b, t))
951                 return NULL;
952
953         switch (bset_aux_tree_type(t)) {
954         case BSET_NO_AUX_TREE:
955                 p = btree_bkey_first(b, t);
956                 break;
957         case BSET_RO_AUX_TREE:
958                 j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
959
960                 do {
961                         p = j ? tree_to_bkey(b, t,
962                                         __inorder_to_eytzinger1(j--,
963                                                         t->size, t->extra))
964                               : btree_bkey_first(b, t);
965                 } while (p >= k);
966                 break;
967         case BSET_RW_AUX_TREE:
968                 offset = __btree_node_key_to_offset(b, k);
969                 j = rw_aux_tree_bsearch(b, t, offset);
970                 p = j ? rw_aux_to_bkey(b, t, j - 1)
971                       : btree_bkey_first(b, t);
972                 break;
973         }
974
975         return p;
976 }
977
978 struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
979                                           struct bset_tree *t,
980                                           struct bkey_packed *k,
981                                           unsigned min_key_type)
982 {
983         struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
984
985         while ((p = __bkey_prev(b, t, k)) && !ret) {
986                 for (i = p; i != k; i = bkey_next(i))
987                         if (i->type >= min_key_type)
988                                 ret = i;
989
990                 k = p;
991         }
992
993         if (btree_keys_expensive_checks(b)) {
994                 BUG_ON(ret >= orig_k);
995
996                 for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
997                      i != orig_k;
998                      i = bkey_next(i))
999                         BUG_ON(i->type >= min_key_type);
1000         }
1001
1002         return ret;
1003 }
1004
1005 /* Insert */
1006
1007 static void rw_aux_tree_fix_invalidated_key(struct btree *b,
1008                                             struct bset_tree *t,
1009                                             struct bkey_packed *k)
1010 {
1011         unsigned offset = __btree_node_key_to_offset(b, k);
1012         unsigned j = rw_aux_tree_bsearch(b, t, offset);
1013
1014         if (j < t->size &&
1015             rw_aux_tree(b, t)[j].offset == offset)
1016                 rw_aux_tree_set(b, t, j, k);
1017
1018         bch2_bset_verify_rw_aux_tree(b, t);
1019 }
1020
1021 static void ro_aux_tree_fix_invalidated_key(struct btree *b,
1022                                             struct bset_tree *t,
1023                                             struct bkey_packed *k)
1024 {
1025         struct bkey_packed min_key, max_key;
1026         unsigned inorder, j;
1027
1028         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
1029
1030         /* signal to make_bfloat() that they're uninitialized: */
1031         min_key.u64s = max_key.u64s = 0;
1032
1033         if (bkey_next(k) == btree_bkey_last(b, t)) {
1034                 t->max_key = bkey_unpack_pos(b, k);
1035
1036                 for (j = 1; j < t->size; j = j * 2 + 1)
1037                         make_bfloat(b, t, j, &min_key, &max_key);
1038         }
1039
1040         inorder = bkey_to_cacheline(b, t, k);
1041
1042         if (inorder &&
1043             inorder < t->size) {
1044                 j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1045
1046                 if (k == tree_to_bkey(b, t, j)) {
1047                         /* Fix the node this key corresponds to */
1048                         make_bfloat(b, t, j, &min_key, &max_key);
1049
1050                         /* Children for which this key is the right boundary */
1051                         for (j = eytzinger1_left_child(j);
1052                              j < t->size;
1053                              j = eytzinger1_right_child(j))
1054                                 make_bfloat(b, t, j, &min_key, &max_key);
1055                 }
1056         }
1057
1058         if (inorder + 1 < t->size) {
1059                 j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
1060
1061                 if (k == tree_to_prev_bkey(b, t, j)) {
1062                         make_bfloat(b, t, j, &min_key, &max_key);
1063
1064                         /* Children for which this key is the left boundary */
1065                         for (j = eytzinger1_right_child(j);
1066                              j < t->size;
1067                              j = eytzinger1_left_child(j))
1068                                 make_bfloat(b, t, j, &min_key, &max_key);
1069                 }
1070         }
1071 }
1072
1073 /**
1074  * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
1075  * modified, fix any auxiliary search tree by remaking all the nodes in the
1076  * auxiliary search tree that @k corresponds to
1077  */
1078 void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
1079 {
1080         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1081
1082         switch (bset_aux_tree_type(t)) {
1083         case BSET_NO_AUX_TREE:
1084                 break;
1085         case BSET_RO_AUX_TREE:
1086                 ro_aux_tree_fix_invalidated_key(b, t, k);
1087                 break;
1088         case BSET_RW_AUX_TREE:
1089                 rw_aux_tree_fix_invalidated_key(b, t, k);
1090                 break;
1091         }
1092 }
1093
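/*
 * Keep the rw aux tree in sync after an insert/delete that replaced
 * clobber_u64s worth of keys at @_where with new_u64s worth: drop lookup
 * table entries that pointed into the clobbered range, shift the offsets of
 * everything after it, and add an entry back if the gap between neighbouring
 * entries has grown past a cacheline.
 */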
1094 static void bch2_bset_fix_lookup_table(struct btree *b,
1095                                        struct bset_tree *t,
1096                                        struct bkey_packed *_where,
1097                                        unsigned clobber_u64s,
1098                                        unsigned new_u64s)
1099 {
1100         int shift = new_u64s - clobber_u64s;
1101         unsigned l, j, where = __btree_node_key_to_offset(b, _where);
1102
1103         EBUG_ON(bset_has_ro_aux_tree(t));
1104
1105         if (!bset_has_rw_aux_tree(t))
1106                 return;
1107
1108         /* returns first entry >= where */
1109         l = rw_aux_tree_bsearch(b, t, where);
1110
1111         if (!l) /* never delete first entry */
1112                 l++;
1113         else if (l < t->size &&
1114                  where < t->end_offset &&
1115                  rw_aux_tree(b, t)[l].offset == where)
1116                 rw_aux_tree_set(b, t, l++, _where);
1117
1118         /* l now > where */
1119
1120         for (j = l;
1121              j < t->size &&
1122              rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
1123              j++)
1124                 ;
1125
1126         if (j < t->size &&
1127             rw_aux_tree(b, t)[j].offset + shift ==
1128             rw_aux_tree(b, t)[l - 1].offset)
1129                 j++;
1130
1131         memmove(&rw_aux_tree(b, t)[l],
1132                 &rw_aux_tree(b, t)[j],
1133                 (void *) &rw_aux_tree(b, t)[t->size] -
1134                 (void *) &rw_aux_tree(b, t)[j]);
1135         t->size -= j - l;
1136
1137         for (j = l; j < t->size; j++)
1138                rw_aux_tree(b, t)[j].offset += shift;
1139
1140         EBUG_ON(l < t->size &&
1141                 rw_aux_tree(b, t)[l].offset ==
1142                 rw_aux_tree(b, t)[l - 1].offset);
1143
1144         if (t->size < bset_rw_tree_capacity(b, t) &&
1145             (l < t->size
1146              ? rw_aux_tree(b, t)[l].offset
1147              : t->end_offset) -
1148             rw_aux_tree(b, t)[l - 1].offset >
1149             L1_CACHE_BYTES / sizeof(u64)) {
1150                 struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
1151                 struct bkey_packed *end = l < t->size
1152                         ? rw_aux_to_bkey(b, t, l)
1153                         : btree_bkey_last(b, t);
1154                 struct bkey_packed *k = start;
1155
1156                 while (1) {
1157                         k = bkey_next(k);
1158                         if (k == end)
1159                                 break;
1160
1161                         if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
1162                                 memmove(&rw_aux_tree(b, t)[l + 1],
1163                                         &rw_aux_tree(b, t)[l],
1164                                         (void *) &rw_aux_tree(b, t)[t->size] -
1165                                         (void *) &rw_aux_tree(b, t)[l]);
1166                                 t->size++;
1167                                 rw_aux_tree_set(b, t, l, k);
1168                                 break;
1169                         }
1170                 }
1171         }
1172
1173         bch2_bset_verify_rw_aux_tree(b, t);
1174         bset_aux_tree_verify(b);
1175 }
1176
1177 void bch2_bset_insert(struct btree *b,
1178                       struct btree_node_iter *iter,
1179                       struct bkey_packed *where,
1180                       struct bkey_i *insert,
1181                       unsigned clobber_u64s)
1182 {
1183         struct bkey_format *f = &b->format;
1184         struct bset_tree *t = bset_tree_last(b);
1185         struct bkey_packed packed, *src = bkey_to_packed(insert);
1186
1187         bch2_bset_verify_rw_aux_tree(b, t);
1188         bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
1189
1190         if (bch2_bkey_pack_key(&packed, &insert->k, f))
1191                 src = &packed;
1192
1193         if (!bkey_whiteout(&insert->k))
1194                 btree_keys_account_key_add(&b->nr, t - b->set, src);
1195
1196         if (src->u64s != clobber_u64s) {
1197                 u64 *src_p = where->_data + clobber_u64s;
1198                 u64 *dst_p = where->_data + src->u64s;
1199
1200                 EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
1201                         (int) clobber_u64s - src->u64s);
1202
1203                 memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1204                 le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
1205                 set_btree_bset_end(b, t);
1206         }
1207
1208         memcpy_u64s(where, src,
1209                     bkeyp_key_u64s(f, src));
1210         memcpy_u64s(bkeyp_val(f, where), &insert->v,
1211                     bkeyp_val_u64s(f, src));
1212
1213         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
1214
1215         bch2_verify_btree_nr_keys(b);
1216 }
1217
1218 void bch2_bset_delete(struct btree *b,
1219                       struct bkey_packed *where,
1220                       unsigned clobber_u64s)
1221 {
1222         struct bset_tree *t = bset_tree_last(b);
1223         u64 *src_p = where->_data + clobber_u64s;
1224         u64 *dst_p = where->_data;
1225
1226         bch2_bset_verify_rw_aux_tree(b, t);
1227
1228         EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
1229
1230         memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1231         le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
1232         set_btree_bset_end(b, t);
1233
1234         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
1235 }
1236
1237 /* Lookup */
1238
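/*
 * Binary search over the rw aux tree: returns the key at the last lookup
 * table entry whose cached position compares less than @search (or the first
 * entry, if none do) - the caller's linear scan takes it from there.
 */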
1239 __flatten
1240 static struct bkey_packed *bset_search_write_set(const struct btree *b,
1241                                 struct bset_tree *t,
1242                                 struct bpos *search,
1243                                 const struct bkey_packed *packed_search)
1244 {
1245         unsigned l = 0, r = t->size;
1246
1247         while (l + 1 != r) {
1248                 unsigned m = (l + r) >> 1;
1249
1250                 if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
1251                         l = m;
1252                 else
1253                         r = m;
1254         }
1255
1256         return rw_aux_to_bkey(b, t, l);
1257 }
1258
1259 static inline void prefetch_four_cachelines(void *p)
1260 {
1261 #ifdef CONFIG_X86_64
1262         asm(".intel_syntax noprefix;"
1263             "prefetcht0 [%0 - 127 + 64 * 0];"
1264             "prefetcht0 [%0 - 127 + 64 * 1];"
1265             "prefetcht0 [%0 - 127 + 64 * 2];"
1266             "prefetcht0 [%0 - 127 + 64 * 3];"
1267             ".att_syntax prefix;"
1268             :
1269             : "r" (p + 127));
1270 #else
1271         prefetch(p + L1_CACHE_BYTES * 0);
1272         prefetch(p + L1_CACHE_BYTES * 1);
1273         prefetch(p + L1_CACHE_BYTES * 2);
1274         prefetch(p + L1_CACHE_BYTES * 3);
1275 #endif
1276 }
1277
1278 static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
1279                                               const struct bkey_float *f,
1280                                               unsigned idx)
1281 {
1282 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1283         unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
1284
1285         return f->exponent > key_bits_start;
1286 #else
1287         unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
1288
1289         return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
1290 #endif
1291 }
1292
1293 __flatten
1294 static struct bkey_packed *bset_search_tree(const struct btree *b,
1295                                 struct bset_tree *t,
1296                                 struct bpos *search,
1297                                 const struct bkey_packed *packed_search)
1298 {
1299         struct ro_aux_tree *base = ro_aux_tree_base(b, t);
1300         struct bkey_float *f;
1301         struct bkey_packed *k;
1302         unsigned inorder, n = 1, l, r;
1303         int cmp;
1304
1305         do {
1306                 if (likely(n << 4 < t->size))
1307                         prefetch(&base->f[n << 4]);
1308
1309                 f = &base->f[n];
1310
1311                 if (!unlikely(packed_search))
1312                         goto slowpath;
1313                 if (unlikely(f->exponent >= BFLOAT_FAILED))
1314                         goto slowpath;
1315
1316                 l = f->mantissa;
1317                 r = bkey_mantissa(packed_search, f, n);
1318
1319                 if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
1320                         goto slowpath;
1321
1322                 n = n * 2 + (l < r);
1323                 continue;
1324 slowpath:
1325                 k = tree_to_bkey(b, t, n);
1326                 cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
1327                 if (!cmp)
1328                         return k;
1329
1330                 n = n * 2 + (cmp < 0);
1331         } while (n < t->size);
1332
1333         inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
1334
1335         /*
1336          * n would have been the node we recursed to - the low bit tells us if
1337          * we recursed left or recursed right.
1338          */
1339         if (likely(!(n & 1))) {
1340                 --inorder;
1341                 if (unlikely(!inorder))
1342                         return btree_bkey_first(b, t);
1343
1344                 f = &base->f[eytzinger1_prev(n >> 1, t->size)];
1345         }
1346
1347         return cacheline_to_bkey(b, t, inorder, f->key_offset);
1348 }
1349
1350 static __always_inline __flatten
1351 struct bkey_packed *__bch2_bset_search(struct btree *b,
1352                                 struct bset_tree *t,
1353                                 struct bpos *search,
1354                                 const struct bkey_packed *lossy_packed_search)
1355 {
1356
1357         /*
1358          * First we search for a cacheline, then we do a linear search
1359          * within that cacheline.
1360          *
1361          * To search for the cacheline, there are three different possibilities:
1362          *  * The set is too small to have a search tree, so we just do a linear
1363          *    search over the whole set.
1364          *  * The set is the one we're currently inserting into; keeping a full
1365          *    auxiliary search tree up to date would be too expensive, so we
1366          *    use a much simpler lookup table to do a binary search -
1367          *    bset_search_write_set().
1368          *  * Or we use the auxiliary search tree we constructed earlier -
1369          *    bset_search_tree()
1370          */
1371
1372         switch (bset_aux_tree_type(t)) {
1373         case BSET_NO_AUX_TREE:
1374                 return btree_bkey_first(b, t);
1375         case BSET_RW_AUX_TREE:
1376                 return bset_search_write_set(b, t, search, lossy_packed_search);
1377         case BSET_RO_AUX_TREE:
1378                 /*
1379                  * Each node in the auxiliary search tree covers a certain range
1380                  * of bits, and keys above and below the set it covers might
1381                  * differ outside those bits - so we have to special case the
1382                  * start and end - handle that here:
1383                  */
1384
1385                 if (bkey_cmp(*search, t->max_key) > 0)
1386                         return btree_bkey_last(b, t);
1387
1388                 return bset_search_tree(b, t, search, lossy_packed_search);
1389         default:
1390                 unreachable();
1391         }
1392 }
1393
1394 static __always_inline __flatten
1395 struct bkey_packed *bch2_bset_search_linear(struct btree *b,
1396                                 struct bset_tree *t,
1397                                 struct bpos *search,
1398                                 struct bkey_packed *packed_search,
1399                                 const struct bkey_packed *lossy_packed_search,
1400                                 struct bkey_packed *m)
1401 {
1402         if (lossy_packed_search)
1403                 while (m != btree_bkey_last(b, t) &&
1404                        bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
1405                                               m) > 0)
1406                         m = bkey_next(m);
1407
1408         if (!packed_search)
1409                 while (m != btree_bkey_last(b, t) &&
1410                        bkey_iter_pos_cmp(b, search, m) > 0)
1411                         m = bkey_next(m);
1412
1413         if (btree_keys_expensive_checks(b)) {
1414                 struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
1415
1416                 BUG_ON(prev &&
1417                        bkey_iter_cmp_p_or_unp(b, search, packed_search,
1418                                               prev) <= 0);
1419         }
1420
1421         return m;
1422 }
1423
1424 /*
1425  * Returns the first key greater than or equal to @search
1426  */
1427 static __always_inline __flatten
1428 struct bkey_packed *bch2_bset_search(struct btree *b,
1429                                 struct bset_tree *t,
1430                                 struct bpos *search,
1431                                 struct bkey_packed *packed_search,
1432                                 const struct bkey_packed *lossy_packed_search)
1433 {
1434         struct bkey_packed *m = __bch2_bset_search(b, t, search,
1435                                                    lossy_packed_search);
1436
1437         return bch2_bset_search_linear(b, t, search,
1438                                  packed_search, lossy_packed_search, m);
1439 }
1440
1441 /* Btree node iterator */
1442
1443 static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
1444                               struct btree *b,
1445                               const struct bkey_packed *k,
1446                               const struct bkey_packed *end)
1447 {
1448         if (k != end) {
1449                 struct btree_node_iter_set *pos;
1450
1451                 btree_node_iter_for_each(iter, pos)
1452                         ;
1453
1454                 BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
1455                 *pos = (struct btree_node_iter_set) {
1456                         __btree_node_key_to_offset(b, k),
1457                         __btree_node_key_to_offset(b, end)
1458                 };
1459         }
1460 }
1461
1462 void bch2_btree_node_iter_push(struct btree_node_iter *iter,
1463                                struct btree *b,
1464                                const struct bkey_packed *k,
1465                                const struct bkey_packed *end)
1466 {
1467         __bch2_btree_node_iter_push(iter, b, k, end);
1468         bch2_btree_node_iter_sort(iter, b);
1469 }
1470
1471 noinline __flatten __attribute__((cold))
1472 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
1473                               struct btree *b, struct bpos *search)
1474 {
1475         struct bset_tree *t;
1476
1477         trace_bkey_pack_pos_fail(search);
1478
1479         for_each_bset(b, t)
1480                 __bch2_btree_node_iter_push(iter, b,
1481                         bch2_bset_search(b, t, search, NULL, NULL),
1482                         btree_bkey_last(b, t));
1483
1484         bch2_btree_node_iter_sort(iter, b);
1485 }
1486
1487 /**
1488  * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
1489  * given position
1490  *
1491  * Main entry point to the lookup code for individual btree nodes:
1492  *
1493  * NOTE:
1494  *
1495  * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
1496  * keys. This doesn't matter for most code, but it does matter for lookups.
1497  *
1498  * Some adjacent keys with a string of equal keys:
1499  *      i j k k k k l m
1500  *
1501  * If you search for k, the lookup code isn't guaranteed to return you any
1502  * specific k. The lookup code is conceptually doing a binary search and
1503  * iterating backwards is very expensive so if the pivot happens to land at the
1504  * last k that's what you'll get.
1505  *
1506  * This works out ok, but it's something to be aware of:
1507  *
1508  *  - For non extents, we guarantee that the live key comes last - see
1509  *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
1510  *    see will only be deleted keys you don't care about.
1511  *
1512  *  - For extents, deleted keys sort last (see the comment at the top of this
1513  *    file). But when you're searching for extents, you actually want the first
1514  *    key strictly greater than your search key - an extent that compares equal
1515  *    to the search key is going to have 0 sectors after the search key.
1516  *
1517  *    But this does mean that we can't just search for
1518  *    bkey_successor(start_of_range) to get the first extent that overlaps with
1519  *    the range we want - if we're unlucky and there's an extent that ends
1520  *    exactly where we searched, then there could be a deleted key at the same
1521  *    position and we'd get that when we search instead of the preceding extent
1522  *    we needed.
1523  *
1524  *    So we've got to search for start_of_range, then after the lookup iterate
1525  *    past any extents that compare equal to the position we searched for (see the sketch after bch2_btree_node_iter_init() below).
1526  */
1527 __flatten
1528 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
1529                                struct btree *b, struct bpos *search)
1530 {
1531         struct bkey_packed p, *packed_search = NULL;
1532         struct btree_node_iter_set *pos = iter->data;
1533         struct bkey_packed *k[MAX_BSETS];
1534         unsigned i;
1535
1536         EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
1537         bset_aux_tree_verify(b);
1538
1539         memset(iter, 0, sizeof(*iter));
1540
1541         switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
1542         case BKEY_PACK_POS_EXACT:
1543                 packed_search = &p;
1544                 break;
1545         case BKEY_PACK_POS_SMALLER:
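                /*
                 * The packed position compares less than the real search key,
                 * so it's only usable as a lossy lower bound - the linear
                 * search afterwards has to compare against the unpacked key:
                 */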
1546                 packed_search = NULL;
1547                 break;
1548         case BKEY_PACK_POS_FAIL:
1549                 btree_node_iter_init_pack_failed(iter, b, search);
1550                 return;
1551         }
1552
1553         for (i = 0; i < b->nsets; i++) {
1554                 k[i] = __bch2_bset_search(b, b->set + i, search, &p);
1555                 prefetch_four_cachelines(k[i]);
1556         }
1557
1558         for (i = 0; i < b->nsets; i++) {
1559                 struct bset_tree *t = b->set + i;
1560                 struct bkey_packed *end = btree_bkey_last(b, t);
1561
1562                 k[i] = bch2_bset_search_linear(b, t, search,
1563                                                packed_search, &p, k[i]);
1564                 if (k[i] != end)
1565                         *pos++ = (struct btree_node_iter_set) {
1566                                 __btree_node_key_to_offset(b, k[i]),
1567                                 __btree_node_key_to_offset(b, end)
1568                         };
1569         }
1570
1571         bch2_btree_node_iter_sort(iter, b);
1572 }
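
/*
 * Editor's illustrative sketch (not part of the original file): the helper
 * name first_extent_after() is hypothetical, the node is assumed to already
 * be locked, and the search is confined to this one node. It shows the
 * pattern described above bch2_btree_node_iter_init(): search for
 * start_of_range, then iterate past extents that compare equal to the search
 * position before using the result.
 */
#if 0
static struct bkey_s_c first_extent_after(struct btree *b,
                                          struct bpos start_of_range,
                                          struct bkey *u)
{
        struct btree_node_iter iter;
        struct bkey_s_c k;

        bch2_btree_node_iter_init(&iter, b, &start_of_range);

        /*
         * An extent whose end is <= start_of_range has 0 sectors past the
         * search position, so it can't overlap the range - skip it:
         */
        while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, u)).k &&
               bkey_cmp(k.k->p, start_of_range) <= 0)
                bch2_btree_node_iter_advance(&iter, b);

        /* k.k is NULL if no such extent exists in this node */
        return k;
}
#endif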
1573
1574 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1575                                           struct btree *b)
1576 {
1577         struct bset_tree *t;
1578
1579         memset(iter, 0, sizeof(*iter));
1580
1581         for_each_bset(b, t)
1582                 __bch2_btree_node_iter_push(iter, b,
1583                                            btree_bkey_first(b, t),
1584                                            btree_bkey_last(b, t));
1585         bch2_btree_node_iter_sort(iter, b);
1586 }
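
/*
 * Editor's illustrative sketch (not part of the original file): walking every
 * key in a node, whiteouts included, with the from-start iterator. The
 * function name walk_node_keys() and the pr_debug() output are made up for
 * the example; the node is assumed to be locked.
 */
#if 0
static void walk_node_keys(struct btree *b)
{
        struct btree_node_iter iter;
        struct bkey_packed *k;
        struct bkey u;

        for (bch2_btree_node_iter_init_from_start(&iter, b);
             (k = bch2_btree_node_iter_peek_all(&iter, b));
             bch2_btree_node_iter_advance(&iter, b)) {
                u = bkey_unpack_key(b, k);
                pr_debug("%llu:%llu\n", u.p.inode, u.p.offset);
        }
}
#endif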
1587
1588 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1589                                                   struct btree *b,
1590                                                   struct bset_tree *t)
1591 {
1592         struct btree_node_iter_set *set;
1593
1594         btree_node_iter_for_each(iter, set)
1595                 if (set->end == t->end_offset)
1596                         return __btree_node_offset_to_key(b, set->k);
1597
1598         return btree_bkey_last(b, t);
1599 }
1600
1601 static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
1602                                             struct btree *b,
1603                                             unsigned first)
1604 {
1605         bool ret;
1606
1607         if ((ret = (btree_node_iter_cmp(b,
1608                                         iter->data[first],
1609                                         iter->data[first + 1]) > 0)))
1610                 swap(iter->data[first], iter->data[first + 1]);
1611         return ret;
1612 }
1613
1614 void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
1615                                struct btree *b)
1616 {
1617         /* unrolled bubble sort over the (at most three) iterator sets: */
1618
1619         if (!__btree_node_iter_set_end(iter, 2)) {
1620                 btree_node_iter_sort_two(iter, b, 0);
1621                 btree_node_iter_sort_two(iter, b, 1);
1622         }
1623
1624         if (!__btree_node_iter_set_end(iter, 1))
1625                 btree_node_iter_sort_two(iter, b, 0);
1626 }
1627
1628 void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
1629                                    struct btree_node_iter_set *set)
1630 {
1631         struct btree_node_iter_set *last =
1632                 iter->data + ARRAY_SIZE(iter->data) - 1;
1633
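        /* shift the remaining sets down and clear the now-unused last slot: */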
1634         memmove(&set[0], &set[1], (void *) last - (void *) set);
1635         *last = (struct btree_node_iter_set) { 0, 0 };
1636 }
1637
1638 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1639                                                   struct btree *b)
1640 {
1641         iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
1642
1643         EBUG_ON(iter->data->k > iter->data->end);
1644
1645         if (unlikely(__btree_node_iter_set_end(iter, 0))) {
1646                 bch2_btree_node_iter_set_drop(iter, iter->data);
1647                 return;
1648         }
1649
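        /*
         * Set 0 still has keys: restore sort order by sinking it past any sets
         * that now compare smaller (at most two compare/swap steps):
         */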
1650         if (__btree_node_iter_set_end(iter, 1))
1651                 return;
1652
1653         if (!btree_node_iter_sort_two(iter, b, 0))
1654                 return;
1655
1656         if (__btree_node_iter_set_end(iter, 2))
1657                 return;
1658
1659         btree_node_iter_sort_two(iter, b, 1);
1660 }
1661
1662 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1663                                   struct btree *b)
1664 {
1665         if (btree_keys_expensive_checks(b)) {
1666                 bch2_btree_node_iter_verify(iter, b);
1667                 bch2_btree_node_iter_next_check(iter, b);
1668         }
1669
1670         __bch2_btree_node_iter_advance(iter, b);
1671 }
1672
1673 /*
1674  * Expensive: has to search every bset for the key preceding the iterator's position
1675  */
1676 struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
1677                                                   struct btree *b)
1678 {
1679         struct bkey_packed *k, *prev = NULL;
1680         struct btree_node_iter_set *set;
1681         struct bset_tree *t;
1682         unsigned end = 0;
1683
1684         bch2_btree_node_iter_verify(iter, b);
1685
1686         for_each_bset(b, t) {
1687                 k = bch2_bkey_prev_all(b, t,
1688                         bch2_btree_node_iter_bset_pos(iter, b, t));
1689                 if (k &&
1690                     (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
1691                         prev = k;
1692                         end = t->end_offset;
1693                 }
1694         }
1695
1696         if (!prev)
1697                 return NULL;
1698
1699         /*
1700          * We're manually memmoving instead of just calling sort() to ensure the
1701          * prev we picked ends up in slot 0 - sort won't necessarily put it
1702          * there because of duplicate deleted keys:
1703          */
1704         btree_node_iter_for_each(iter, set)
1705                 if (set->end == end)
1706                         goto found;
1707
1708         BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
1709 found:
1710         BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
1711
1712         memmove(&iter->data[1],
1713                 &iter->data[0],
1714                 (void *) set - (void *) &iter->data[0]);
1715
1716         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1717         iter->data[0].end = end;
1718
1719         bch2_btree_node_iter_verify(iter, b);
1720         return prev;
1721 }
1722
1723 struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
1724                                                      struct btree *b,
1725                                                      unsigned min_key_type)
1726 {
1727         struct bkey_packed *prev;
1728
1729         do {
1730                 prev = bch2_btree_node_iter_prev_all(iter, b);
1731         } while (prev && prev->type < min_key_type);
1732
1733         return prev;
1734 }
1735
1736 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1737                                                  struct btree *b,
1738                                                  struct bkey *u)
1739 {
1740         struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
1741
1742         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1743 }
1744
1745 /* Stats */
1746
1747 void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1748 {
1749         struct bset_tree *t;
1750
1751         for_each_bset(b, t) {
1752                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1753                 size_t j;
1754
1755                 stats->sets[type].nr++;
1756                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1757                         sizeof(u64);
1758
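                /*
                 * For ro aux trees, count the bkey_floats and how many of them
                 * failed (BFLOAT_FAILED) and need a full key comparison:
                 */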
1759                 if (bset_has_ro_aux_tree(t)) {
1760                         stats->floats += t->size - 1;
1761
1762                         for (j = 1; j < t->size; j++)
1763                                 stats->failed +=
1764                                         bkey_float(b, t, j)->exponent ==
1765                                         BFLOAT_FAILED;
1766                 }
1767         }
1768 }
1769
1770 void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
1771                          struct bkey_packed *k)
1772 {
1773         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1774         struct bkey uk;
1775         unsigned j, inorder;
1776
1777         if (out->pos != out->end)
1778                 *out->pos = '\0';
1779
1780         if (!bset_has_ro_aux_tree(t))
1781                 return;
1782
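        /* only keys that are pivots in the ro aux (eytzinger) tree have a bkey_float: */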
1783         inorder = bkey_to_cacheline(b, t, k);
1784         if (!inorder || inorder >= t->size)
1785                 return;
1786
1787         j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1788         if (k != tree_to_bkey(b, t, j))
1789                 return;
1790
1791         switch (bkey_float(b, t, j)->exponent) {
1792         case BFLOAT_FAILED:
1793                 uk = bkey_unpack_key(b, k);
1794                 pr_buf(out,
1795                        "    failed unpacked at depth %u\n"
1796                        "\t%llu:%llu\n",
1797                        ilog2(j),
1798                        uk.p.inode, uk.p.offset);
1799                 break;
1800         }
1801 }