libbcachefs/bset.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code for working with individual keys, and sorted sets of keys within a
4  * btree node
5  *
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "btree_cache.h"
11 #include "bset.h"
12 #include "eytzinger.h"
13 #include "util.h"
14
15 #include <asm/unaligned.h>
16 #include <linux/console.h>
17 #include <linux/random.h>
18 #include <linux/prefetch.h>
19
20 /* hack.. */
21 #include "alloc_types.h"
22 #include <trace/events/bcachefs.h>
23
24 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
25                                                   struct btree *);
26
27 static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
28 {
29         unsigned n = ARRAY_SIZE(iter->data);
30
31         while (n && __btree_node_iter_set_end(iter, n - 1))
32                 --n;
33
34         return n;
35 }
36
37 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
38 {
39         unsigned offset = __btree_node_key_to_offset(b, k);
40         struct bset_tree *t;
41
42         for_each_bset(b, t)
43                 if (offset <= t->end_offset) {
44                         EBUG_ON(offset < btree_bkey_first_offset(t));
45                         return t;
46                 }
47
48         BUG();
49 }
50
51 /*
52  * There are never duplicate live keys in the btree - but once we include keys
53  * that have been flagged as deleted (and will be cleaned up later) we _will_
54  * see duplicates.
55  *
56  * Thus the sort order is: usual key comparison first, but for keys that compare
57  * equal the deleted key(s) come first, and the (at most one) live version comes
58  * last.
59  *
60  * The main reason for this is insertion: to handle overwrites, we first iterate
61  * over keys that compare equal to our insert key, and then insert immediately
62  * prior to the first key greater than the key we're inserting - our insert
63  * position will be after all keys that compare equal to our insert key, which
64  * by the time we actually do the insert will all be deleted.
65  */
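
#if 0
/*
 * Illustrative sketch (not built): a toy version of the insert-position rule
 * described above.  For keys that compare equal by position, deleted keys
 * sort first and the (at most one) live key sorts last; an insert goes
 * immediately before the first key strictly greater than the insert key,
 * i.e. after every key that compares equal to it.  The struct and helper
 * below are hypothetical, for illustration only.
 */
struct toy_key { u64 pos; bool deleted; };

static unsigned toy_insert_idx(const struct toy_key *keys, unsigned nr, u64 pos)
{
        unsigned i = 0;

        /* skip past every key whose position is <= the insert position: */
        while (i < nr && keys[i].pos <= pos)
                i++;

        return i;
}
#endif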
66
67 void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
68 {
69         struct bkey_packed *_k, *_n;
70         struct bkey k, n;
71         char buf[120];
72
73         if (!i->u64s)
74                 return;
75
76         for (_k = i->start, k = bkey_unpack_key(b, _k);
77              _k < vstruct_last(i);
78              _k = _n, k = n) {
79                 _n = bkey_next_skip_noops(_k, vstruct_last(i));
80
81                 bch2_bkey_to_text(&PBUF(buf), &k);
82                 printk(KERN_ERR "block %u key %5zu: %s\n", set,
83                        _k->_data - i->_data, buf);
84
85                 if (_n == vstruct_last(i))
86                         continue;
87
88                 n = bkey_unpack_key(b, _n);
89
90                 if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
91                         printk(KERN_ERR "Key skipped backwards\n");
92                         continue;
93                 }
94
95                 /*
96                  * Weird check for duplicate non-extent keys: extents are
97                  * deleted iff they have 0 size, so if a key has zero size and
98                  * isn't deleted, these aren't extents:
99                  */
100                 if (((!k.size && !bkey_deleted(&k)) ||
101                      (!n.size && !bkey_deleted(&n))) &&
102                     !bkey_deleted(&k) &&
103                     !bkey_cmp(n.p, k.p))
104                         printk(KERN_ERR "Duplicate keys\n");
105         }
106 }
107
108 void bch2_dump_btree_node(struct btree *b)
109 {
110         struct bset_tree *t;
111
112         console_lock();
113         for_each_bset(b, t)
114                 bch2_dump_bset(b, bset(b, t), t - b->set);
115         console_unlock();
116 }
117
118 void bch2_dump_btree_node_iter(struct btree *b,
119                               struct btree_node_iter *iter)
120 {
121         struct btree_node_iter_set *set;
122
123         printk(KERN_ERR "btree node iter with %u/%u sets:\n",
124                __btree_node_iter_used(iter), b->nsets);
125
126         btree_node_iter_for_each(iter, set) {
127                 struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
128                 struct bset_tree *t = bch2_bkey_to_bset(b, k);
129                 struct bkey uk = bkey_unpack_key(b, k);
130                 char buf[100];
131
132                 bch2_bkey_to_text(&PBUF(buf), &uk);
133                 printk(KERN_ERR "set %zu key %u: %s\n",
134                        t - b->set, set->k, buf);
135         }
136 }
137
138 #ifdef CONFIG_BCACHEFS_DEBUG
139
140 void __bch2_verify_btree_nr_keys(struct btree *b)
141 {
142         struct bset_tree *t;
143         struct bkey_packed *k;
144         struct btree_nr_keys nr = { 0 };
145
146         for_each_bset(b, t)
147                 bset_tree_for_each_key(b, t, k)
148                         if (!bkey_whiteout(k))
149                                 btree_keys_account_key_add(&nr, t - b->set, k);
150
151         BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
152 }
153
154 static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
155                                             struct btree *b)
156 {
157         struct btree_node_iter iter = *_iter;
158         const struct bkey_packed *k, *n;
159
160         k = bch2_btree_node_iter_peek_all(&iter, b);
161         __bch2_btree_node_iter_advance(&iter, b);
162         n = bch2_btree_node_iter_peek_all(&iter, b);
163
164         bkey_unpack_key(b, k);
165
166         if (n &&
167             bkey_iter_cmp(b, k, n) > 0) {
168                 struct btree_node_iter_set *set;
169                 struct bkey ku = bkey_unpack_key(b, k);
170                 struct bkey nu = bkey_unpack_key(b, n);
171                 char buf1[80], buf2[80];
172
173                 bch2_dump_btree_node(b);
174                 bch2_bkey_to_text(&PBUF(buf1), &ku);
175                 bch2_bkey_to_text(&PBUF(buf2), &nu);
176                 printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
177                        buf1, buf2);
178                 printk(KERN_ERR "iter was:");
179
180                 btree_node_iter_for_each(_iter, set) {
181                         struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
182                         struct bset_tree *t = bch2_bkey_to_bset(b, k);
183                         printk(" [%zi %zi]", t - b->set,
184                                k->_data - bset(b, t)->_data);
185                 }
186                 panic("\n");
187         }
188 }
189
190 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
191                                  struct btree *b)
192 {
193         struct btree_node_iter_set *set, *s2;
194         struct bkey_packed *k, *p;
195         struct bset_tree *t;
196
197         if (bch2_btree_node_iter_end(iter))
198                 return;
199
200         /* Verify no duplicates: */
201         btree_node_iter_for_each(iter, set)
202                 btree_node_iter_for_each(iter, s2)
203                         BUG_ON(set != s2 && set->end == s2->end);
204
205         /* Verify that set->end is correct: */
206         btree_node_iter_for_each(iter, set) {
207                 for_each_bset(b, t)
208                         if (set->end == t->end_offset)
209                                 goto found;
210                 BUG();
211 found:
212                 BUG_ON(set->k < btree_bkey_first_offset(t) ||
213                        set->k >= t->end_offset);
214         }
215
216         /* Verify iterator is sorted: */
217         btree_node_iter_for_each(iter, set)
218                 BUG_ON(set != iter->data &&
219                        btree_node_iter_cmp(b, set[-1], set[0]) > 0);
220
221         k = bch2_btree_node_iter_peek_all(iter, b);
222
223         for_each_bset(b, t) {
224                 if (iter->data[0].end == t->end_offset)
225                         continue;
226
227                 p = bch2_bkey_prev_all(b, t,
228                         bch2_btree_node_iter_bset_pos(iter, b, t));
229
230                 BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
231         }
232 }
233
234 void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
235                             struct bkey_packed *insert, unsigned clobber_u64s)
236 {
237         struct bset_tree *t = bch2_bkey_to_bset(b, where);
238         struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
239         struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
240 #if 0
241         BUG_ON(prev &&
242                bkey_iter_cmp(b, prev, insert) > 0);
243 #else
244         if (prev &&
245             bkey_iter_cmp(b, prev, insert) > 0) {
246                 struct bkey k1 = bkey_unpack_key(b, prev);
247                 struct bkey k2 = bkey_unpack_key(b, insert);
248                 char buf1[100];
249                 char buf2[100];
250
251                 bch2_dump_btree_node(b);
252                 bch2_bkey_to_text(&PBUF(buf1), &k1);
253                 bch2_bkey_to_text(&PBUF(buf2), &k2);
254
255                 panic("prev > insert:\n"
256                       "prev    key %s\n"
257                       "insert  key %s\n",
258                       buf1, buf2);
259         }
260 #endif
261 #if 0
262         BUG_ON(next != btree_bkey_last(b, t) &&
263                bkey_iter_cmp(b, insert, next) > 0);
264 #else
265         if (next != btree_bkey_last(b, t) &&
266             bkey_iter_cmp(b, insert, next) > 0) {
267                 struct bkey k1 = bkey_unpack_key(b, insert);
268                 struct bkey k2 = bkey_unpack_key(b, next);
269                 char buf1[100];
270                 char buf2[100];
271
272                 bch2_dump_btree_node(b);
273                 bch2_bkey_to_text(&PBUF(buf1), &k1);
274                 bch2_bkey_to_text(&PBUF(buf2), &k2);
275
276                 panic("insert > next:\n"
277                       "insert  key %s\n"
278                       "next    key %s\n",
279                       buf1, buf2);
280         }
281 #endif
282 }
283
284 #else
285
286 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
287                                                    struct btree *b) {}
288
289 #endif
290
291 /* Auxiliary search trees */
292
293 #define BFLOAT_FAILED_UNPACKED  U8_MAX
294 #define BFLOAT_FAILED           U8_MAX
295
296 struct bkey_float {
297         u8              exponent;
298         u8              key_offset;
299         u16             mantissa;
300 };
301 #define BKEY_MANTISSA_BITS      16
302
303 static unsigned bkey_float_byte_offset(unsigned idx)
304 {
305         return idx * sizeof(struct bkey_float);
306 }
307
308 struct ro_aux_tree {
309         struct bkey_float       f[0];
310 };
311
312 struct rw_aux_tree {
313         u16             offset;
314         struct bpos     k;
315 };
316
317 /*
318  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
319  * it used to be 64, but I realized the lookup code would touch slightly less
320  * memory if it was 128.
321  *
322  * It defines the number of bytes (in struct bset) per struct bkey_float in
323  * the auxiliary search tree - when we're done searching the bkey_float tree we
324  * have this many bytes left that we do a linear search over.
325  *
326  * Since (after level 5) every level of the bset_tree is on a new cacheline,
327  * we're touching one fewer cacheline in the bset tree in exchange for one more
328  * cacheline in the linear search - but the linear search might stop before it
329  * gets to the second cacheline.
330  */
331
332 #define BSET_CACHELINE          128
333
334 /* Space required for the btree node keys */
335 static inline size_t btree_keys_bytes(struct btree *b)
336 {
337         return PAGE_SIZE << b->page_order;
338 }
339
340 static inline size_t btree_keys_cachelines(struct btree *b)
341 {
342         return btree_keys_bytes(b) / BSET_CACHELINE;
343 }
344
345 static inline size_t btree_aux_data_bytes(struct btree *b)
346 {
347         return btree_keys_cachelines(b) * 8;
348 }
349
350 static inline size_t btree_aux_data_u64s(struct btree *b)
351 {
352         return btree_aux_data_bytes(b) / sizeof(u64);
353 }
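
/*
 * Worked example (hypothetical numbers): for a node of
 * PAGE_SIZE << page_order == 4096 bytes, btree_keys_cachelines() is
 * 4096 / BSET_CACHELINE == 32, so btree_aux_data_bytes() is 32 * 8 == 256
 * and btree_aux_data_u64s() is 32 - i.e. 8 bytes of auxiliary search tree
 * space per BSET_CACHELINE bytes of keys.
 */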
354
355 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
356 {
357         BUG_ON(t->aux_data_offset == U16_MAX);
358
359         switch (bset_aux_tree_type(t)) {
360         case BSET_NO_AUX_TREE:
361                 return t->aux_data_offset;
362         case BSET_RO_AUX_TREE:
363                 return t->aux_data_offset +
364                         DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
365                                      t->size * sizeof(u8), 8);
366         case BSET_RW_AUX_TREE:
367                 return t->aux_data_offset +
368                         DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
369         default:
370                 BUG();
371         }
372 }
373
374 static unsigned bset_aux_tree_buf_start(const struct btree *b,
375                                         const struct bset_tree *t)
376 {
377         return t == b->set
378                 ? DIV_ROUND_UP(b->unpack_fn_len, 8)
379                 : bset_aux_tree_buf_end(t - 1);
380 }
381
382 static void *__aux_tree_base(const struct btree *b,
383                              const struct bset_tree *t)
384 {
385         return b->aux_data + t->aux_data_offset * 8;
386 }
387
388 static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
389                                             const struct bset_tree *t)
390 {
391         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
392
393         return __aux_tree_base(b, t);
394 }
395
396 static u8 *ro_aux_tree_prev(const struct btree *b,
397                             const struct bset_tree *t)
398 {
399         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
400
401         return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
402 }
403
404 static struct bkey_float *bkey_float(const struct btree *b,
405                                      const struct bset_tree *t,
406                                      unsigned idx)
407 {
408         return ro_aux_tree_base(b, t)->f + idx;
409 }
410
411 static void bset_aux_tree_verify(struct btree *b)
412 {
413 #ifdef CONFIG_BCACHEFS_DEBUG
414         struct bset_tree *t;
415
416         for_each_bset(b, t) {
417                 if (t->aux_data_offset == U16_MAX)
418                         continue;
419
420                 BUG_ON(t != b->set &&
421                        t[-1].aux_data_offset == U16_MAX);
422
423                 BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
424                 BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
425                 BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
426         }
427 #endif
428 }
429
430 /* Memory allocation */
431
432 void bch2_btree_keys_free(struct btree *b)
433 {
434         vfree(b->aux_data);
435         b->aux_data = NULL;
436 }
437
438 #ifndef PAGE_KERNEL_EXEC
439 # define PAGE_KERNEL_EXEC PAGE_KERNEL
440 #endif
441
442 int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
443 {
444         b->page_order   = page_order;
445         b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
446                                     PAGE_KERNEL_EXEC);
447         if (!b->aux_data)
448                 return -ENOMEM;
449
450         return 0;
451 }
452
453 void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
454 {
455         unsigned i;
456
457         b->nsets                = 0;
458         memset(&b->nr, 0, sizeof(b->nr));
459 #ifdef CONFIG_BCACHEFS_DEBUG
460         b->expensive_debug_checks = expensive_debug_checks;
461 #endif
462         for (i = 0; i < MAX_BSETS; i++)
463                 b->set[i].data_offset = U16_MAX;
464
465         bch2_bset_set_no_aux_tree(b, b->set);
466 }
467
468 /* Binary tree stuff for auxiliary search trees */
469
470 /*
471  * Cacheline/offset <-> bkey pointer arithmetic:
472  *
473  * t->tree is a binary search tree in an array; each node corresponds to a key
474  * in one cacheline in t->set (BSET_CACHELINE bytes).
475  *
476  * This means we don't have to store the full index of the key that a node in
477  * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
478  * then bkey_float->key_offset gives us the offset within that cacheline, in units of 8
479  * bytes.
480  *
481  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
482  * make this work.
483  *
484  * To construct the bfloat for an arbitrary key we need to know what the key
485  * immediately preceding it is: we have to check if the two keys differ in the
486  * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
487  * of the previous key so we can walk backwards to it from t->tree[j]'s key.
488  */
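
#if 0
/*
 * Illustrative sketch (not built): the descent over an eytzinger-ordered
 * array that bset_search_tree() below performs on the bkey_float array.
 * cmp_at(n) is a hypothetical stand-in for comparing the key that node n
 * points at against the search key.
 */
static unsigned eytzinger_descend_sketch(unsigned size,
                                         int (*cmp_at)(unsigned n))
{
        unsigned n = 1;

        while (n < size)
                /* node's key < search key: go right (2n + 1), else left (2n) */
                n = n * 2 + (cmp_at(n) < 0);

        /*
         * n >> 1 is the last node visited, and the low bit of n says whether
         * we recursed left or right from it; __eytzinger1_to_inorder(n >> 1,
         * ...) recovers the cacheline for the final linear search.
         */
        return n;
}
#endif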
489
490 static inline void *bset_cacheline(const struct btree *b,
491                                    const struct bset_tree *t,
492                                    unsigned cacheline)
493 {
494         return (void *) round_down((unsigned long) btree_bkey_first(b, t),
495                                    L1_CACHE_BYTES) +
496                 cacheline * BSET_CACHELINE;
497 }
498
499 static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
500                                              const struct bset_tree *t,
501                                              unsigned cacheline,
502                                              unsigned offset)
503 {
504         return bset_cacheline(b, t, cacheline) + offset * 8;
505 }
506
507 static unsigned bkey_to_cacheline(const struct btree *b,
508                                   const struct bset_tree *t,
509                                   const struct bkey_packed *k)
510 {
511         return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
512 }
513
514 static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
515                                           const struct bset_tree *t,
516                                           unsigned cacheline,
517                                           const struct bkey_packed *k)
518 {
519         return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
520 }
521
522 static unsigned bkey_to_cacheline_offset(const struct btree *b,
523                                          const struct bset_tree *t,
524                                          unsigned cacheline,
525                                          const struct bkey_packed *k)
526 {
527         size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
528
529         EBUG_ON(m > U8_MAX);
530         return m;
531 }
532
533 static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
534                                                const struct bset_tree *t,
535                                                unsigned j)
536 {
537         return cacheline_to_bkey(b, t,
538                         __eytzinger1_to_inorder(j, t->size, t->extra),
539                         bkey_float(b, t, j)->key_offset);
540 }
541
542 static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
543                                              const struct bset_tree *t,
544                                              unsigned j)
545 {
546         unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
547
548         return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
549 }
550
551 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
552                                        const struct bset_tree *t)
553 {
554         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
555
556         return __aux_tree_base(b, t);
557 }
558
559 /*
560  * For the write set - the one we're currently inserting keys into - we don't
561  * maintain a full search tree, we just keep a simple lookup table in t->prev.
562  */
563 static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
564                                           struct bset_tree *t,
565                                           unsigned j)
566 {
567         return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
568 }
569
570 static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
571                             unsigned j, struct bkey_packed *k)
572 {
573         EBUG_ON(k >= btree_bkey_last(b, t));
574
575         rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
576                 .offset = __btree_node_key_to_offset(b, k),
577                 .k      = bkey_unpack_pos(b, k),
578         };
579 }
580
581 static void bch2_bset_verify_rw_aux_tree(struct btree *b,
582                                         struct bset_tree *t)
583 {
584         struct bkey_packed *k = btree_bkey_first(b, t);
585         unsigned j = 0;
586
587         if (!btree_keys_expensive_checks(b))
588                 return;
589
590         BUG_ON(bset_has_ro_aux_tree(t));
591
592         if (!bset_has_rw_aux_tree(t))
593                 return;
594
595         BUG_ON(t->size < 1);
596         BUG_ON(rw_aux_to_bkey(b, t, j) != k);
597
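                /* entry 0 was just checked against the first key; jump into the loop */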
598         goto start;
599         while (1) {
600                 if (rw_aux_to_bkey(b, t, j) == k) {
601                         BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
602                                         bkey_unpack_pos(b, k)));
603 start:
604                         if (++j == t->size)
605                                 break;
606
607                         BUG_ON(rw_aux_tree(b, t)[j].offset <=
608                                rw_aux_tree(b, t)[j - 1].offset);
609                 }
610
611                 k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
612                 BUG_ON(k >= btree_bkey_last(b, t));
613         }
614 }
615
616 /* returns idx of first entry >= offset: */
617 static unsigned rw_aux_tree_bsearch(struct btree *b,
618                                     struct bset_tree *t,
619                                     unsigned offset)
620 {
621         unsigned bset_offs = offset - btree_bkey_first_offset(t);
622         unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
623         unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
624
625         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
626         EBUG_ON(!t->size);
627         EBUG_ON(idx > t->size);
628
629         while (idx < t->size &&
630                rw_aux_tree(b, t)[idx].offset < offset)
631                 idx++;
632
633         while (idx &&
634                rw_aux_tree(b, t)[idx - 1].offset >= offset)
635                 idx--;
636
637         EBUG_ON(idx < t->size &&
638                 rw_aux_tree(b, t)[idx].offset < offset);
639         EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
640         EBUG_ON(idx + 1 < t->size &&
641                 rw_aux_tree(b, t)[idx].offset ==
642                 rw_aux_tree(b, t)[idx + 1].offset);
643
644         return idx;
645 }
646
647 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
648                                      const struct bkey_float *f,
649                                      unsigned idx)
650 {
651         u64 v;
652
653         EBUG_ON(!bkey_packed(k));
654
655         v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
656
657         /*
658          * In little endian, we're shifting off low bits (and then the bits we
659          * want are at the low end), in big endian we're shifting off high bits
660          * (and then the bits we want are at the high end, so we shift them
661          * back down):
662          */
663 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
664         v >>= f->exponent & 7;
665 #else
666         v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
667 #endif
668         return (u16) v;
669 }
670
671 static void make_bfloat(struct btree *b, struct bset_tree *t,
672                         unsigned j,
673                         struct bkey_packed *min_key,
674                         struct bkey_packed *max_key)
675 {
676         struct bkey_float *f = bkey_float(b, t, j);
677         struct bkey_packed *m = tree_to_bkey(b, t, j);
678         struct bkey_packed *l, *r;
679         unsigned mantissa;
680         int shift, exponent, high_bit;
681
682         if (is_power_of_2(j)) {
683                 l = min_key;
684
685                 if (!l->u64s) {
686                         if (!bkey_pack_pos(l, b->data->min_key, b)) {
687                                 struct bkey_i tmp;
688
689                                 bkey_init(&tmp.k);
690                                 tmp.k.p = b->data->min_key;
691                                 bkey_copy(l, &tmp);
692                         }
693                 }
694         } else {
695                 l = tree_to_prev_bkey(b, t, j >> ffs(j));
696
697                 EBUG_ON(m < l);
698         }
699
700         if (is_power_of_2(j + 1)) {
701                 r = max_key;
702
703                 if (!r->u64s) {
704                         if (!bkey_pack_pos(r, t->max_key, b)) {
705                                 struct bkey_i tmp;
706
707                                 bkey_init(&tmp.k);
708                                 tmp.k.p = t->max_key;
709                                 bkey_copy(r, &tmp);
710                         }
711                 }
712         } else {
713                 r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
714
715                 EBUG_ON(m > r);
716         }
717
718         /*
719          * for failed bfloats, the lookup code falls back to comparing against
720          * the original key.
721          */
722
723         if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
724             !b->nr_key_bits) {
725                 f->exponent = BFLOAT_FAILED_UNPACKED;
726                 return;
727         }
728
729         /*
730          * The greatest differing bit of l and r is the first bit we must
731          * include in the bfloat mantissa we're creating in order to do
732          * comparisons - that bit always becomes the high bit of
733          * bfloat->mantissa, and thus the exponent we're calculating here is
734          * the position of what will become the low bit in bfloat->mantissa:
735          *
736          * Note that this may be negative - we may be running off the low end
737          * of the key: we handle this later:
738          */
739         high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
740                        min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
741         exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
742
743         /*
744          * Then we calculate the actual shift value, from the start of the key
745          * (k->_data), to get the key bits starting at exponent:
746          */
747 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
748         shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
749
750         EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
751 #else
752         shift = high_bit_offset +
753                 b->nr_key_bits -
754                 exponent -
755                 BKEY_MANTISSA_BITS;
756
757         EBUG_ON(shift < KEY_PACKED_BITS_START);
758 #endif
759         EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
760
761         f->exponent = shift;
762         mantissa = bkey_mantissa(m, f, j);
763
764         /*
765          * If we've got garbage bits, set them to all 1s - it's legal for the
766          * bfloat to compare larger than the original key, but not smaller:
767          */
768         if (exponent < 0)
769                 mantissa |= ~(~0U << -exponent);
770
771         f->mantissa = mantissa;
772 }
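
#if 0
/*
 * Illustrative sketch (not built): how a bkey_float constructed above gets
 * used at lookup time.  The same BKEY_MANTISSA_BITS are extracted from the
 * packed search key at f->exponent and compared against the stored mantissa;
 * only when the bfloat failed, or the mantissas are equal and so can't
 * distinguish the keys, do we fall back to a full key comparison.  This is a
 * simplification of what bset_search_tree() below actually does; full_cmp()
 * is a hypothetical stand-in for bkey_cmp_p_or_unp().
 */
static int bfloat_cmp_sketch(const struct bkey_float *f, unsigned idx,
                             const struct bkey_packed *search,
                             int (*full_cmp)(void))
{
        unsigned l, r;

        if (f->exponent >= BFLOAT_FAILED)
                return full_cmp();

        l = f->mantissa;
        r = bkey_mantissa(search, f, idx);

        if (l != r)
                return l < r ? -1 : 1;

        return full_cmp();
}
#endif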
773
774 /* bytes remaining - only valid for last bset: */
775 static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
776 {
777         bset_aux_tree_verify(b);
778
779         return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
780 }
781
782 static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
783 {
784         return __bset_tree_capacity(b, t) /
785                 (sizeof(struct bkey_float) + sizeof(u8));
786 }
787
788 static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
789 {
790         return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
791 }
792
793 static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
794 {
795         struct bkey_packed *k;
796
797         t->size = 1;
798         t->extra = BSET_RW_AUX_TREE_VAL;
799         rw_aux_tree(b, t)[0].offset =
800                 __btree_node_key_to_offset(b, btree_bkey_first(b, t));
801
802         bset_tree_for_each_key(b, t, k) {
803                 if (t->size == bset_rw_tree_capacity(b, t))
804                         break;
805
806                 if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
807                     L1_CACHE_BYTES)
808                         rw_aux_tree_set(b, t, t->size++, k);
809         }
810 }
811
812 static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
813 {
814         struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
815         struct bkey_packed min_key, max_key;
816         unsigned j, cacheline = 1;
817
818         /* signal to make_bfloat() that they're uninitialized: */
819         min_key.u64s = max_key.u64s = 0;
820
821         t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
822                       bset_ro_tree_capacity(b, t));
823 retry:
824         if (t->size < 2) {
825                 t->size = 0;
826                 t->extra = BSET_NO_AUX_TREE_VAL;
827                 return;
828         }
829
830         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
831
832         /* First we figure out where the first key in each cacheline is */
833         eytzinger1_for_each(j, t->size) {
834                 while (bkey_to_cacheline(b, t, k) < cacheline)
835                         prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
836
837                 if (k >= btree_bkey_last(b, t)) {
838                         /* XXX: this path sucks */
839                         t->size--;
840                         goto retry;
841                 }
842
843                 ro_aux_tree_prev(b, t)[j] = prev->u64s;
844                 bkey_float(b, t, j)->key_offset =
845                         bkey_to_cacheline_offset(b, t, cacheline++, k);
846
847                 EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
848                 EBUG_ON(tree_to_bkey(b, t, j) != k);
849         }
850
851         while (k != btree_bkey_last(b, t))
852                 prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
853
854         t->max_key = bkey_unpack_pos(b, prev);
855
856         /* Then we build the tree */
857         eytzinger1_for_each(j, t->size)
858                 make_bfloat(b, t, j, &min_key, &max_key);
859 }
860
861 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
862 {
863         struct bset_tree *i;
864
865         for (i = b->set; i != t; i++)
866                 BUG_ON(bset_has_rw_aux_tree(i));
867
868         bch2_bset_set_no_aux_tree(b, t);
869
870         /* round up to next cacheline: */
871         t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
872                                       SMP_CACHE_BYTES / sizeof(u64));
873
874         bset_aux_tree_verify(b);
875 }
876
877 void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
878                              bool writeable)
879 {
880         if (writeable
881             ? bset_has_rw_aux_tree(t)
882             : bset_has_ro_aux_tree(t))
883                 return;
884
885         bset_alloc_tree(b, t);
886
887         if (!__bset_tree_capacity(b, t))
888                 return;
889
890         if (writeable)
891                 __build_rw_aux_tree(b, t);
892         else
893                 __build_ro_aux_tree(b, t);
894
895         bset_aux_tree_verify(b);
896 }
897
898 void bch2_bset_init_first(struct btree *b, struct bset *i)
899 {
900         struct bset_tree *t;
901
902         BUG_ON(b->nsets);
903
904         memset(i, 0, sizeof(*i));
905         get_random_bytes(&i->seq, sizeof(i->seq));
906         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
907
908         t = &b->set[b->nsets++];
909         set_btree_bset(b, t, i);
910 }
911
912 void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
913                          struct btree_node_entry *bne)
914 {
915         struct bset *i = &bne->keys;
916         struct bset_tree *t;
917
918         BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
919         BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
920         BUG_ON(b->nsets >= MAX_BSETS);
921
922         memset(i, 0, sizeof(*i));
923         i->seq = btree_bset_first(b)->seq;
924         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
925
926         t = &b->set[b->nsets++];
927         set_btree_bset(b, t, i);
928 }
929
930 /*
931  * find _some_ key in the same bset as @k that precedes @k - not necessarily the
932  * immediate predecessor:
933  */
934 static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
935                                        struct bkey_packed *k)
936 {
937         struct bkey_packed *p;
938         unsigned offset;
939         int j;
940
941         EBUG_ON(k < btree_bkey_first(b, t) ||
942                 k > btree_bkey_last(b, t));
943
944         if (k == btree_bkey_first(b, t))
945                 return NULL;
946
947         switch (bset_aux_tree_type(t)) {
948         case BSET_NO_AUX_TREE:
949                 p = btree_bkey_first(b, t);
950                 break;
951         case BSET_RO_AUX_TREE:
952                 j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
953
954                 do {
955                         p = j ? tree_to_bkey(b, t,
956                                         __inorder_to_eytzinger1(j--,
957                                                         t->size, t->extra))
958                               : btree_bkey_first(b, t);
959                 } while (p >= k);
960                 break;
961         case BSET_RW_AUX_TREE:
962                 offset = __btree_node_key_to_offset(b, k);
963                 j = rw_aux_tree_bsearch(b, t, offset);
964                 p = j ? rw_aux_to_bkey(b, t, j - 1)
965                       : btree_bkey_first(b, t);
966                 break;
967         }
968
969         return p;
970 }
971
972 struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
973                                           struct bset_tree *t,
974                                           struct bkey_packed *k,
975                                           unsigned min_key_type)
976 {
977         struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
978
979         while ((p = __bkey_prev(b, t, k)) && !ret) {
980                 for (i = p; i != k; i = bkey_next_skip_noops(i, k))
981                         if (i->type >= min_key_type)
982                                 ret = i;
983
984                 k = p;
985         }
986
987         if (btree_keys_expensive_checks(b)) {
988                 BUG_ON(ret >= orig_k);
989
990                 for (i = ret
991                         ? bkey_next_skip_noops(ret, orig_k)
992                         : btree_bkey_first(b, t);
993                      i != orig_k;
994                      i = bkey_next_skip_noops(i, orig_k))
995                         BUG_ON(i->type >= min_key_type);
996         }
997
998         return ret;
999 }
1000
1001 /* Insert */
1002
1003 static void rw_aux_tree_fix_invalidated_key(struct btree *b,
1004                                             struct bset_tree *t,
1005                                             struct bkey_packed *k)
1006 {
1007         unsigned offset = __btree_node_key_to_offset(b, k);
1008         unsigned j = rw_aux_tree_bsearch(b, t, offset);
1009
1010         if (j < t->size &&
1011             rw_aux_tree(b, t)[j].offset == offset)
1012                 rw_aux_tree_set(b, t, j, k);
1013
1014         bch2_bset_verify_rw_aux_tree(b, t);
1015 }
1016
1017 static void ro_aux_tree_fix_invalidated_key(struct btree *b,
1018                                             struct bset_tree *t,
1019                                             struct bkey_packed *k)
1020 {
1021         struct bkey_packed min_key, max_key;
1022         unsigned inorder, j;
1023
1024         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
1025
1026         /* signal to make_bfloat() that they're uninitialized: */
1027         min_key.u64s = max_key.u64s = 0;
1028
1029         if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
1030                 t->max_key = bkey_unpack_pos(b, k);
1031
1032                 for (j = 1; j < t->size; j = j * 2 + 1)
1033                         make_bfloat(b, t, j, &min_key, &max_key);
1034         }
1035
1036         inorder = bkey_to_cacheline(b, t, k);
1037
1038         if (inorder &&
1039             inorder < t->size) {
1040                 j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1041
1042                 if (k == tree_to_bkey(b, t, j)) {
1043                         /* Fix the node this key corresponds to */
1044                         make_bfloat(b, t, j, &min_key, &max_key);
1045
1046                         /* Children for which this key is the right boundary */
1047                         for (j = eytzinger1_left_child(j);
1048                              j < t->size;
1049                              j = eytzinger1_right_child(j))
1050                                 make_bfloat(b, t, j, &min_key, &max_key);
1051                 }
1052         }
1053
1054         if (inorder + 1 < t->size) {
1055                 j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
1056
1057                 if (k == tree_to_prev_bkey(b, t, j)) {
1058                         make_bfloat(b, t, j, &min_key, &max_key);
1059
1060                         /* Children for which this key is the left boundary */
1061                         for (j = eytzinger1_right_child(j);
1062                              j < t->size;
1063                              j = eytzinger1_left_child(j))
1064                                 make_bfloat(b, t, j, &min_key, &max_key);
1065                 }
1066         }
1067 }
1068
1069 /**
1070  * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
1071  * modified, fix any auxiliary search tree by remaking all the nodes in the
1072  * auxiliary search tree that @k corresponds to
1073  */
1074 void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
1075 {
1076         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1077
1078         switch (bset_aux_tree_type(t)) {
1079         case BSET_NO_AUX_TREE:
1080                 break;
1081         case BSET_RO_AUX_TREE:
1082                 ro_aux_tree_fix_invalidated_key(b, t, k);
1083                 break;
1084         case BSET_RW_AUX_TREE:
1085                 rw_aux_tree_fix_invalidated_key(b, t, k);
1086                 break;
1087         }
1088 }
1089
1090 static void bch2_bset_fix_lookup_table(struct btree *b,
1091                                        struct bset_tree *t,
1092                                        struct bkey_packed *_where,
1093                                        unsigned clobber_u64s,
1094                                        unsigned new_u64s)
1095 {
1096         int shift = new_u64s - clobber_u64s;
1097         unsigned l, j, where = __btree_node_key_to_offset(b, _where);
1098
1099         EBUG_ON(bset_has_ro_aux_tree(t));
1100
1101         if (!bset_has_rw_aux_tree(t))
1102                 return;
1103
1104         /* returns first entry >= where */
1105         l = rw_aux_tree_bsearch(b, t, where);
1106
1107         if (!l) /* never delete first entry */
1108                 l++;
1109         else if (l < t->size &&
1110                  where < t->end_offset &&
1111                  rw_aux_tree(b, t)[l].offset == where)
1112                 rw_aux_tree_set(b, t, l++, _where);
1113
1114         /* l now > where */
1115
1116         for (j = l;
1117              j < t->size &&
1118              rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
1119              j++)
1120                 ;
1121
1122         if (j < t->size &&
1123             rw_aux_tree(b, t)[j].offset + shift ==
1124             rw_aux_tree(b, t)[l - 1].offset)
1125                 j++;
1126
1127         memmove(&rw_aux_tree(b, t)[l],
1128                 &rw_aux_tree(b, t)[j],
1129                 (void *) &rw_aux_tree(b, t)[t->size] -
1130                 (void *) &rw_aux_tree(b, t)[j]);
1131         t->size -= j - l;
1132
1133         for (j = l; j < t->size; j++)
1134                rw_aux_tree(b, t)[j].offset += shift;
1135
1136         EBUG_ON(l < t->size &&
1137                 rw_aux_tree(b, t)[l].offset ==
1138                 rw_aux_tree(b, t)[l - 1].offset);
1139
1140         if (t->size < bset_rw_tree_capacity(b, t) &&
1141             (l < t->size
1142              ? rw_aux_tree(b, t)[l].offset
1143              : t->end_offset) -
1144             rw_aux_tree(b, t)[l - 1].offset >
1145             L1_CACHE_BYTES / sizeof(u64)) {
1146                 struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
1147                 struct bkey_packed *end = l < t->size
1148                         ? rw_aux_to_bkey(b, t, l)
1149                         : btree_bkey_last(b, t);
1150                 struct bkey_packed *k = start;
1151
1152                 while (1) {
1153                         k = bkey_next_skip_noops(k, end);
1154                         if (k == end)
1155                                 break;
1156
1157                         if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
1158                                 memmove(&rw_aux_tree(b, t)[l + 1],
1159                                         &rw_aux_tree(b, t)[l],
1160                                         (void *) &rw_aux_tree(b, t)[t->size] -
1161                                         (void *) &rw_aux_tree(b, t)[l]);
1162                                 t->size++;
1163                                 rw_aux_tree_set(b, t, l, k);
1164                                 break;
1165                         }
1166                 }
1167         }
1168
1169         bch2_bset_verify_rw_aux_tree(b, t);
1170         bset_aux_tree_verify(b);
1171 }
1172
1173 void bch2_bset_insert(struct btree *b,
1174                       struct btree_node_iter *iter,
1175                       struct bkey_packed *where,
1176                       struct bkey_i *insert,
1177                       unsigned clobber_u64s)
1178 {
1179         struct bkey_format *f = &b->format;
1180         struct bset_tree *t = bset_tree_last(b);
1181         struct bkey_packed packed, *src = bkey_to_packed(insert);
1182
1183         bch2_bset_verify_rw_aux_tree(b, t);
1184         bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
1185
1186         if (bch2_bkey_pack_key(&packed, &insert->k, f))
1187                 src = &packed;
1188
1189         if (!bkey_whiteout(&insert->k))
1190                 btree_keys_account_key_add(&b->nr, t - b->set, src);
1191
1192         if (src->u64s != clobber_u64s) {
1193                 u64 *src_p = where->_data + clobber_u64s;
1194                 u64 *dst_p = where->_data + src->u64s;
1195
1196                 EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
1197                         (int) clobber_u64s - src->u64s);
1198
1199                 memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1200                 le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
1201                 set_btree_bset_end(b, t);
1202         }
1203
1204         memcpy_u64s(where, src,
1205                     bkeyp_key_u64s(f, src));
1206         memcpy_u64s(bkeyp_val(f, where), &insert->v,
1207                     bkeyp_val_u64s(f, src));
1208
1209         if (src->u64s != clobber_u64s)
1210                 bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
1211
1212         bch2_verify_btree_nr_keys(b);
1213 }
1214
1215 void bch2_bset_delete(struct btree *b,
1216                       struct bkey_packed *where,
1217                       unsigned clobber_u64s)
1218 {
1219         struct bset_tree *t = bset_tree_last(b);
1220         u64 *src_p = where->_data + clobber_u64s;
1221         u64 *dst_p = where->_data;
1222
1223         bch2_bset_verify_rw_aux_tree(b, t);
1224
1225         EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
1226
1227         memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1228         le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
1229         set_btree_bset_end(b, t);
1230
1231         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
1232 }
1233
1234 /* Lookup */
1235
1236 __flatten
1237 static struct bkey_packed *bset_search_write_set(const struct btree *b,
1238                                 struct bset_tree *t,
1239                                 struct bpos *search,
1240                                 const struct bkey_packed *packed_search)
1241 {
1242         unsigned l = 0, r = t->size;
1243
1244         while (l + 1 != r) {
1245                 unsigned m = (l + r) >> 1;
1246
1247                 if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
1248                         l = m;
1249                 else
1250                         r = m;
1251         }
1252
1253         return rw_aux_to_bkey(b, t, l);
1254 }
1255
1256 static inline void prefetch_four_cachelines(void *p)
1257 {
1258 #ifdef CONFIG_X86_64
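        /*
         * The +127/-127 folding keeps every prefetch displacement within a
         * signed byte so the encoded instructions stay short (assumption about
         * intent; the effect is simply prefetching p, p + 64, p + 128, p + 192).
         */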
1259         asm(".intel_syntax noprefix;"
1260             "prefetcht0 [%0 - 127 + 64 * 0];"
1261             "prefetcht0 [%0 - 127 + 64 * 1];"
1262             "prefetcht0 [%0 - 127 + 64 * 2];"
1263             "prefetcht0 [%0 - 127 + 64 * 3];"
1264             ".att_syntax prefix;"
1265             :
1266             : "r" (p + 127));
1267 #else
1268         prefetch(p + L1_CACHE_BYTES * 0);
1269         prefetch(p + L1_CACHE_BYTES * 1);
1270         prefetch(p + L1_CACHE_BYTES * 2);
1271         prefetch(p + L1_CACHE_BYTES * 3);
1272 #endif
1273 }
1274
1275 static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
1276                                               const struct bkey_float *f,
1277                                               unsigned idx)
1278 {
1279 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1280         unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
1281
1282         return f->exponent > key_bits_start;
1283 #else
1284         unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
1285
1286         return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
1287 #endif
1288 }
1289
1290 __flatten
1291 static struct bkey_packed *bset_search_tree(const struct btree *b,
1292                                 struct bset_tree *t,
1293                                 struct bpos *search,
1294                                 const struct bkey_packed *packed_search)
1295 {
1296         struct ro_aux_tree *base = ro_aux_tree_base(b, t);
1297         struct bkey_float *f;
1298         struct bkey_packed *k;
1299         unsigned inorder, n = 1, l, r;
1300         int cmp;
1301
1302         do {
1303                 if (likely(n << 4 < t->size))
1304                         prefetch(&base->f[n << 4]);
1305
1306                 f = &base->f[n];
1307
1308                 if (!unlikely(packed_search))
1309                         goto slowpath;
1310                 if (unlikely(f->exponent >= BFLOAT_FAILED))
1311                         goto slowpath;
1312
1313                 l = f->mantissa;
1314                 r = bkey_mantissa(packed_search, f, n);
1315
1316                 if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
1317                         goto slowpath;
1318
1319                 n = n * 2 + (l < r);
1320                 continue;
1321 slowpath:
1322                 k = tree_to_bkey(b, t, n);
1323                 cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
1324                 if (!cmp)
1325                         return k;
1326
1327                 n = n * 2 + (cmp < 0);
1328         } while (n < t->size);
1329
1330         inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
1331
1332         /*
1333          * n would have been the node we recursed to - the low bit tells us if
1334          * we recursed left or recursed right.
1335          */
1336         if (likely(!(n & 1))) {
1337                 --inorder;
1338                 if (unlikely(!inorder))
1339                         return btree_bkey_first(b, t);
1340
1341                 f = &base->f[eytzinger1_prev(n >> 1, t->size)];
1342         }
1343
1344         return cacheline_to_bkey(b, t, inorder, f->key_offset);
1345 }
1346
1347 static __always_inline __flatten
1348 struct bkey_packed *__bch2_bset_search(struct btree *b,
1349                                 struct bset_tree *t,
1350                                 struct bpos *search,
1351                                 const struct bkey_packed *lossy_packed_search)
1352 {
1353
1354         /*
1355          * First, we search for a cacheline, then we do a linear search
1356          * within that cacheline.
1357          *
1358          * To search for the cacheline, there are three different possibilities:
1359          *  * The set is too small to have a search tree, so we just do a linear
1360          *    search over the whole set.
1361          *  * The set is the one we're currently inserting into; keeping a full
1362          *    auxiliary search tree up to date would be too expensive, so we
1363          *    use a much simpler lookup table to do a binary search -
1364          *    bset_search_write_set().
1365          *  * Or we use the auxiliary search tree we constructed earlier -
1366          *    bset_search_tree()
1367          */
1368
1369         switch (bset_aux_tree_type(t)) {
1370         case BSET_NO_AUX_TREE:
1371                 return btree_bkey_first(b, t);
1372         case BSET_RW_AUX_TREE:
1373                 return bset_search_write_set(b, t, search, lossy_packed_search);
1374         case BSET_RO_AUX_TREE:
1375                 /*
1376                  * Each node in the auxiliary search tree covers a certain range
1377                  * of bits, and keys above and below the set it covers might
1378                  * differ outside those bits - so we have to special case the
1379                  * start and end - handle that here:
1380                  */
1381
1382                 if (bkey_cmp(*search, t->max_key) > 0)
1383                         return btree_bkey_last(b, t);
1384
1385                 return bset_search_tree(b, t, search, lossy_packed_search);
1386         default:
1387                 unreachable();
1388         }
1389 }
1390
1391 static __always_inline __flatten
1392 struct bkey_packed *bch2_bset_search_linear(struct btree *b,
1393                                 struct bset_tree *t,
1394                                 struct bpos *search,
1395                                 struct bkey_packed *packed_search,
1396                                 const struct bkey_packed *lossy_packed_search,
1397                                 struct bkey_packed *m)
1398 {
1399         if (lossy_packed_search)
1400                 while (m != btree_bkey_last(b, t) &&
1401                        bkey_iter_cmp_p_or_unp(b, m,
1402                                         lossy_packed_search, search) < 0)
1403                         m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
1404
1405         if (!packed_search)
1406                 while (m != btree_bkey_last(b, t) &&
1407                        bkey_iter_pos_cmp(b, m, search) < 0)
1408                         m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
1409
1410         if (btree_keys_expensive_checks(b)) {
1411                 struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
1412
1413                 BUG_ON(prev &&
1414                        bkey_iter_cmp_p_or_unp(b, prev,
1415                                         packed_search, search) >= 0);
1416         }
1417
1418         return m;
1419 }
1420
1421 /*
1422  * Returns the first key greater than or equal to @search
1423  */
1424 static __always_inline __flatten
1425 struct bkey_packed *bch2_bset_search(struct btree *b,
1426                                 struct bset_tree *t,
1427                                 struct bpos *search,
1428                                 struct bkey_packed *packed_search,
1429                                 const struct bkey_packed *lossy_packed_search)
1430 {
1431         struct bkey_packed *m = __bch2_bset_search(b, t, search,
1432                                                    lossy_packed_search);
1433
1434         return bch2_bset_search_linear(b, t, search,
1435                                  packed_search, lossy_packed_search, m);
1436 }
1437
1438 /* Btree node iterator */
1439
1440 static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
1441                               struct btree *b,
1442                               const struct bkey_packed *k,
1443                               const struct bkey_packed *end)
1444 {
1445         if (k != end) {
1446                 struct btree_node_iter_set *pos;
1447
1448                 btree_node_iter_for_each(iter, pos)
1449                         ;
1450
1451                 BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
1452                 *pos = (struct btree_node_iter_set) {
1453                         __btree_node_key_to_offset(b, k),
1454                         __btree_node_key_to_offset(b, end)
1455                 };
1456         }
1457 }
1458
1459 void bch2_btree_node_iter_push(struct btree_node_iter *iter,
1460                                struct btree *b,
1461                                const struct bkey_packed *k,
1462                                const struct bkey_packed *end)
1463 {
1464         __bch2_btree_node_iter_push(iter, b, k, end);
1465         bch2_btree_node_iter_sort(iter, b);
1466 }
1467
1468 noinline __flatten __attribute__((cold))
1469 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
1470                               struct btree *b, struct bpos *search)
1471 {
1472         struct bset_tree *t;
1473
1474         trace_bkey_pack_pos_fail(search);
1475
1476         for_each_bset(b, t)
1477                 __bch2_btree_node_iter_push(iter, b,
1478                         bch2_bset_search(b, t, search, NULL, NULL),
1479                         btree_bkey_last(b, t));
1480
1481         bch2_btree_node_iter_sort(iter, b);
1482 }
1483
1484 /**
1485  * bch2_btree_node_iter_init() - initialize a btree node iterator, starting from a
1486  * given position
1487  *
1488  * Main entry point to the lookup code for individual btree nodes:
1489  *
1490  * NOTE:
1491  *
1492  * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
1493  * keys. This doesn't matter for most code, but it does matter for lookups.
1494  *
1495  * Consider some adjacent keys containing a string of equal keys:
1496  *      i j k k k k l m
1497  *
1498  * If you search for k, the lookup code isn't guaranteed to return you any
1499  * specific k. The lookup code is conceptually doing a binary search, and
1500  * iterating backwards is very expensive, so if the pivot happens to land at
1501  * the last k, that's what you'll get.
1502  *
1503  * This works out ok, but it's something to be aware of:
1504  *
1505  *  - For non extents, we guarantee that the live key comes last - see
1506  *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
1507  *    see will only be deleted keys you don't care about.
1508  *
1509  *  - For extents, deleted keys sort last (see the comment at the top of this
1510  *    file). But when you're searching for extents, you actually want the first
1511  *    key strictly greater than your search key - an extent that compares equal
1512  *    to the search key is going to have 0 sectors after the search key.
1513  *
1514  *    But this does mean that we can't just search for
1515  *    bkey_successor(start_of_range) to get the first extent that overlaps with
1516  *    the range we want: if we're unlucky and there's an extent that ends
1517  *    exactly where we searched, there could also be a deleted key at that same
1518  *    position, and the search would return it instead of the extent we
1519  *    actually needed.
1520  *
1521  *    So we've got to search for start_of_range, then after the lookup iterate
1522  *    past any extents that compare equal to the position we searched for.
1523  */
1524 __flatten
1525 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
1526                                struct btree *b, struct bpos *search)
1527 {
1528         struct bkey_packed p, *packed_search = NULL;
1529         struct btree_node_iter_set *pos = iter->data;
1530         struct bkey_packed *k[MAX_BSETS];
1531         unsigned i;
1532
1533         EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
1534         bset_aux_tree_verify(b);
1535
1536         memset(iter, 0, sizeof(*iter));
1537
1538         switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
1539         case BKEY_PACK_POS_EXACT:
1540                 packed_search = &p;
1541                 break;
1542         case BKEY_PACK_POS_SMALLER:
1543                 packed_search = NULL;
1544                 break;
1545         case BKEY_PACK_POS_FAIL:
1546                 btree_node_iter_init_pack_failed(iter, b, search);
1547                 return;
1548         }
1549
1550         for (i = 0; i < b->nsets; i++) {
1551                 k[i] = __bch2_bset_search(b, b->set + i, search, &p);
1552                 prefetch_four_cachelines(k[i]);
1553         }
1554
1555         for (i = 0; i < b->nsets; i++) {
1556                 struct bset_tree *t = b->set + i;
1557                 struct bkey_packed *end = btree_bkey_last(b, t);
1558
1559                 k[i] = bch2_bset_search_linear(b, t, search,
1560                                                packed_search, &p, k[i]);
1561                 if (k[i] != end)
1562                         *pos++ = (struct btree_node_iter_set) {
1563                                 __btree_node_key_to_offset(b, k[i]),
1564                                 __btree_node_key_to_offset(b, end)
1565                         };
1566         }
1567
1568         bch2_btree_node_iter_sort(iter, b);
1569 }
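
/*
 * A minimal usage sketch (not part of the original file): how a caller might
 * do the extent lookup described in the comment above, searching for the
 * start of the range and then stepping past keys that compare equal to the
 * search position. example_extent_lookup() is a hypothetical helper name.
 */
#if 0
static struct bkey_packed *example_extent_lookup(struct btree *b,
                                                 struct bpos search)
{
        struct btree_node_iter iter;
        struct bkey_packed *k;

        bch2_btree_node_iter_init(&iter, b, &search);

        while ((k = bch2_btree_node_iter_peek(&iter, b))) {
                struct bkey u = bkey_unpack_key(b, k);

                /* keys at the search position are 0 size extents/deleted: */
                if (bkey_cmp(u.p, search) > 0)
                        break;

                bch2_btree_node_iter_advance(&iter, b);
        }

        return k;
}
#endif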
1570
1571 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1572                                           struct btree *b)
1573 {
1574         struct bset_tree *t;
1575
1576         memset(iter, 0, sizeof(*iter));
1577
1578         for_each_bset(b, t)
1579                 __bch2_btree_node_iter_push(iter, b,
1580                                            btree_bkey_first(b, t),
1581                                            btree_bkey_last(b, t));
1582         bch2_btree_node_iter_sort(iter, b);
1583 }
1584
1585 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1586                                                   struct btree *b,
1587                                                   struct bset_tree *t)
1588 {
1589         struct btree_node_iter_set *set;
1590
1591         btree_node_iter_for_each(iter, set)
1592                 if (set->end == t->end_offset)
1593                         return __btree_node_offset_to_key(b, set->k);
1594
1595         return btree_bkey_last(b, t);
1596 }
1597
1598 static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
1599                                             struct btree *b,
1600                                             unsigned first)
1601 {
1602         bool ret;
1603
1604         if ((ret = (btree_node_iter_cmp(b,
1605                                         iter->data[first],
1606                                         iter->data[first + 1]) > 0)))
1607                 swap(iter->data[first], iter->data[first + 1]);
1608         return ret;
1609 }
1610
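/*
 * The iterator holds at most MAX_BSETS sets, so restoring full sort order
 * needs no more than the three compare-and-swaps below:
 */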
1611 void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
1612                                struct btree *b)
1613 {
1614         /* unrolled bubble sort: */
1615
1616         if (!__btree_node_iter_set_end(iter, 2)) {
1617                 btree_node_iter_sort_two(iter, b, 0);
1618                 btree_node_iter_sort_two(iter, b, 1);
1619         }
1620
1621         if (!__btree_node_iter_set_end(iter, 1))
1622                 btree_node_iter_sort_two(iter, b, 0);
1623 }
1624
1625 void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
1626                                    struct btree_node_iter_set *set)
1627 {
1628         struct btree_node_iter_set *last =
1629                 iter->data + ARRAY_SIZE(iter->data) - 1;
1630
1631         memmove(&set[0], &set[1], (void *) last - (void *) set);
1632         *last = (struct btree_node_iter_set) { 0, 0 };
1633 }
1634
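/*
 * Advance past the iterator's current key: bump data[0] past it, skip any
 * zero u64s "noop" keys, drop the set entirely if it's now exhausted, and
 * otherwise restore sort order (at most two compare-and-swaps, since only
 * data[0] moved):
 */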
1635 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1636                                                   struct btree *b)
1637 {
1638         iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
1639
1640         EBUG_ON(iter->data->k > iter->data->end);
1641
1642         while (!__btree_node_iter_set_end(iter, 0) &&
1643                !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
1644                 iter->data->k++;
1645
1646         if (unlikely(__btree_node_iter_set_end(iter, 0))) {
1647                 bch2_btree_node_iter_set_drop(iter, iter->data);
1648                 return;
1649         }
1650
1651         if (__btree_node_iter_set_end(iter, 1))
1652                 return;
1653
1654         if (!btree_node_iter_sort_two(iter, b, 0))
1655                 return;
1656
1657         if (__btree_node_iter_set_end(iter, 2))
1658                 return;
1659
1660         btree_node_iter_sort_two(iter, b, 1);
1661 }
1662
1663 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1664                                   struct btree *b)
1665 {
1666         if (btree_keys_expensive_checks(b)) {
1667                 bch2_btree_node_iter_verify(iter, b);
1668                 bch2_btree_node_iter_next_check(iter, b);
1669         }
1670
1671         __bch2_btree_node_iter_advance(iter, b);
1672 }
1673
1674 /*
1675  * Expensive: searches every bset for the key preceding the iterator position:
1676  */
1677 struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
1678                                                   struct btree *b)
1679 {
1680         struct bkey_packed *k, *prev = NULL;
1681         struct btree_node_iter_set *set;
1682         struct bset_tree *t;
1683         unsigned end = 0;
1684
1685         bch2_btree_node_iter_verify(iter, b);
1686
1687         for_each_bset(b, t) {
1688                 k = bch2_bkey_prev_all(b, t,
1689                         bch2_btree_node_iter_bset_pos(iter, b, t));
1690                 if (k &&
1691                     (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
1692                         prev = k;
1693                         end = t->end_offset;
1694                 }
1695         }
1696
1697         if (!prev)
1698                 return NULL;
1699
1700         /*
1701          * We're manually memmoving instead of just calling sort() to ensure the
1702          * prev we picked ends up in slot 0 - sort won't necessarily put it
1703          * there because of duplicate deleted keys:
1704          */
1705         btree_node_iter_for_each(iter, set)
1706                 if (set->end == end)
1707                         goto found;
1708
1709         BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
1710 found:
1711         BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
1712
1713         memmove(&iter->data[1],
1714                 &iter->data[0],
1715                 (void *) set - (void *) &iter->data[0]);
1716
1717         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1718         iter->data[0].end = end;
1719
1720         bch2_btree_node_iter_verify(iter, b);
1721         return prev;
1722 }
1723
1724 struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
1725                                                      struct btree *b,
1726                                                      unsigned min_key_type)
1727 {
1728         struct bkey_packed *prev;
1729
1730         do {
1731                 prev = bch2_btree_node_iter_prev_all(iter, b);
1732         } while (prev && prev->type < min_key_type);
1733
1734         return prev;
1735 }
1736
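/* peek the iterator's current key and unpack it, using *u to hold the unpacked key: */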
1737 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1738                                                  struct btree *b,
1739                                                  struct bkey *u)
1740 {
1741         struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
1742
1743         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1744 }
1745
1746 /* Mergesort */
1747
1748 void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1749 {
1750         struct bset_tree *t;
1751
1752         for_each_bset(b, t) {
1753                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1754                 size_t j;
1755
1756                 stats->sets[type].nr++;
1757                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1758                         sizeof(u64);
1759
1760                 if (bset_has_ro_aux_tree(t)) {
1761                         stats->floats += t->size - 1;
1762
1763                         for (j = 1; j < t->size; j++)
1764                                 stats->failed +=
1765                                         bkey_float(b, t, j)->exponent ==
1766                                         BFLOAT_FAILED;
1767                 }
1768         }
1769 }
1770
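/*
 * Debug helper: if @k has an entry in the ro aux (lookup) tree whose bkey
 * float failed to pack, print it:
 */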
1771 void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
1772                          struct bkey_packed *k)
1773 {
1774         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1775         struct bkey uk;
1776         unsigned j, inorder;
1777
1778         if (out->pos != out->end)
1779                 *out->pos = '\0';
1780
1781         if (!bset_has_ro_aux_tree(t))
1782                 return;
1783
1784         inorder = bkey_to_cacheline(b, t, k);
1785         if (!inorder || inorder >= t->size)
1786                 return;
1787
1788         j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1789         if (k != tree_to_bkey(b, t, j))
1790                 return;
1791
1792         switch (bkey_float(b, t, j)->exponent) {
1793         case BFLOAT_FAILED:
1794                 uk = bkey_unpack_key(b, k);
1795                 pr_buf(out,
1796                        "    failed unpacked at depth %u\n"
1797                        "\t%llu:%llu\n",
1798                        ilog2(j),
1799                        uk.p.inode, uk.p.offset);
1800                 break;
1801         }
1802 }