1 /*
2  * Code for working with individual keys, and sorted sets of keys within a
3  * btree node
4  *
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "btree_cache.h"
10 #include "bset.h"
11 #include "eytzinger.h"
12 #include "util.h"
13
14 #include <asm/unaligned.h>
15 #include <linux/dynamic_fault.h>
16 #include <linux/console.h>
17 #include <linux/random.h>
18 #include <linux/prefetch.h>
19
20 /* hack.. */
21 #include "alloc_types.h"
22 #include <trace/events/bcachefs.h>
23
24 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
25                                                   struct btree *);
26
27 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
28 {
29         unsigned offset = __btree_node_key_to_offset(b, k);
30         struct bset_tree *t;
31
32         for_each_bset(b, t)
33                 if (offset <= t->end_offset) {
34                         EBUG_ON(offset < btree_bkey_first_offset(t));
35                         return t;
36                 }
37
38         BUG();
39 }
40
41 /*
42  * There are never duplicate live keys in the btree - but once we include keys
43  * that have been flagged as deleted (and will be cleaned up later) we _will_
44  * see duplicates.
45  *
46  * Thus the sort order is: usual key comparison first, but for keys that compare
47  * equal the deleted key(s) come first, and the (at most one) live version comes
48  * last.
49  *
50  * The main reason for this is insertion: to handle overwrites, we first iterate
51  * over keys that compare equal to our insert key, and then insert immediately
52  * prior to the first key greater than the key we're inserting - our insert
53  * position will be after all keys that compare equal to our insert key, which
54  * by the time we actually do the insert will all be deleted.
55  */
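/*
 * A small illustration of that ordering (hypothetical keys, not taken from a
 * real node): for three versions of a key at position P, a bset might hold
 *
 *	... key(P-1, live)  key(P, deleted)  key(P, deleted)  key(P, live)  key(P+1, live) ...
 *
 * and an insert at position P goes immediately before key(P+1, live), i.e.
 * after everything that compares equal to it - by the time the insert is
 * actually done, the older versions at P will all have been marked deleted.
 */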
56
57 void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
58 {
59         struct bkey_packed *_k, *_n;
60         struct bkey k, n;
61         char buf[120];
62
63         if (!i->u64s)
64                 return;
65
66         for (_k = i->start, k = bkey_unpack_key(b, _k);
67              _k < vstruct_last(i);
68              _k = _n, k = n) {
69                 _n = bkey_next(_k);
70
71                 bch2_bkey_to_text(&PBUF(buf), &k);
72                 printk(KERN_ERR "block %u key %5u: %s\n", set,
73                        __btree_node_key_to_offset(b, _k), buf);
74
75                 if (_n == vstruct_last(i))
76                         continue;
77
78                 n = bkey_unpack_key(b, _n);
79
80                 if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
81                         printk(KERN_ERR "Key skipped backwards\n");
82                         continue;
83                 }
84
85                 /*
86                  * Weird check for duplicate non-extent keys: extents are
87                  * deleted iff they have 0 size, so if a key has zero size and
88                  * isn't deleted, these aren't extents:
89                  */
90                 if (((!k.size && !bkey_deleted(&k)) ||
91                      (!n.size && !bkey_deleted(&n))) &&
92                     !bkey_deleted(&k) &&
93                     !bkey_cmp(n.p, k.p))
94                         printk(KERN_ERR "Duplicate keys\n");
95         }
96 }
97
98 void bch2_dump_btree_node(struct btree *b)
99 {
100         struct bset_tree *t;
101
102         console_lock();
103         for_each_bset(b, t)
104                 bch2_dump_bset(b, bset(b, t), t - b->set);
105         console_unlock();
106 }
107
108 void bch2_dump_btree_node_iter(struct btree *b,
109                               struct btree_node_iter *iter)
110 {
111         struct btree_node_iter_set *set;
112
113         printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
114
115         btree_node_iter_for_each(iter, set) {
116                 struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
117                 struct bset_tree *t = bch2_bkey_to_bset(b, k);
118                 struct bkey uk = bkey_unpack_key(b, k);
119                 char buf[100];
120
121                 bch2_bkey_to_text(&PBUF(buf), &uk);
122                 printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
123                        k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
124         }
125 }
126
127 #ifdef CONFIG_BCACHEFS_DEBUG
128
129 void __bch2_verify_btree_nr_keys(struct btree *b)
130 {
131         struct bset_tree *t;
132         struct bkey_packed *k;
133         struct btree_nr_keys nr = { 0 };
134
135         for_each_bset(b, t)
136                 for (k = btree_bkey_first(b, t);
137                      k != btree_bkey_last(b, t);
138                      k = bkey_next(k))
139                         if (!bkey_whiteout(k))
140                                 btree_keys_account_key_add(&nr, t - b->set, k);
141
142         BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
143 }
144
145 static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
146                                             struct btree *b)
147 {
148         struct btree_node_iter iter = *_iter;
149         const struct bkey_packed *k, *n;
150
151         k = bch2_btree_node_iter_peek_all(&iter, b);
152         __bch2_btree_node_iter_advance(&iter, b);
153         n = bch2_btree_node_iter_peek_all(&iter, b);
154
155         bkey_unpack_key(b, k);
156
157         if (n &&
158             bkey_iter_cmp(b, k, n) > 0) {
159                 struct btree_node_iter_set *set;
160                 struct bkey ku = bkey_unpack_key(b, k);
161                 struct bkey nu = bkey_unpack_key(b, n);
162                 char buf1[80], buf2[80];
163
164                 bch2_dump_btree_node(b);
165                 bch2_bkey_to_text(&PBUF(buf1), &ku);
166                 bch2_bkey_to_text(&PBUF(buf2), &nu);
167                 printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
168                        buf1, buf2);
169                 printk(KERN_ERR "iter was:");
170
171                 btree_node_iter_for_each(_iter, set) {
172                         struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
173                         struct bset_tree *t = bch2_bkey_to_bset(b, k);
174                         printk(" [%zi %zi]", t - b->set,
175                                k->_data - bset(b, t)->_data);
176                 }
177                 panic("\n");
178         }
179 }
180
181 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
182                                  struct btree *b)
183 {
184         struct btree_node_iter_set *set, *s2;
185         struct bset_tree *t;
186
187         /* Verify no duplicates: */
188         btree_node_iter_for_each(iter, set)
189                 btree_node_iter_for_each(iter, s2)
190                         BUG_ON(set != s2 && set->end == s2->end);
191
192         /* Verify that set->end is correct: */
193         btree_node_iter_for_each(iter, set) {
194                 for_each_bset(b, t)
195                         if (set->end == t->end_offset)
196                                 goto found;
197                 BUG();
198 found:
199                 BUG_ON(set->k < btree_bkey_first_offset(t) ||
200                        set->k >= t->end_offset);
201         }
202
203         /* Verify iterator is sorted: */
204         btree_node_iter_for_each(iter, set)
205                 BUG_ON(set != iter->data &&
206                        btree_node_iter_cmp(b, set[-1], set[0]) > 0);
207 }
208
209 void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
210                             struct bkey_packed *insert, unsigned clobber_u64s)
211 {
212         struct bset_tree *t = bch2_bkey_to_bset(b, where);
213         struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
214         struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
215 #if 0
216         BUG_ON(prev &&
217                bkey_iter_cmp(b, prev, insert) > 0);
218 #else
219         if (prev &&
220             bkey_iter_cmp(b, prev, insert) > 0) {
221                 struct bkey k1 = bkey_unpack_key(b, prev);
222                 struct bkey k2 = bkey_unpack_key(b, insert);
223                 char buf1[100];
224                 char buf2[100];
225
226                 bch2_dump_btree_node(b);
227                 bch2_bkey_to_text(&PBUF(buf1), &k1);
228                 bch2_bkey_to_text(&PBUF(buf2), &k2);
229
230                 panic("prev > insert:\n"
231                       "prev    key %5u %s\n"
232                       "insert  key %5u %s\n",
233                        __btree_node_key_to_offset(b, prev), buf1,
234                        __btree_node_key_to_offset(b, insert), buf2);
235         }
236 #endif
237 #if 0
238         BUG_ON(next != btree_bkey_last(b, t) &&
239                bkey_iter_cmp(b, insert, next) > 0);
240 #else
241         if (next != btree_bkey_last(b, t) &&
242             bkey_iter_cmp(b, insert, next) > 0) {
243                 struct bkey k1 = bkey_unpack_key(b, insert);
244                 struct bkey k2 = bkey_unpack_key(b, next);
245                 char buf1[100];
246                 char buf2[100];
247
248                 bch2_dump_btree_node(b);
249                 bch2_bkey_to_text(&PBUF(buf1), &k1);
250                 bch2_bkey_to_text(&PBUF(buf2), &k2);
251
252                 panic("insert > next:\n"
253                       "insert  key %5u %s\n"
254                       "next    key %5u %s\n",
255                        __btree_node_key_to_offset(b, insert), buf1,
256                        __btree_node_key_to_offset(b, next), buf2);
257         }
258 #endif
259 }
260
261 #else
262
263 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
264                                                    struct btree *b) {}
265
266 #endif
267
268 /* Auxiliary search trees */
269
270 #define BFLOAT_FAILED_UNPACKED  (U8_MAX - 0)
271 #define BFLOAT_FAILED_PREV      (U8_MAX - 1)
272 #define BFLOAT_FAILED_OVERFLOW  (U8_MAX - 2)
273 #define BFLOAT_FAILED           (U8_MAX - 2)
274
275 #define KEY_WORDS               BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
276
277 struct bkey_float {
278         u8              exponent;
279         u8              key_offset;
280         union {
281                 u32     mantissa32;
282         struct {
283                 u16     mantissa16;
284                 u16     _pad;
285         };
286         };
287 } __packed;
288
289 #define BFLOAT_32BIT_NR         32U
290
291 static unsigned bkey_float_byte_offset(unsigned idx)
292 {
293         int d = (idx - BFLOAT_32BIT_NR) << 1;
294
295         d &= ~(d >> 31);
296
297         return idx * 6 - d;
298 }
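/*
 * A worked example of the offset calculation above (the values follow
 * directly from the 6 byte packed bkey_float and BFLOAT_32BIT_NR == 32
 * defined above):
 *
 *	idx < 32:  d is negative, (d >> 31) is all ones, so d is clamped to 0
 *		   and the entry lives at idx * 6 - the first 32 floats keep
 *		   their full 32 bit mantissa.
 *
 *	idx = 32:  d = 0,  offset = 192
 *	idx = 33:  d = 2,  offset = 196
 *	idx = 34:  d = 4,  offset = 200
 *
 * i.e. every float past the first 32 is packed at 4 bytes, reclaiming the
 * space its unused mantissa32/_pad would have taken.
 */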
299
300 struct ro_aux_tree {
301         struct bkey_float       _d[0];
302 };
303
304 struct rw_aux_tree {
305         u16             offset;
306         struct bpos     k;
307 };
308
309 /*
310  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
311  * it used to be 64, but I realized the lookup code would touch slightly less
312  * memory if it was 128.
313  *
314  * It defines the number of bytes (in struct bset) per struct bkey_float in
315  * the auxiliary search tree - when we're done searching the bkey_float tree we
316  * have this many bytes left that we do a linear search over.
317  *
318  * Since (after level 5) every level of the bset_tree is on a new cacheline,
319  * we're touching one fewer cacheline in the bset tree in exchange for one more
320  * cacheline in the linear search - but the linear search might stop before it
321  * gets to the second cacheline.
322  */
323
324 #define BSET_CACHELINE          128
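/*
 * Putting numbers on the comment above: with BSET_CACHELINE at 128, each
 * struct bkey_float covers 128 bytes / 16 u64s worth of keys, and once the
 * bkey_float tree has been walked the final linear search scans at most
 * roughly that many bytes - two hardware cachelines on a machine with 64
 * byte lines.
 */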
325
326 /* Space required for the btree node keys */
327 static inline size_t btree_keys_bytes(struct btree *b)
328 {
329         return PAGE_SIZE << b->page_order;
330 }
331
332 static inline size_t btree_keys_cachelines(struct btree *b)
333 {
334         return btree_keys_bytes(b) / BSET_CACHELINE;
335 }
336
337 static inline size_t btree_aux_data_bytes(struct btree *b)
338 {
339         return btree_keys_cachelines(b) * 8;
340 }
341
342 static inline size_t btree_aux_data_u64s(struct btree *b)
343 {
344         return btree_aux_data_bytes(b) / sizeof(u64);
345 }
346
347 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
348 {
349         BUG_ON(t->aux_data_offset == U16_MAX);
350
351         switch (bset_aux_tree_type(t)) {
352         case BSET_NO_AUX_TREE:
353                 return t->aux_data_offset;
354         case BSET_RO_AUX_TREE:
355                 return t->aux_data_offset +
356                         DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
357                                      sizeof(u8) * t->size, 8);
358         case BSET_RW_AUX_TREE:
359                 return t->aux_data_offset +
360                         DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
361         default:
362                 BUG();
363         }
364 }
365
366 static unsigned bset_aux_tree_buf_start(const struct btree *b,
367                                         const struct bset_tree *t)
368 {
369         return t == b->set
370                 ? DIV_ROUND_UP(b->unpack_fn_len, 8)
371                 : bset_aux_tree_buf_end(t - 1);
372 }
373
374 static void *__aux_tree_base(const struct btree *b,
375                              const struct bset_tree *t)
376 {
377         return b->aux_data + t->aux_data_offset * 8;
378 }
379
380 static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
381                                             const struct bset_tree *t)
382 {
383         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
384
385         return __aux_tree_base(b, t);
386 }
387
388 static u8 *ro_aux_tree_prev(const struct btree *b,
389                             const struct bset_tree *t)
390 {
391         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
392
393         return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
394 }
395
396 static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
397                                          unsigned idx)
398 {
399         return (void *) b + bkey_float_byte_offset(idx);
400 }
401
402 static struct bkey_float *bkey_float(const struct btree *b,
403                                      const struct bset_tree *t,
404                                      unsigned idx)
405 {
406         return bkey_float_get(ro_aux_tree_base(b, t), idx);
407 }
408
409 static void bset_aux_tree_verify(struct btree *b)
410 {
411 #ifdef CONFIG_BCACHEFS_DEBUG
412         struct bset_tree *t;
413
414         for_each_bset(b, t) {
415                 if (t->aux_data_offset == U16_MAX)
416                         continue;
417
418                 BUG_ON(t != b->set &&
419                        t[-1].aux_data_offset == U16_MAX);
420
421                 BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
422                 BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
423                 BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
424         }
425 #endif
426 }
427
428 /* Memory allocation */
429
430 void bch2_btree_keys_free(struct btree *b)
431 {
432         vfree(b->aux_data);
433         b->aux_data = NULL;
434 }
435
436 #ifndef PAGE_KERNEL_EXEC
437 # define PAGE_KERNEL_EXEC PAGE_KERNEL
438 #endif
439
440 int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
441 {
442         b->page_order   = page_order;
443         b->aux_data     = __vmalloc(btree_aux_data_bytes(b), gfp,
444                                     PAGE_KERNEL_EXEC);
445         if (!b->aux_data)
446                 return -ENOMEM;
447
448         return 0;
449 }
450
451 void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
452 {
453         unsigned i;
454
455         b->nsets                = 0;
456         memset(&b->nr, 0, sizeof(b->nr));
457 #ifdef CONFIG_BCACHEFS_DEBUG
458         b->expensive_debug_checks = expensive_debug_checks;
459 #endif
460         for (i = 0; i < MAX_BSETS; i++)
461                 b->set[i].data_offset = U16_MAX;
462
463         bch2_bset_set_no_aux_tree(b, b->set);
464 }
465
466 /* Binary tree stuff for auxiliary search trees */
467
468 /*
469  * Cacheline/offset <-> bkey pointer arithmetic:
470  *
471  * t->tree is a binary search tree in an array; each node corresponds to a key
472  * in one cacheline in t->set (BSET_CACHELINE bytes).
473  *
474  * This means we don't have to store the full index of the key that a node in
475  * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
476  * then bkey_float->key_offset gives us the offset within that cacheline, in
477  * units of 8 bytes.
478  *
479  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
480  * make this work.
481  *
482  * To construct the bfloat for an arbitrary key we need to know what the key
483  * immediately preceding it is: we have to check if the two keys differ in the
484  * bits we're going to store in bkey_float->mantissa. The prev array stores the
485  * size of the previous key so we can walk backwards to it from tree node j's key.
486  */
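/*
 * A sketch of how the helpers below compose (this just restates what
 * tree_to_bkey() and tree_to_prev_bkey() compute):
 *
 *	inorder = __eytzinger1_to_inorder(j, t->size, t->extra);
 *	k	= bset_cacheline(b, t, inorder) +
 *		  bkey_float(b, t, j)->key_offset * 8;
 *	prev	= (void *) (k->_data - ro_aux_tree_prev(b, t)[j]);
 *
 * i.e. the cacheline comes from the eytzinger position, the key's offset
 * within it from the bkey_float, and the previous key from the stored size.
 */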
487
488 static inline void *bset_cacheline(const struct btree *b,
489                                    const struct bset_tree *t,
490                                    unsigned cacheline)
491 {
492         return (void *) round_down((unsigned long) btree_bkey_first(b, t),
493                                    L1_CACHE_BYTES) +
494                 cacheline * BSET_CACHELINE;
495 }
496
497 static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
498                                              const struct bset_tree *t,
499                                              unsigned cacheline,
500                                              unsigned offset)
501 {
502         return bset_cacheline(b, t, cacheline) + offset * 8;
503 }
504
505 static unsigned bkey_to_cacheline(const struct btree *b,
506                                   const struct bset_tree *t,
507                                   const struct bkey_packed *k)
508 {
509         return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
510 }
511
512 static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
513                                           const struct bset_tree *t,
514                                           unsigned cacheline,
515                                           const struct bkey_packed *k)
516 {
517         return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
518 }
519
520 static unsigned bkey_to_cacheline_offset(const struct btree *b,
521                                          const struct bset_tree *t,
522                                          unsigned cacheline,
523                                          const struct bkey_packed *k)
524 {
525         size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
526
527         EBUG_ON(m > U8_MAX);
528         return m;
529 }
530
531 static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
532                                                const struct bset_tree *t,
533                                                unsigned j)
534 {
535         return cacheline_to_bkey(b, t,
536                         __eytzinger1_to_inorder(j, t->size, t->extra),
537                         bkey_float(b, t, j)->key_offset);
538 }
539
540 static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
541                                              const struct bset_tree *t,
542                                              unsigned j)
543 {
544         unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
545
546         return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
547 }
548
549 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
550                                        const struct bset_tree *t)
551 {
552         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
553
554         return __aux_tree_base(b, t);
555 }
556
557 /*
558  * For the write set - the one we're currently inserting keys into - we don't
559  * maintain a full search tree, we just keep a simple lookup table (the rw aux tree).
560  */
561 static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
562                                           struct bset_tree *t,
563                                           unsigned j)
564 {
565         return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
566 }
567
568 static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
569                             unsigned j, struct bkey_packed *k)
570 {
571         EBUG_ON(k >= btree_bkey_last(b, t));
572
573         rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
574                 .offset = __btree_node_key_to_offset(b, k),
575                 .k      = bkey_unpack_pos(b, k),
576         };
577 }
578
579 static void bch2_bset_verify_rw_aux_tree(struct btree *b,
580                                         struct bset_tree *t)
581 {
582         struct bkey_packed *k = btree_bkey_first(b, t);
583         unsigned j = 0;
584
585         if (!btree_keys_expensive_checks(b))
586                 return;
587
588         BUG_ON(bset_has_ro_aux_tree(t));
589
590         if (!bset_has_rw_aux_tree(t))
591                 return;
592
593         BUG_ON(t->size < 1);
594         BUG_ON(rw_aux_to_bkey(b, t, j) != k);
595
596         goto start;
597         while (1) {
598                 if (rw_aux_to_bkey(b, t, j) == k) {
599                         BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
600                                         bkey_unpack_pos(b, k)));
601 start:
602                         if (++j == t->size)
603                                 break;
604
605                         BUG_ON(rw_aux_tree(b, t)[j].offset <=
606                                rw_aux_tree(b, t)[j - 1].offset);
607                 }
608
609                 k = bkey_next(k);
610                 BUG_ON(k >= btree_bkey_last(b, t));
611         }
612 }
613
614 /* returns idx of first entry >= offset: */
615 static unsigned rw_aux_tree_bsearch(struct btree *b,
616                                     struct bset_tree *t,
617                                     unsigned offset)
618 {
619         unsigned bset_offs = offset - btree_bkey_first_offset(t);
620         unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
621         unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
622
623         EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
624         EBUG_ON(!t->size);
625         EBUG_ON(idx > t->size);
626
627         while (idx < t->size &&
628                rw_aux_tree(b, t)[idx].offset < offset)
629                 idx++;
630
631         while (idx &&
632                rw_aux_tree(b, t)[idx - 1].offset >= offset)
633                 idx--;
634
635         EBUG_ON(idx < t->size &&
636                 rw_aux_tree(b, t)[idx].offset < offset);
637         EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
638         EBUG_ON(idx + 1 < t->size &&
639                 rw_aux_tree(b, t)[idx].offset ==
640                 rw_aux_tree(b, t)[idx + 1].offset);
641
642         return idx;
643 }
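/*
 * A note on the bsearch above: the starting idx is an interpolation guess
 * that assumes keys (and therefore rw aux tree entries) are spread roughly
 * evenly through the bset; the two while loops then nudge it to the first
 * entry >= offset. In the worst case this degrades to a linear walk, but the
 * guess normally lands within a step or two.
 */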
644
645 static inline unsigned bfloat_mantissa(const struct bkey_float *f,
646                                        unsigned idx)
647 {
648         return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
649 }
650
651 static inline void bfloat_mantissa_set(struct bkey_float *f,
652                                        unsigned idx, unsigned mantissa)
653 {
654         if (idx < BFLOAT_32BIT_NR)
655                 f->mantissa32 = mantissa;
656         else
657                 f->mantissa16 = mantissa;
658 }
659
660 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
661                                      const struct bkey_float *f,
662                                      unsigned idx)
663 {
664         u64 v;
665
666         EBUG_ON(!bkey_packed(k));
667
668         v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
669
670         /*
671          * In little endian, we're shifting off low bits (and then the bits we
672          * want are at the low end), in big endian we're shifting off high bits
673          * (and then the bits we want are at the high end, so we shift them
674          * back down):
675          */
676 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
677         v >>= f->exponent & 7;
678 #else
679         v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
680 #endif
681         return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
682 }
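/*
 * To make the extraction above concrete (little endian, made-up numbers):
 * with f->exponent == 21 we load the u64 at byte 21 >> 3 == 2 of the packed
 * key, shift right by 21 & 7 == 5, and keep the low 16 bits (32 for the
 * first BFLOAT_32BIT_NR nodes) - so the mantissa is simply the run of key
 * bits starting at bit f->exponent of k->_data.
 */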
683
684 static void make_bfloat(struct btree *b, struct bset_tree *t,
685                         unsigned j,
686                         struct bkey_packed *min_key,
687                         struct bkey_packed *max_key)
688 {
689         struct bkey_float *f = bkey_float(b, t, j);
690         struct bkey_packed *m = tree_to_bkey(b, t, j);
691         struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
692         struct bkey_packed *l, *r;
693         unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
694         unsigned mantissa;
695         int shift, exponent, high_bit;
696
697         EBUG_ON(bkey_next(p) != m);
698
699         if (is_power_of_2(j)) {
700                 l = min_key;
701
702                 if (!l->u64s) {
703                         if (!bkey_pack_pos(l, b->data->min_key, b)) {
704                                 struct bkey_i tmp;
705
706                                 bkey_init(&tmp.k);
707                                 tmp.k.p = b->data->min_key;
708                                 bkey_copy(l, &tmp);
709                         }
710                 }
711         } else {
712                 l = tree_to_prev_bkey(b, t, j >> ffs(j));
713
714                 EBUG_ON(m < l);
715         }
716
717         if (is_power_of_2(j + 1)) {
718                 r = max_key;
719
720                 if (!r->u64s) {
721                         if (!bkey_pack_pos(r, t->max_key, b)) {
722                                 struct bkey_i tmp;
723
724                                 bkey_init(&tmp.k);
725                                 tmp.k.p = t->max_key;
726                                 bkey_copy(r, &tmp);
727                         }
728                 }
729         } else {
730                 r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
731
732                 EBUG_ON(m > r);
733         }
734
735         /*
736          * for failed bfloats, the lookup code falls back to comparing against
737          * the original key.
738          */
739
740         if (!bkey_packed(l) || !bkey_packed(r) ||
741             !bkey_packed(p) || !bkey_packed(m) ||
742             !b->nr_key_bits) {
743                 f->exponent = BFLOAT_FAILED_UNPACKED;
744                 return;
745         }
746
747         /*
748          * The greatest differing bit of l and r is the first bit we must
749          * include in the bfloat mantissa we're creating in order to do
750          * comparisons - that bit always becomes the high bit of
751          * bfloat->mantissa, and thus the exponent we're calculating here is
752          * the position of what will become the low bit in bfloat->mantissa:
753          *
754          * Note that this may be negative - we may be running off the low end
755          * of the key: we handle this later:
756          */
757         high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
758                        min_t(unsigned, bits, b->nr_key_bits) - 1);
759         exponent = high_bit - (bits - 1);
760
761         /*
762          * Then we calculate the actual shift value, from the start of the key
763          * (k->_data), to get the key bits starting at exponent:
764          */
765 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
766         shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
767
768         EBUG_ON(shift + bits > b->format.key_u64s * 64);
769 #else
770         shift = high_bit_offset +
771                 b->nr_key_bits -
772                 exponent -
773                 bits;
774
775         EBUG_ON(shift < KEY_PACKED_BITS_START);
776 #endif
777         EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
778
779         f->exponent = shift;
780         mantissa = bkey_mantissa(m, f, j);
781
782         /*
783          * If we've got garbage bits, set them to all 1s - it's legal for the
784          * bfloat to compare larger than the original key, but not smaller:
785          */
786         if (exponent < 0)
787                 mantissa |= ~(~0U << -exponent);
788
789         bfloat_mantissa_set(f, j, mantissa);
790
791         /*
792          * The bfloat must be able to tell its key apart from the previous key -
793          * if its key and the previous key don't differ in the required bits,
794          * flag as failed - unless the keys are actually equal, in which case
795          * we aren't required to return a specific one:
796          */
797         if (exponent > 0 &&
798             bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
799             bkey_cmp_packed(b, p, m)) {
800                 f->exponent = BFLOAT_FAILED_PREV;
801                 return;
802         }
803
804         /*
805          * f->mantissa must compare >= the original key - for transitivity with
806          * the comparison in bset_search_tree. If we're dropping set bits,
807          * increment it:
808          */
809         if (exponent > (int) bch2_bkey_ffs(b, m)) {
810                 if (j < BFLOAT_32BIT_NR
811                     ? f->mantissa32 == U32_MAX
812                     : f->mantissa16 == U16_MAX)
813                         f->exponent = BFLOAT_FAILED_OVERFLOW;
814
815                 if (j < BFLOAT_32BIT_NR)
816                         f->mantissa32++;
817                 else
818                         f->mantissa16++;
819         }
820 }
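/*
 * A worked example of the exponent/shift calculation above (little endian,
 * 16 bit mantissa, made-up numbers): say b->nr_key_bits == 72,
 * b->format.key_u64s == 2, and the greatest differing bit of l and r is
 * bit 40 of the key.  Then
 *
 *	high_bit = max(40, 16 - 1)	  = 40
 *	exponent = 40 - (16 - 1)	  = 25
 *	shift	 = (2 * 64 - 72) + 25	  = 81
 *
 * so bkey_mantissa() will pull out the 16 key bits whose low bit sits at
 * bit 81 of the packed key, with differing bit 40 ending up as the
 * mantissa's high bit, as the comment above describes.
 */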
821
822 /* bytes remaining - only valid for last bset: */
823 static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
824 {
825         bset_aux_tree_verify(b);
826
827         return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
828 }
829
830 static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
831 {
832         unsigned bytes = __bset_tree_capacity(b, t);
833
834         if (bytes < 7 * BFLOAT_32BIT_NR)
835                 return bytes / 7;
836
837         bytes -= 7 * BFLOAT_32BIT_NR;
838
839         return BFLOAT_32BIT_NR + bytes / 5;
840 }
841
842 static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
843 {
844         return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
845 }
846
847 static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
848 {
849         struct bkey_packed *k;
850
851         t->size = 1;
852         t->extra = BSET_RW_AUX_TREE_VAL;
853         rw_aux_tree(b, t)[0].offset =
854                 __btree_node_key_to_offset(b, btree_bkey_first(b, t));
855
856         for (k = btree_bkey_first(b, t);
857              k != btree_bkey_last(b, t);
858              k = bkey_next(k)) {
859                 if (t->size == bset_rw_tree_capacity(b, t))
860                         break;
861
862                 if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
863                     L1_CACHE_BYTES)
864                         rw_aux_tree_set(b, t, t->size++, k);
865         }
866 }
867
868 static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
869 {
870         struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
871         struct bkey_packed min_key, max_key;
872         unsigned j, cacheline = 1;
873
874         /* signal to make_bfloat() that they're uninitialized: */
875         min_key.u64s = max_key.u64s = 0;
876
877         t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
878                       bset_ro_tree_capacity(b, t));
879 retry:
880         if (t->size < 2) {
881                 t->size = 0;
882                 t->extra = BSET_NO_AUX_TREE_VAL;
883                 return;
884         }
885
886         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
887
888         /* First we figure out where the first key in each cacheline is */
889         eytzinger1_for_each(j, t->size) {
890                 while (bkey_to_cacheline(b, t, k) < cacheline)
891                         prev = k, k = bkey_next(k);
892
893                 if (k >= btree_bkey_last(b, t)) {
894                         /* XXX: this path sucks */
895                         t->size--;
896                         goto retry;
897                 }
898
899                 ro_aux_tree_prev(b, t)[j] = prev->u64s;
900                 bkey_float(b, t, j)->key_offset =
901                         bkey_to_cacheline_offset(b, t, cacheline++, k);
902
903                 EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
904                 EBUG_ON(tree_to_bkey(b, t, j) != k);
905         }
906
907         while (bkey_next(k) != btree_bkey_last(b, t))
908                 k = bkey_next(k);
909
910         t->max_key = bkey_unpack_pos(b, k);
911
912         /* Then we build the tree */
913         eytzinger1_for_each(j, t->size)
914                 make_bfloat(b, t, j, &min_key, &max_key);
915 }
916
917 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
918 {
919         struct bset_tree *i;
920
921         for (i = b->set; i != t; i++)
922                 BUG_ON(bset_has_rw_aux_tree(i));
923
924         bch2_bset_set_no_aux_tree(b, t);
925
926         /* round up to next cacheline: */
927         t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
928                                       SMP_CACHE_BYTES / sizeof(u64));
929
930         bset_aux_tree_verify(b);
931 }
932
933 void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
934                              bool writeable)
935 {
936         if (writeable
937             ? bset_has_rw_aux_tree(t)
938             : bset_has_ro_aux_tree(t))
939                 return;
940
941         bset_alloc_tree(b, t);
942
943         if (!__bset_tree_capacity(b, t))
944                 return;
945
946         if (writeable)
947                 __build_rw_aux_tree(b, t);
948         else
949                 __build_ro_aux_tree(b, t);
950
951         bset_aux_tree_verify(b);
952 }
953
954 void bch2_bset_init_first(struct btree *b, struct bset *i)
955 {
956         struct bset_tree *t;
957
958         BUG_ON(b->nsets);
959
960         memset(i, 0, sizeof(*i));
961         get_random_bytes(&i->seq, sizeof(i->seq));
962         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
963
964         t = &b->set[b->nsets++];
965         set_btree_bset(b, t, i);
966 }
967
968 void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
969                          struct btree_node_entry *bne)
970 {
971         struct bset *i = &bne->keys;
972         struct bset_tree *t;
973
974         BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
975         BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
976         BUG_ON(b->nsets >= MAX_BSETS);
977
978         memset(i, 0, sizeof(*i));
979         i->seq = btree_bset_first(b)->seq;
980         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
981
982         t = &b->set[b->nsets++];
983         set_btree_bset(b, t, i);
984 }
985
986 /*
987  * find _some_ key in the same bset as @k that precedes @k - not necessarily the
988  * immediate predecessor:
989  */
990 static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
991                                        struct bkey_packed *k)
992 {
993         struct bkey_packed *p;
994         unsigned offset;
995         int j;
996
997         EBUG_ON(k < btree_bkey_first(b, t) ||
998                 k > btree_bkey_last(b, t));
999
1000         if (k == btree_bkey_first(b, t))
1001                 return NULL;
1002
1003         switch (bset_aux_tree_type(t)) {
1004         case BSET_NO_AUX_TREE:
1005                 p = btree_bkey_first(b, t);
1006                 break;
1007         case BSET_RO_AUX_TREE:
1008                 j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
1009
1010                 do {
1011                         p = j ? tree_to_bkey(b, t,
1012                                         __inorder_to_eytzinger1(j--,
1013                                                         t->size, t->extra))
1014                               : btree_bkey_first(b, t);
1015                 } while (p >= k);
1016                 break;
1017         case BSET_RW_AUX_TREE:
1018                 offset = __btree_node_key_to_offset(b, k);
1019                 j = rw_aux_tree_bsearch(b, t, offset);
1020                 p = j ? rw_aux_to_bkey(b, t, j - 1)
1021                       : btree_bkey_first(b, t);
1022                 break;
1023         }
1024
1025         return p;
1026 }
1027
1028 struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
1029                                           struct bset_tree *t,
1030                                           struct bkey_packed *k,
1031                                           unsigned min_key_type)
1032 {
1033         struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
1034
1035         while ((p = __bkey_prev(b, t, k)) && !ret) {
1036                 for (i = p; i != k; i = bkey_next(i))
1037                         if (i->type >= min_key_type)
1038                                 ret = i;
1039
1040                 k = p;
1041         }
1042
1043         if (btree_keys_expensive_checks(b)) {
1044                 BUG_ON(ret >= orig_k);
1045
1046                 for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
1047                      i != orig_k;
1048                      i = bkey_next(i))
1049                         BUG_ON(i->type >= min_key_type);
1050         }
1051
1052         return ret;
1053 }
1054
1055 /* Insert */
1056
1057 static void rw_aux_tree_fix_invalidated_key(struct btree *b,
1058                                             struct bset_tree *t,
1059                                             struct bkey_packed *k)
1060 {
1061         unsigned offset = __btree_node_key_to_offset(b, k);
1062         unsigned j = rw_aux_tree_bsearch(b, t, offset);
1063
1064         if (j < t->size &&
1065             rw_aux_tree(b, t)[j].offset == offset)
1066                 rw_aux_tree_set(b, t, j, k);
1067
1068         bch2_bset_verify_rw_aux_tree(b, t);
1069 }
1070
1071 static void ro_aux_tree_fix_invalidated_key(struct btree *b,
1072                                             struct bset_tree *t,
1073                                             struct bkey_packed *k)
1074 {
1075         struct bkey_packed min_key, max_key;
1076         unsigned inorder, j;
1077
1078         EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
1079
1080         /* signal to make_bfloat() that they're uninitialized: */
1081         min_key.u64s = max_key.u64s = 0;
1082
1083         if (bkey_next(k) == btree_bkey_last(b, t)) {
1084                 t->max_key = bkey_unpack_pos(b, k);
1085
1086                 for (j = 1; j < t->size; j = j * 2 + 1)
1087                         make_bfloat(b, t, j, &min_key, &max_key);
1088         }
1089
1090         inorder = bkey_to_cacheline(b, t, k);
1091
1092         if (inorder &&
1093             inorder < t->size) {
1094                 j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1095
1096                 if (k == tree_to_bkey(b, t, j)) {
1097                         /* Fix the node this key corresponds to */
1098                         make_bfloat(b, t, j, &min_key, &max_key);
1099
1100                         /* Children for which this key is the right boundary */
1101                         for (j = eytzinger1_left_child(j);
1102                              j < t->size;
1103                              j = eytzinger1_right_child(j))
1104                                 make_bfloat(b, t, j, &min_key, &max_key);
1105                 }
1106         }
1107
1108         if (inorder + 1 < t->size) {
1109                 j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
1110
1111                 if (k == tree_to_prev_bkey(b, t, j)) {
1112                         make_bfloat(b, t, j, &min_key, &max_key);
1113
1114                         /* Children for which this key is the left boundary */
1115                         for (j = eytzinger1_right_child(j);
1116                              j < t->size;
1117                              j = eytzinger1_left_child(j))
1118                                 make_bfloat(b, t, j, &min_key, &max_key);
1119                 }
1120         }
1121 }
1122
1123 /**
1124  * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
1125  * modified, fix any auxiliary search tree by remaking all the nodes in the
1126  * auxiliary search tree that @k corresponds to
1127  */
1128 void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
1129 {
1130         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1131
1132         switch (bset_aux_tree_type(t)) {
1133         case BSET_NO_AUX_TREE:
1134                 break;
1135         case BSET_RO_AUX_TREE:
1136                 ro_aux_tree_fix_invalidated_key(b, t, k);
1137                 break;
1138         case BSET_RW_AUX_TREE:
1139                 rw_aux_tree_fix_invalidated_key(b, t, k);
1140                 break;
1141         }
1142 }
1143
1144 static void bch2_bset_fix_lookup_table(struct btree *b,
1145                                        struct bset_tree *t,
1146                                        struct bkey_packed *_where,
1147                                        unsigned clobber_u64s,
1148                                        unsigned new_u64s)
1149 {
1150         int shift = new_u64s - clobber_u64s;
1151         unsigned l, j, where = __btree_node_key_to_offset(b, _where);
1152
1153         EBUG_ON(bset_has_ro_aux_tree(t));
1154
1155         if (!bset_has_rw_aux_tree(t))
1156                 return;
1157
1158         /* returns first entry >= where */
1159         l = rw_aux_tree_bsearch(b, t, where);
1160
1161         if (!l) /* never delete first entry */
1162                 l++;
1163         else if (l < t->size &&
1164                  where < t->end_offset &&
1165                  rw_aux_tree(b, t)[l].offset == where)
1166                 rw_aux_tree_set(b, t, l++, _where);
1167
1168         /* l now > where */
1169
1170         for (j = l;
1171              j < t->size &&
1172              rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
1173              j++)
1174                 ;
1175
1176         if (j < t->size &&
1177             rw_aux_tree(b, t)[j].offset + shift ==
1178             rw_aux_tree(b, t)[l - 1].offset)
1179                 j++;
1180
1181         memmove(&rw_aux_tree(b, t)[l],
1182                 &rw_aux_tree(b, t)[j],
1183                 (void *) &rw_aux_tree(b, t)[t->size] -
1184                 (void *) &rw_aux_tree(b, t)[j]);
1185         t->size -= j - l;
1186
1187         for (j = l; j < t->size; j++)
1188                rw_aux_tree(b, t)[j].offset += shift;
1189
1190         EBUG_ON(l < t->size &&
1191                 rw_aux_tree(b, t)[l].offset ==
1192                 rw_aux_tree(b, t)[l - 1].offset);
1193
1194         if (t->size < bset_rw_tree_capacity(b, t) &&
1195             (l < t->size
1196              ? rw_aux_tree(b, t)[l].offset
1197              : t->end_offset) -
1198             rw_aux_tree(b, t)[l - 1].offset >
1199             L1_CACHE_BYTES / sizeof(u64)) {
1200                 struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
1201                 struct bkey_packed *end = l < t->size
1202                         ? rw_aux_to_bkey(b, t, l)
1203                         : btree_bkey_last(b, t);
1204                 struct bkey_packed *k = start;
1205
1206                 while (1) {
1207                         k = bkey_next(k);
1208                         if (k == end)
1209                                 break;
1210
1211                         if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
1212                                 memmove(&rw_aux_tree(b, t)[l + 1],
1213                                         &rw_aux_tree(b, t)[l],
1214                                         (void *) &rw_aux_tree(b, t)[t->size] -
1215                                         (void *) &rw_aux_tree(b, t)[l]);
1216                                 t->size++;
1217                                 rw_aux_tree_set(b, t, l, k);
1218                                 break;
1219                         }
1220                 }
1221         }
1222
1223         bch2_bset_verify_rw_aux_tree(b, t);
1224         bset_aux_tree_verify(b);
1225 }
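/*
 * An illustration of the fixup above (offsets are u64s within the node, and
 * the numbers are made up): suppose the rw aux tree has entries at offsets
 * { 8, 24, 40, 56 } and a 6 u64 key at offset 24 is replaced by a 3 u64 one
 * (clobber_u64s = 6, new_u64s = 3, shift = -3).  The entry at 24 is
 * refreshed to describe the new key, any entries that pointed into the
 * clobbered range would be dropped (none here), 40 and 56 become 37 and 53,
 * and if the gap at the fixup point now exceeds a cacheline's worth of u64s
 * a new entry is inserted to cover it.
 */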
1226
1227 void bch2_bset_insert(struct btree *b,
1228                       struct btree_node_iter *iter,
1229                       struct bkey_packed *where,
1230                       struct bkey_i *insert,
1231                       unsigned clobber_u64s)
1232 {
1233         struct bkey_format *f = &b->format;
1234         struct bset_tree *t = bset_tree_last(b);
1235         struct bkey_packed packed, *src = bkey_to_packed(insert);
1236
1237         bch2_bset_verify_rw_aux_tree(b, t);
1238         bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
1239
1240         if (bch2_bkey_pack_key(&packed, &insert->k, f))
1241                 src = &packed;
1242
1243         if (!bkey_whiteout(&insert->k))
1244                 btree_keys_account_key_add(&b->nr, t - b->set, src);
1245
1246         if (src->u64s != clobber_u64s) {
1247                 u64 *src_p = where->_data + clobber_u64s;
1248                 u64 *dst_p = where->_data + src->u64s;
1249
1250                 EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
1251                         (int) clobber_u64s - src->u64s);
1252
1253                 memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1254                 le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
1255                 set_btree_bset_end(b, t);
1256         }
1257
1258         memcpy_u64s(where, src,
1259                     bkeyp_key_u64s(f, src));
1260         memcpy_u64s(bkeyp_val(f, where), &insert->v,
1261                     bkeyp_val_u64s(f, src));
1262
1263         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
1264
1265         bch2_verify_btree_nr_keys(b);
1266 }
1267
1268 void bch2_bset_delete(struct btree *b,
1269                       struct bkey_packed *where,
1270                       unsigned clobber_u64s)
1271 {
1272         struct bset_tree *t = bset_tree_last(b);
1273         u64 *src_p = where->_data + clobber_u64s;
1274         u64 *dst_p = where->_data;
1275
1276         bch2_bset_verify_rw_aux_tree(b, t);
1277
1278         EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
1279
1280         memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
1281         le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
1282         set_btree_bset_end(b, t);
1283
1284         bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
1285 }
1286
1287 /* Lookup */
1288
1289 __flatten
1290 static struct bkey_packed *bset_search_write_set(const struct btree *b,
1291                                 struct bset_tree *t,
1292                                 struct bpos *search,
1293                                 const struct bkey_packed *packed_search)
1294 {
1295         unsigned l = 0, r = t->size;
1296
1297         while (l + 1 != r) {
1298                 unsigned m = (l + r) >> 1;
1299
1300                 if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
1301                         l = m;
1302                 else
1303                         r = m;
1304         }
1305
1306         return rw_aux_to_bkey(b, t, l);
1307 }
1308
1309 noinline
1310 static int bset_search_tree_slowpath(const struct btree *b,
1311                                 struct bset_tree *t, struct bpos *search,
1312                                 const struct bkey_packed *packed_search,
1313                                 unsigned n)
1314 {
1315         return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
1316                                  packed_search, search) < 0;
1317 }
1318
1319 __flatten
1320 static struct bkey_packed *bset_search_tree(const struct btree *b,
1321                                 struct bset_tree *t,
1322                                 struct bpos *search,
1323                                 const struct bkey_packed *packed_search)
1324 {
1325         struct ro_aux_tree *base = ro_aux_tree_base(b, t);
1326         struct bkey_float *f = bkey_float_get(base, 1);
1327         void *p;
1328         unsigned inorder, n = 1;
1329
1330         while (1) {
1331                 if (likely(n << 4 < t->size)) {
1332                         p = bkey_float_get(base, n << 4);
1333                         prefetch(p);
1334                 } else if (n << 3 < t->size) {
1335                         inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
1336                         p = bset_cacheline(b, t, inorder);
1337 #ifdef CONFIG_X86_64
1338                         asm(".intel_syntax noprefix;"
1339                             "prefetcht0 [%0 - 127 + 64 * 0];"
1340                             "prefetcht0 [%0 - 127 + 64 * 1];"
1341                             "prefetcht0 [%0 - 127 + 64 * 2];"
1342                             "prefetcht0 [%0 - 127 + 64 * 3];"
1343                             ".att_syntax prefix;"
1344                             :
1345                             : "r" (p + 127));
1346 #else
1347                         prefetch(p + L1_CACHE_BYTES * 0);
1348                         prefetch(p + L1_CACHE_BYTES * 1);
1349                         prefetch(p + L1_CACHE_BYTES * 2);
1350                         prefetch(p + L1_CACHE_BYTES * 3);
1351 #endif
1352                 } else if (n >= t->size)
1353                         break;
1354
1355                 f = bkey_float_get(base, n);
1356
1357                 if (packed_search &&
1358                     likely(f->exponent < BFLOAT_FAILED))
1359                         n = n * 2 + (bfloat_mantissa(f, n) <
1360                                      bkey_mantissa(packed_search, f, n));
1361                 else
1362                         n = n * 2 + bset_search_tree_slowpath(b, t,
1363                                                 search, packed_search, n);
1364         } while (n < t->size);
1365
1366         inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
1367
1368         /*
1369          * n would have been the node we recursed to - the low bit tells us if
1370          * we recursed left or recursed right.
1371          */
1372         if (n & 1) {
1373                 return cacheline_to_bkey(b, t, inorder, f->key_offset);
1374         } else {
1375                 if (--inorder) {
1376                         n = eytzinger1_prev(n >> 1, t->size);
1377                         f = bkey_float_get(base, n);
1378                         return cacheline_to_bkey(b, t, inorder, f->key_offset);
1379                 } else
1380                         return btree_bkey_first(b, t);
1381         }
1382 }
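/*
 * A note on the prefetching above: while examining node n, the loop
 * prefetches the bkey_float for node n << 4 - the first of the sixteen
 * descendants four levels down, which sit contiguously in the eytzinger
 * array - and once those descendants fall outside the tree it instead
 * prefetches the bset cachelines around the in-order position of n, since
 * the tree walk is about to hand off to the linear scan within a cacheline.
 */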
1383
1384 /*
1385  * Returns the first key greater than or equal to @search
1386  */
1387 __always_inline __flatten
1388 static struct bkey_packed *bch2_bset_search(struct btree *b,
1389                                 struct bset_tree *t,
1390                                 struct bpos *search,
1391                                 struct bkey_packed *packed_search,
1392                                 const struct bkey_packed *lossy_packed_search)
1393 {
1394         struct bkey_packed *m;
1395
1396         /*
1397          * First, we search for a cacheline, then we do a linear search within
1398          * that cacheline.
1399          *
1400          * To search for the cacheline, there are three different possibilities:
1401          *  * The set is too small to have a search tree, so we just do a linear
1402          *    search over the whole set.
1403          *  * The set is the one we're currently inserting into; keeping a full
1404          *    auxiliary search tree up to date would be too expensive, so we
1405          *    use a much simpler lookup table to do a binary search -
1406          *    bset_search_write_set().
1407          *  * Or we use the auxiliary search tree we constructed earlier -
1408          *    bset_search_tree()
1409          */
1410
1411         switch (bset_aux_tree_type(t)) {
1412         case BSET_NO_AUX_TREE:
1413                 m = btree_bkey_first(b, t);
1414                 break;
1415         case BSET_RW_AUX_TREE:
1416                 m = bset_search_write_set(b, t, search, lossy_packed_search);
1417                 break;
1418         case BSET_RO_AUX_TREE:
1419                 /*
1420                  * Each node in the auxiliary search tree covers a certain range
1421                  * of bits, and keys above and below the set it covers might
1422                  * differ outside those bits - so we have to special case the
1423                  * start and end - handle that here:
1424                  */
1425
1426                 if (bkey_cmp(*search, t->max_key) > 0)
1427                         return btree_bkey_last(b, t);
1428
1429                 m = bset_search_tree(b, t, search, lossy_packed_search);
1430                 break;
1431         }
1432
1433         if (lossy_packed_search)
1434                 while (m != btree_bkey_last(b, t) &&
1435                        bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
1436                                               m) > 0)
1437                         m = bkey_next(m);
1438
1439         if (!packed_search)
1440                 while (m != btree_bkey_last(b, t) &&
1441                        bkey_iter_pos_cmp(b, search, m) > 0)
1442                         m = bkey_next(m);
1443
1444         if (btree_keys_expensive_checks(b)) {
1445                 struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
1446
1447                 BUG_ON(prev &&
1448                        bkey_iter_cmp_p_or_unp(b, search, packed_search,
1449                                               prev) <= 0);
1450         }
1451
1452         return m;
1453 }
1454
1455 /* Btree node iterator */
1456
1457 static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
1458                               struct btree *b,
1459                               const struct bkey_packed *k,
1460                               const struct bkey_packed *end)
1461 {
1462         if (k != end) {
1463                 struct btree_node_iter_set *pos;
1464
1465                 btree_node_iter_for_each(iter, pos)
1466                         ;
1467
1468                 BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
1469                 *pos = (struct btree_node_iter_set) {
1470                         __btree_node_key_to_offset(b, k),
1471                         __btree_node_key_to_offset(b, end)
1472                 };
1473         }
1474 }
1475
1476 void bch2_btree_node_iter_push(struct btree_node_iter *iter,
1477                                struct btree *b,
1478                                const struct bkey_packed *k,
1479                                const struct bkey_packed *end)
1480 {
1481         __bch2_btree_node_iter_push(iter, b, k, end);
1482         bch2_btree_node_iter_sort(iter, b);
1483 }
1484
1485 noinline __flatten __attribute__((cold))
1486 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
1487                               struct btree *b, struct bpos *search)
1488 {
1489         struct bset_tree *t;
1490
1491         trace_bkey_pack_pos_fail(search);
1492
1493         for_each_bset(b, t)
1494                 __bch2_btree_node_iter_push(iter, b,
1495                         bch2_bset_search(b, t, search, NULL, NULL),
1496                         btree_bkey_last(b, t));
1497
1498         bch2_btree_node_iter_sort(iter, b);
1499 }
1500
1501 /**
1502  * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
1503  * given position
1504  *
1505  * Main entry point to the lookup code for individual btree nodes:
1506  *
1507  * NOTE:
1508  *
1509  * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
1510  * keys. This doesn't matter for most code, but it does matter for lookups.
1511  *
1512  * Consider a sequence of adjacent keys containing a run of equal keys:
1513  *      i j k k k k l m
1514  *
1515  * If you search for k, the lookup code isn't guaranteed to return any
1516  * particular k. The lookup is conceptually a binary search, and iterating
1517  * backwards is expensive, so if the pivot happens to land on the last k,
1518  * that's the one you'll get.
1519  *
1520  * This works out ok, but it's something to be aware of:
1521  *
1522  *  - For non-extents, we guarantee that the live key comes last - see
1523  *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
1524  *    see will only be deleted keys you don't care about.
1525  *
1526  *  - For extents, deleted keys sort last (see the comment at the top of this
1527  *    file). But when you're searching for extents, you actually want the first
1528  *    key strictly greater than your search key - an extent that compares equal
1529  *    to the search key is going to have 0 sectors after the search key.
1530  *
1531  *    But this does mean that we can't just search for
1532  *    bkey_successor(start_of_range) to get the first extent that overlaps with
1533  *    the range we want - if we're unlucky and there's an extent that ends
1534  *    exactly where we searched, then there could be a deleted key at the same
1535  *    position and we'd get that when we search instead of the preceding extent
1536  *    we needed.
1537  *
1538  *    So we've got to search for start_of_range, then after the lookup iterate
1539  *    past any extents that compare equal to the position we searched for.
1540  */
1541 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
1542                                struct btree *b, struct bpos *search)
1543 {
1544         struct bset_tree *t;
1545         struct bkey_packed p, *packed_search = NULL;
1546
1547         EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
1548         bset_aux_tree_verify(b);
1549
1550         memset(iter, 0, sizeof(*iter));
1551
1552         switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
1553         case BKEY_PACK_POS_EXACT:
1554                 packed_search = &p;
1555                 break;
1556         case BKEY_PACK_POS_SMALLER:
1557                 packed_search = NULL;
1558                 break;
1559         case BKEY_PACK_POS_FAIL:
1560                 btree_node_iter_init_pack_failed(iter, b, search);
1561                 return;
1562         }
1563
1564         for_each_bset(b, t)
1565                 __bch2_btree_node_iter_push(iter, b,
1566                                            bch2_bset_search(b, t, search,
1567                                                             packed_search, &p),
1568                                            btree_bkey_last(b, t));
1569
1570         bch2_btree_node_iter_sort(iter, b);
1571 }
1572
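/*
 * Illustrative sketch only, not part of the build: one way a caller might walk
 * keys from a position using the iterator above, per the lookup semantics
 * documented at bch2_btree_node_iter_init(). "search_pos" is a hypothetical
 * struct bpos supplied by the caller; error handling and any extent-specific
 * handling are omitted:
 *
 *	struct btree_node_iter iter;
 *	struct bkey unpacked;
 *	struct bkey_s_c k;
 *
 *	bch2_btree_node_iter_init(&iter, b, &search_pos);
 *
 *	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
 *		pr_debug("%llu:%llu\n", k.k->p.inode, k.k->p.offset);
 *		bch2_btree_node_iter_advance(&iter, b);
 *	}
 */
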
1573 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
1574                                           struct btree *b)
1575 {
1576         struct bset_tree *t;
1577
1578         memset(iter, 0, sizeof(*iter));
1579
1580         for_each_bset(b, t)
1581                 __bch2_btree_node_iter_push(iter, b,
1582                                            btree_bkey_first(b, t),
1583                                            btree_bkey_last(b, t));
1584         bch2_btree_node_iter_sort(iter, b);
1585 }
1586
1587 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
1588                                                   struct btree *b,
1589                                                   struct bset_tree *t)
1590 {
1591         struct btree_node_iter_set *set;
1592
1593         btree_node_iter_for_each(iter, set)
1594                 if (set->end == t->end_offset)
1595                         return __btree_node_offset_to_key(b, set->k);
1596
1597         return btree_bkey_last(b, t);
1598 }
1599
1600 static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
1601                                             struct btree *b,
1602                                             unsigned first)
1603 {
1604         bool ret;
1605
1606         if ((ret = (btree_node_iter_cmp(b,
1607                                         iter->data[first],
1608                                         iter->data[first + 1]) > 0)))
1609                 swap(iter->data[first], iter->data[first + 1]);
1610         return ret;
1611 }
1612
1613 void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
1614                                struct btree *b)
1615 {
1616         /* unrolled bubble sort: at most 3 sets, so 3 compares fully sort: */
1617
1618         if (!__btree_node_iter_set_end(iter, 2)) {
1619                 btree_node_iter_sort_two(iter, b, 0);
1620                 btree_node_iter_sort_two(iter, b, 1);
1621         }
1622
1623         if (!__btree_node_iter_set_end(iter, 1))
1624                 btree_node_iter_sort_two(iter, b, 0);
1625 }
1626
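/*
 * Remove @set from the iterator: shift the later sets down and clear the
 * freed slot at the end:
 */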
1627 void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
1628                                    struct btree_node_iter_set *set)
1629 {
1630         struct btree_node_iter_set *last =
1631                 iter->data + ARRAY_SIZE(iter->data) - 1;
1632
1633         memmove(&set[0], &set[1], (void *) last - (void *) set);
1634         *last = (struct btree_node_iter_set) { 0, 0 };
1635 }
1636
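/*
 * Advance the set at the head of the iterator past its current key; drop the
 * set if it's now empty, otherwise bubble the head entry back down to restore
 * sorted order (at most two compares, since there are at most three sets):
 */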
1637 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1638                                                   struct btree *b)
1639 {
1640         iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
1641
1642         EBUG_ON(iter->data->k > iter->data->end);
1643
1644         if (unlikely(__btree_node_iter_set_end(iter, 0))) {
1645                 bch2_btree_node_iter_set_drop(iter, iter->data);
1646                 return;
1647         }
1648
1649         if (__btree_node_iter_set_end(iter, 1))
1650                 return;
1651
1652         if (!btree_node_iter_sort_two(iter, b, 0))
1653                 return;
1654
1655         if (__btree_node_iter_set_end(iter, 2))
1656                 return;
1657
1658         btree_node_iter_sort_two(iter, b, 1);
1659 }
1660
1661 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
1662                                   struct btree *b)
1663 {
1664         if (btree_keys_expensive_checks(b)) {
1665                 bch2_btree_node_iter_verify(iter, b);
1666                 bch2_btree_node_iter_next_check(iter, b);
1667         }
1668
1669         __bch2_btree_node_iter_advance(iter, b);
1670 }
1671
1672 static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
1673 {
1674         unsigned n = ARRAY_SIZE(iter->data);
1675
1676         while (n && __btree_node_iter_set_end(iter, n - 1))
1677                 --n;
1678
1679         return n;
1680 }
1681
1682 /*
1683  * Expensive: finding the previous key means checking every bset:
1684  */
1685 struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
1686                                                      struct btree *b,
1687                                                      unsigned min_key_type)
1688 {
1689         struct bkey_packed *k, *prev = NULL;
1690         struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
1691         struct btree_node_iter_set *set;
1692         struct bset_tree *t;
1693         unsigned end = 0;
1694
1695         bch2_btree_node_iter_verify(iter, b);
1696
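        /*
         * The previous key overall is the largest of the per-bset keys that
         * precede the iterator's position in each bset:
         */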
1697         for_each_bset(b, t) {
1698                 k = bch2_bkey_prev_filter(b, t,
1699                         bch2_btree_node_iter_bset_pos(iter, b, t),
1700                         min_key_type);
1701                 if (k &&
1702                     (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
1703                         prev = k;
1704                         end = t->end_offset;
1705                 }
1706         }
1707
1708         if (!prev)
1709                 goto out;
1710
1711         /*
1712          * We're manually memmoving instead of just calling sort() to ensure the
1713          * prev we picked ends up in slot 0 - sort won't necessarily put it
1714          * there because of duplicate deleted keys:
1715          */
1716         btree_node_iter_for_each(iter, set)
1717                 if (set->end == end)
1718                         goto found;
1719
1720         BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
1721 found:
1722         BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
1723
1724         memmove(&iter->data[1],
1725                 &iter->data[0],
1726                 (void *) set - (void *) &iter->data[0]);
1727
1728         iter->data[0].k = __btree_node_key_to_offset(b, prev);
1729         iter->data[0].end = end;
1730 out:
1731         if (btree_keys_expensive_checks(b)) {
1732                 struct btree_node_iter iter2 = *iter;
1733
1734                 if (prev)
1735                         __bch2_btree_node_iter_advance(&iter2, b);
1736
1737                 while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
1738                         BUG_ON(k->type >= min_key_type);
1739                         __bch2_btree_node_iter_advance(&iter2, b);
1740                 }
1741         }
1742
1743         return prev;
1744 }
1745
1746 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
1747                                                  struct btree *b,
1748                                                  struct bkey *u)
1749 {
1750         struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
1751
1752         return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
1753 }
1754
1755 /* Mergesort */
1756
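/*
 * Accumulate per-bset statistics: counts and sizes by aux tree type, plus
 * bkey_float failure counts for ro aux trees:
 */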
1757 void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
1758 {
1759         struct bset_tree *t;
1760
1761         for_each_bset(b, t) {
1762                 enum bset_aux_tree_type type = bset_aux_tree_type(t);
1763                 size_t j;
1764
1765                 stats->sets[type].nr++;
1766                 stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
1767                         sizeof(u64);
1768
1769                 if (bset_has_ro_aux_tree(t)) {
1770                         stats->floats += t->size - 1;
1771
1772                         for (j = 1; j < t->size; j++)
1773                                 switch (bkey_float(b, t, j)->exponent) {
1774                                 case BFLOAT_FAILED_UNPACKED:
1775                                         stats->failed_unpacked++;
1776                                         break;
1777                                 case BFLOAT_FAILED_PREV:
1778                                         stats->failed_prev++;
1779                                         break;
1780                                 case BFLOAT_FAILED_OVERFLOW:
1781                                         stats->failed_overflow++;
1782                                         break;
1783                                 }
1784                 }
1785         }
1786 }
1787
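/*
 * Print diagnostics for a key whose ro aux tree entry (bkey_float) couldn't be
 * packed - lookups fall back to comparing against the original key:
 */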
1788 void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
1789                          struct bkey_packed *k)
1790 {
1791         struct bset_tree *t = bch2_bkey_to_bset(b, k);
1792         struct bkey_packed *l, *r, *p;
1793         struct bkey uk, up;
1794         char buf1[200], buf2[200];
1795         unsigned j, inorder;
1796
1797         if (out->pos != out->end)
1798                 *out->pos = '\0';
1799
1800         if (!bset_has_ro_aux_tree(t))
1801                 return;
1802
1803         inorder = bkey_to_cacheline(b, t, k);
1804         if (!inorder || inorder >= t->size)
1805                 return;
1806
1807         j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
1808         if (k != tree_to_bkey(b, t, j))
1809                 return;
1810
1811         switch (bkey_float(b, t, j)->exponent) {
1812         case BFLOAT_FAILED_UNPACKED:
1813                 uk = bkey_unpack_key(b, k);
1814                 pr_buf(out,
1815                        "    failed unpacked at depth %u\n"
1816                        "\t%llu:%llu\n",
1817                        ilog2(j),
1818                        uk.p.inode, uk.p.offset);
1819                 break;
1820         case BFLOAT_FAILED_PREV:
1821                 p = tree_to_prev_bkey(b, t, j);
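                /*
                 * l and r bound the range of keys this aux tree node covers
                 * (mirroring the computation in make_bfloat()):
                 */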
1822                 l = is_power_of_2(j)
1823                         ? btree_bkey_first(b, t)
1824                         : tree_to_prev_bkey(b, t, j >> ffs(j));
1825                 r = is_power_of_2(j + 1)
1826                         ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
1827                         : tree_to_bkey(b, t, j >> (ffz(j) + 1));
1828
1829                 up = bkey_unpack_key(b, p);
1830                 uk = bkey_unpack_key(b, k);
1831                 bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
1832                 bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
1833
1834                 pr_buf(out,
1835                        "    failed prev at depth %u\n"
1836                        "\tkey starts at bit %u but first differing bit at %u\n"
1837                        "\t%llu:%llu\n"
1838                        "\t%llu:%llu\n"
1839                        "\t%s\n"
1840                        "\t%s\n",
1841                        ilog2(j),
1842                        bch2_bkey_greatest_differing_bit(b, l, r),
1843                        bch2_bkey_greatest_differing_bit(b, p, k),
1844                        uk.p.inode, uk.p.offset,
1845                        up.p.inode, up.p.offset,
1846                        buf1, buf2);
1847                 break;
1848         case BFLOAT_FAILED_OVERFLOW:
1849                 uk = bkey_unpack_key(b, k);
1850                 pr_buf(out,
1851                        "    failed overflow at depth %u\n"
1852                        "\t%llu:%llu\n",
1853                        ilog2(j),
1854                        uk.p.inode, uk.p.offset);
1855                 break;
1856         }
1857 }