#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"

#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>

const char * const bch2_btree_ids[] = {
#define x(kwd, val, name) name,
        BCH_BTREE_IDS()
#undef x
        NULL
};

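/*
 * Size the reserve of cached btree nodes we always keep around: a base of 16,
 * plus 8 if the first btree root hasn't been read in yet, plus 8 for every
 * root whose tree has interior nodes (level > 0).
 */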
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
        unsigned i, reserve = 16;

        if (!c->btree_roots[0].b)
                reserve += 8;

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        reserve += min_t(unsigned, 1,
                                         c->btree_roots[i].b->level) * 8;

        c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
        return max_t(int, 0, bc->used - bc->reserve);
}

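/*
 * Free a node's data buffer and bset lookup tables, but not struct btree
 * itself: btree_node_data_free() also drops the node from the cache's used
 * count and moves it to the freed list so the struct btree can be reused.
 */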
static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        EBUG_ON(btree_node_write_in_flight(b));

        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
        bch2_btree_keys_free(b);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        struct btree_cache *bc = &c->btree_cache;

        __btree_node_data_free(c, b);
        bc->used--;
        list_move(&b->list, &bc->freed);
}

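/*
 * Nodes in the cache are indexed by the first pointer in their key
 * (PTR_HASH()) rather than by btree position - lookups take the bkey that
 * points at the node.
 */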
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                   const void *obj)
{
        const struct btree *b = obj;
        const u64 *v = arg->key;

        return PTR_HASH(&b->key) == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, key.v),
        .key_len        = sizeof(struct bch_extent_ptr),
        .obj_cmpfn      = bch2_btree_cache_cmp_fn,
};

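/*
 * Allocate the data buffer and bset lookup tables for an existing struct
 * btree: on success the node is moved to the freeable list, on failure to the
 * freed list with no data attached.
 */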
static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
        struct btree_cache *bc = &c->btree_cache;

        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                goto err;

        if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
                goto err;

        bc->used++;
        list_move(&b->list, &bc->freeable);
        return;
err:
        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
        list_move(&b->list, &bc->freed);
}

static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
        struct btree *b = kzalloc(sizeof(struct btree), gfp);
        if (!b)
                return NULL;

        bkey_btree_ptr_init(&b->key);
        six_lock_init(&b->lock);
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);

        btree_node_data_alloc(c, b, gfp);
        return b->data ? b : NULL;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
        rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

        /* Cause future lookups for this node to fail: */
        PTR_HASH(&b->key) = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
        return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
                                             bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
                                unsigned level, enum btree_id id)
{
        int ret;

        b->level        = level;
        b->btree_id     = id;

        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
                list_add(&b->list, &bc->live);
        mutex_unlock(&bc->lock);

        return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
                                     const struct bkey_i *k)
{
        return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
                                      bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;

        lockdep_assert_held(&bc->lock);

        if (!six_trylock_intent(&b->lock))
                return -ENOMEM;

        if (!six_trylock_write(&b->lock))
                goto out_unlock_intent;

        if (btree_node_noevict(b))
                goto out_unlock;

        if (!btree_node_may_write(b))
                goto out_unlock;

        if (btree_node_dirty(b) &&
            test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                goto out_unlock;

        if (btree_node_dirty(b) ||
            btree_node_write_in_flight(b) ||
            btree_node_read_in_flight(b)) {
                if (!flush)
                        goto out_unlock;

                wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
                               TASK_UNINTERRUPTIBLE);

                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
                if (verify_btree_ondisk(c))
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
                        __bch2_btree_node_write(c, b, SIX_LOCK_read);

                /* wait for any in flight btree write */
                btree_node_wait_on_io(b);
        }
out:
        if (PTR_HASH(&b->key) && !ret)
                trace_btree_node_reap(c, b);
        return ret;
out_unlock:
        six_unlock_write(&b->lock);
out_unlock_intent:
        six_unlock_intent(&b->lock);
        ret = -ENOMEM;
        goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, true);
}

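/*
 * Shrinker scan callback: walk the freeable list first, then the live list,
 * freeing the data buffers of nodes we can reclaim. The shrinker works in
 * units of pages - both nr_to_scan and the return value are scaled by
 * btree_pages(c).
 */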
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free;
        unsigned long touched = 0;
        unsigned long freed = 0;
        unsigned i;

        if (btree_shrinker_disabled(c))
                return SHRINK_STOP;

        /* Return -1 if we can't do anything right now */
        if (sc->gfp_mask & __GFP_IO)
                mutex_lock(&bc->lock);
        else if (!mutex_trylock(&bc->lock))
                return -1;

        /*
         * It's _really_ critical that we don't free too many btree nodes - we
         * have to always leave ourselves a reserve. The reserve is how we
         * guarantee that allocating memory for a new btree node can always
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
        nr /= btree_pages(c);
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);

        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
                touched++;

                if (freed >= nr)
                        break;

                if (++i > 3 &&
                    !btree_node_reclaim(c, b)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->lock);
                        six_unlock_intent(&b->lock);
                        freed++;
                }
        }
restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;

                if (freed >= nr) {
                        /* Save position */
                        if (&t->list != &bc->live)
                                list_move_tail(&bc->live, &t->list);
                        break;
                }

                if (!btree_node_accessed(b) &&
                    !btree_node_reclaim(c, b)) {
                        /* can't call bch2_btree_node_hash_remove under lock  */
                        freed++;
                        if (&t->list != &bc->live)
                                list_move_tail(&bc->live, &t->list);

                        btree_node_data_free(c, b);
                        mutex_unlock(&bc->lock);

                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->lock);
                        six_unlock_intent(&b->lock);

                        if (freed >= nr)
                                goto out;

                        if (sc->gfp_mask & __GFP_IO)
                                mutex_lock(&bc->lock);
                        else if (!mutex_trylock(&bc->lock))
                                goto out;
                        goto restart;
                } else
                        clear_btree_node_accessed(b);
        }

        mutex_unlock(&bc->lock);
out:
        return (unsigned long) freed * btree_pages(c);
}

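/* Shrinker count callback: how many pages worth of cached nodes could be freed */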
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;

        if (btree_shrinker_disabled(c))
                return 0;

        return btree_cache_can_free(bc) * btree_pages(c);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        unsigned i;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        mutex_lock(&bc->lock);

#ifdef CONFIG_BCACHEFS_DEBUG
        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);

        kvpfree(c->verify_ondisk, btree_bytes(c));
#endif

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        list_add(&c->btree_roots[i].b->list, &bc->live);

        list_splice(&bc->freeable, &bc->live);

        while (!list_empty(&bc->live)) {
                b = list_first_entry(&bc->live, struct btree, list);

                BUG_ON(btree_node_read_in_flight(b) ||
                       btree_node_write_in_flight(b));

                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
                clear_btree_node_dirty(b);

                btree_node_data_free(c, b);
        }

        while (!list_empty(&bc->freed)) {
                b = list_first_entry(&bc->freed, struct btree, list);
                list_del(&b->list);
                kfree(b);
        }

        mutex_unlock(&bc->lock);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
        int ret = 0;

        pr_verbose_init(c->opts, "");

        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
                goto out;

        bc->table_init_done = true;

        bch2_recalc_btree_reserve(c);

        for (i = 0; i < bc->reserve; i++)
                if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
                        ret = -ENOMEM;
                        goto out;
                }

        list_splice_init(&bc->live, &bc->freeable);

#ifdef CONFIG_BCACHEFS_DEBUG
        mutex_init(&c->verify_lock);

        c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
        if (!c->verify_ondisk) {
                ret = -ENOMEM;
                goto out;
        }

        c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
        if (!c->verify_data) {
                ret = -ENOMEM;
                goto out;
        }

        list_del_init(&c->verify_data->list);
#endif

        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.seeks                = 4;
        bc->shrink.batch                = btree_pages(c) * 2;
        register_shrinker(&bc->shrink);
out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;

        if (bc->alloc_lock == current) {
                trace_btree_node_cannibalize_unlock(c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
}

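/*
 * Take the cannibalize lock: with a NULL closure this fails immediately with
 * -ENOMEM instead of waiting; otherwise we're added to the waitlist and get
 * -EAGAIN. The usual caller pattern (illustrative only, not taken from this
 * file) is roughly:
 *
 *        bch2_btree_cache_cannibalize_lock(c, &cl);
 *        b = bch2_btree_node_mem_alloc(c);
 *        bch2_btree_cache_cannibalize_unlock(c);
 */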
int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
        struct btree_cache *bc = &c->btree_cache;
        struct task_struct *old;

        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current)
                goto success;

        if (!cl) {
                trace_btree_node_cannibalize_lock_fail(c);
                return -ENOMEM;
        }

        closure_wait(&bc->alloc_wait, cl);

        /* Try again, after adding ourselves to waitlist */
        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current) {
                /* We raced */
                closure_wake_up(&bc->alloc_wait);
                goto success;
        }

        trace_btree_node_cannibalize_lock_fail(c);
        return -EAGAIN;

success:
        trace_btree_node_cannibalize_lock(c);
        return 0;
}

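/*
 * Steal the memory of another cached node: first look for one we can reclaim
 * without writing it out, then fall back to writing out dirty nodes, retrying
 * until something frees up.
 */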
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        list_for_each_entry_reverse(b, &bc->live, list)
                if (!btree_node_reclaim(c, b))
                        return b;

        while (1) {
                list_for_each_entry_reverse(b, &bc->live, list)
                        if (!btree_node_write_and_reclaim(c, b))
                                return b;

                /*
                 * Rare case: all nodes were intent-locked.
                 * Just busy-wait.
                 */
                WARN_ONCE(1, "btree cache cannibalize failed\n");
                cond_resched();
        }
}

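/*
 * Allocate a btree node: try the freeable list, then try attaching new data
 * buffers to a node on the freed list, then allocate from scratch - and if
 * all of that fails, cannibalize another cached node (only if we hold the
 * cannibalize lock). Returns the node intent and write locked, or an ERR_PTR.
 */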
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        u64 start_time = local_clock();

        mutex_lock(&bc->lock);

        /*
         * btree_free() doesn't free memory; it sticks the node on the end of
         * the list. Check if there are any freed nodes there:
         */
        list_for_each_entry(b, &bc->freeable, list)
                if (!btree_node_reclaim(c, b))
                        goto out_unlock;

        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, &bc->freed, list)
                if (!btree_node_reclaim(c, b)) {
                        btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
                        if (b->data)
                                goto out_unlock;

                        six_unlock_write(&b->lock);
                        six_unlock_intent(&b->lock);
                        goto err;
                }

        b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
        if (!b)
                goto err;

        BUG_ON(!six_trylock_intent(&b->lock));
        BUG_ON(!six_trylock_write(&b->lock));
out_unlock:
        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_write_in_flight(b));

        list_del_init(&b->list);
        mutex_unlock(&bc->lock);
out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
        b->uncompacted_whiteout_u64s = 0;
        bch2_btree_keys_init(b, &c->expensive_debug_checks);

        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);

        return b;
err:
        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b = btree_node_cannibalize(c);
                list_del_init(&b->list);
                mutex_unlock(&bc->lock);

                bch2_btree_node_hash_remove(bc, b);

                trace_btree_node_cannibalize(c);
                goto out;
        }

        mutex_unlock(&bc->lock);
        return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_iter *iter,
                                const struct bkey_i *k,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        /*
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
        BUG_ON(!btree_node_locked(iter, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        b = bch2_btree_node_mem_alloc(c);
        if (IS_ERR(b))
                return b;

        bkey_copy(&b->key, k);
        if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
                /* raced with another fill: */

                /* mark as unhashed... */
                PTR_HASH(&b->key) = 0;

                mutex_lock(&bc->lock);
                list_add(&b->list, &bc->freeable);
                mutex_unlock(&bc->lock);

                six_unlock_write(&b->lock);
                six_unlock_intent(&b->lock);
                return NULL;
        }

        /*
         * If the btree node wasn't cached, we can't drop our lock on
         * the parent until after it's added to the cache - because
         * otherwise we could race with a btree_split() freeing the node
         * we're trying to lock.
         *
         * But the deadlock described below doesn't exist in this case,
         * so it's safe to not drop the parent lock until here:
         */
        if (btree_node_read_locked(iter, level + 1))
                btree_node_unlock(iter, level + 1);

        bch2_btree_node_read(c, b, sync);

        six_unlock_write(&b->lock);

        if (!sync) {
                six_unlock_intent(&b->lock);
                return NULL;
        }

        if (lock_type == SIX_LOCK_read)
                six_lock_downgrade(&b->lock);

        return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type; on failure returns an ERR_PTR (e.g. -EINTR if the node couldn't
 * be locked without restarting the transaction).
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  bool may_drop_locks)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;

        /*
         * XXX: locking optimization
         *
         * we can make the locking looser here - caller can drop lock on parent
         * node before locking child node (and potentially blocking): we just
         * have to have bch2_btree_node_fill() call relock on the parent and
         * return -EINTR if that fails
         */
        EBUG_ON(!btree_node_locked(iter, level + 1));
        EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
        rcu_read_lock();
        b = btree_cache_find(bc, k);
        rcu_read_unlock();

        if (unlikely(!b)) {
                /*
                 * We must have the parent locked to call bch2_btree_node_fill(),
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
                b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b))
                        return b;
        } else {
                /*
                 * There's a potential deadlock with splits and insertions into
                 * interior nodes we have to avoid:
                 *
                 * The other thread might be holding an intent lock on the node
                 * we want, and they want to update its parent node so they're
                 * going to upgrade their intent lock on the parent node to a
                 * write lock.
                 *
                 * But if we're holding a read lock on the parent, and we're
                 * trying to get the intent lock they're holding, we deadlock.
                 *
                 * So to avoid this we drop the read locks on parent nodes when
                 * we're starting to take intent locks - and handle the race.
                 *
                 * The race is that they might be about to free the node we
                 * want, and dropping our read lock on the parent node lets them
                 * update the parent marking the node we want as freed, and then
                 * free it:
                 *
                 * To guard against this, btree nodes are evicted from the cache
                 * when they're freed - and PTR_HASH() is zeroed out, which we
                 * check for after we lock the node.
                 *
                 * Then, bch2_btree_node_relock() on the parent will fail - because
                 * the parent was modified, when the pointer to the node we want
                 * was removed - and we'll bail out:
                 */
                if (btree_node_read_locked(iter, level + 1))
                        btree_node_unlock(iter, level + 1);

                if (!btree_node_lock(b, k->k.p, level, iter,
                                     lock_type, may_drop_locks))
                        return ERR_PTR(-EINTR);

                if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
                             b->level != level ||
                             race_fault())) {
                        six_unlock_type(&b->lock, lock_type);
                        if (bch2_btree_node_relock(iter, level + 1))
                                goto retry;

                        trans_restart();
                        trace_trans_restart_btree_node_reused(c,
                                                iter->trans->ip);
                        return ERR_PTR(-EINTR);
                }
        }

        wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
                       TASK_UNINTERRUPTIBLE);

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->btree_id != iter->btree_id ||
                BTREE_NODE_LEVEL(b->data) != level ||
                bkey_cmp(b->data->max_key, k->k.p));

        return b;
}

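/*
 * Return the previous or next sibling of @b (per @sib), found by looking it
 * up via the parent node's iterator; may drop and retake locks if
 * @may_drop_locks is set.
 */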
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                                          struct btree_iter *iter,
                                          struct btree *b,
                                          bool may_drop_locks,
                                          enum btree_node_sibling sib)
{
        struct btree *parent;
        struct btree_node_iter node_iter;
        struct bkey_packed *k;
        BKEY_PADDED(k) tmp;
        struct btree *ret = NULL;
        unsigned level = b->level;

        parent = btree_iter_node(iter, level + 1);
        if (!parent)
                return NULL;

        if (!bch2_btree_node_relock(iter, level + 1))
                goto out_upgrade;

        node_iter = iter->l[parent->level].iter;

        k = bch2_btree_node_iter_peek_all(&node_iter, parent);
        BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));

        k = sib == btree_prev_sib
                ? bch2_btree_node_iter_prev(&node_iter, parent)
                : (bch2_btree_node_iter_advance(&node_iter, parent),
                   bch2_btree_node_iter_peek(&node_iter, parent));
        if (!k)
                goto out;

        bch2_bkey_unpack(parent, &tmp.k, k);

        ret = bch2_btree_node_get(c, iter, &tmp.k, level,
                                  SIX_LOCK_intent, may_drop_locks);

        if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
                struct btree_iter *linked;

                if (!bch2_btree_node_relock(iter, level + 1))
                        goto out_upgrade;

                /*
                 * We might have got -EINTR because trylock failed, and we're
                 * holding other locks that would cause us to deadlock:
                 */
                trans_for_each_iter(iter->trans, linked)
                        if (btree_iter_cmp(iter, linked) < 0)
                                __bch2_btree_iter_unlock(linked);

                if (sib == btree_prev_sib)
                        btree_node_unlock(iter, level);

                ret = bch2_btree_node_get(c, iter, &tmp.k, level,
                                          SIX_LOCK_intent, may_drop_locks);

                /*
                 * before btree_iter_relock() calls btree_iter_verify_locks():
                 */
                if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
                        btree_node_unlock(iter, level + 1);

                if (!bch2_btree_node_relock(iter, level)) {
                        btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

                        if (!IS_ERR(ret)) {
                                six_unlock_intent(&ret->lock);
                                ret = ERR_PTR(-EINTR);
                        }
                }

                bch2_btree_trans_relock(iter->trans);
        }
out:
        if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
                btree_node_unlock(iter, level + 1);

        bch2_btree_trans_verify_locks(iter->trans);

        BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
               (iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
                !btree_node_locked(iter, level)));

        if (!IS_ERR_OR_NULL(ret)) {
                struct btree *n1 = ret, *n2 = b;

                if (sib != btree_prev_sib)
                        swap(n1, n2);

                BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
                                                     n1->key.k.p),
                                n2->data->min_key));
        }

        return ret;
out_upgrade:
        if (may_drop_locks)
                bch2_btree_iter_upgrade(iter, level + 2, true);
        ret = ERR_PTR(-EINTR);
        goto out;
}

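/*
 * Start an asynchronous read of a btree node if it isn't already cached -
 * used to warm the cache ahead of iteration.
 */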
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
                              const struct bkey_i *k, unsigned level)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(!btree_node_locked(iter, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        rcu_read_lock();
        b = btree_cache_find(bc, k);
        rcu_read_unlock();

        if (b)
                return;

        bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             struct btree *b)
{
        const struct bkey_format *f = &b->format;
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));

        bch2_btree_keys_stats(b, &stats);

        pr_buf(out,
               "l %u %llu:%llu - %llu:%llu:\n"
               "    ptrs: ",
               b->level,
               b->data->min_key.inode,
               b->data->min_key.offset,
               b->data->max_key.inode,
               b->data->max_key.offset);
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
        pr_buf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %zu)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n"
               "    failed prev %zu\n"
               "    failed overflow %zu\n",
               f->key_u64s,
               f->bits_per_field[0],
               f->bits_per_field[1],
               f->bits_per_field[2],
               f->bits_per_field[3],
               f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
               BTREE_FOREGROUND_MERGE_THRESHOLD(c),
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed_unpacked,
               stats.failed_prev,
               stats.failed_overflow);
}